From 6273c083d36da01d4c70ca6cad1448e69d52a39e Mon Sep 17 00:00:00 2001 From: Hanminghao Date: Sun, 8 Dec 2024 05:29:55 +0000 Subject: [PATCH] deploy: d189da6be7f787f8a80df451a98bb141ae21035c --- .nojekyll | 0 cache.json | 1 + favicon.ico | Bin 0 -> 15086 bytes index.css | 355 + index.html | 71849 ++++++++++++++++++++++++++++++++++++++++++++++++++ index.js | 39 + 6 files changed, 72244 insertions(+) create mode 100644 .nojekyll create mode 100644 cache.json create mode 100644 favicon.ico create mode 100644 index.css create mode 100644 index.html create mode 100644 index.js diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 0000000..e69de29 diff --git a/cache.json b/cache.json new file mode 100644 index 0000000..28dedad --- /dev/null +++ b/cache.json @@ -0,0 +1 @@ +{"2024-12-02T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2409.16921v2","updated":"2024-12-02T18:01:53Z","published":"2024-09-25T13:27:29Z","title":"Moner: Motion Correction in Undersampled Radial MRI with Unsupervised\n Neural Representation","summary":" Motion correction (MoCo) in radial MRI is a challenging problem due to the\nunpredictability of subject's motion. Current state-of-the-art (SOTA) MoCo\nalgorithms often use extensive high-quality MR images to pre-train neural\nnetworks, obtaining excellent reconstructions. However, the need for\nlarge-scale datasets significantly increases costs and limits model\ngeneralization. In this work, we propose Moner, an unsupervised MoCo method\nthat jointly solves artifact-free MR images and accurate motion from\nundersampled, rigid motion-corrupted k-space data, without requiring training\ndata. Our core idea is to leverage the continuous prior of implicit neural\nrepresentation (INR) to constrain this ill-posed inverse problem, enabling\nideal solutions. Specifically, we incorporate a quasi-static motion model into\nthe INR, granting its ability to correct subject's motion. 
To stabilize model\noptimization, we reformulate radial MRI as a back-projection problem using the\nFourier-slice theorem. Additionally, we propose a novel coarse-to-fine hash\nencoding strategy, significantly enhancing MoCo accuracy. Experiments on\nmultiple MRI datasets show our Moner achieves performance comparable to SOTA\nMoCo techniques on in-domain data, while demonstrating significant improvements\non out-of-domain data.\n","authors":["Qing Wu","Chenhe Du","XuanYu Tian","Jingyi Yu","Yuyao Zhang","Hongjiang Wei"],"pdf_url":"https://arxiv.org/pdf/2409.16921v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24060v5","updated":"2024-12-02T18:00:18Z","published":"2024-10-31T15:57:04Z","title":"Understanding Generalizability of Diffusion Models Requires Rethinking\n the Hidden Gaussian Structure","summary":" In this work, we study the generalizability of diffusion models by looking\ninto the hidden properties of the learned score functions, which are\nessentially a series of deep denoisers trained on various noise levels. We\nobserve that as diffusion models transition from memorization to\ngeneralization, their corresponding nonlinear diffusion denoisers exhibit\nincreasing linearity. This discovery leads us to investigate the linear\ncounterparts of the nonlinear diffusion models, which are a series of linear\nmodels trained to match the function mappings of the nonlinear diffusion\ndenoisers. Surprisingly, these linear denoisers are approximately the optimal\ndenoisers for a multivariate Gaussian distribution characterized by the\nempirical mean and covariance of the training dataset. This finding implies\nthat diffusion models have the inductive bias towards capturing and utilizing\nthe Gaussian structure (covariance information) of the training dataset for\ndata generation. 
We empirically demonstrate that this inductive bias is a\nunique property of diffusion models in the generalization regime, which becomes\nincreasingly evident when the model's capacity is relatively small compared to\nthe training dataset size. In the case that the model is highly\noverparameterized, this inductive bias emerges during the initial training\nphases before the model fully memorizes its training data. Our study provides\ncrucial insights into understanding the notable strong generalization\nphenomenon recently observed in real-world diffusion models.\n","authors":["Xiang Li","Yixiang Dai","Qing Qu"],"pdf_url":"https://arxiv.org/pdf/2410.24060v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.15098v3","updated":"2024-12-02T17:59:40Z","published":"2024-11-22T17:55:15Z","title":"OminiControl: Minimal and Universal Control for Diffusion Transformer","summary":" In this paper, we introduce OminiControl, a highly versatile and\nparameter-efficient framework that integrates image conditions into pre-trained\nDiffusion Transformer (DiT) models. At its core, OminiControl leverages a\nparameter reuse mechanism, enabling the DiT to encode image conditions using\nitself as a powerful backbone and process them with its flexible multi-modal\nattention processors. Unlike existing methods, which rely heavily on additional\nencoder modules with complex architectures, OminiControl (1) effectively and\nefficiently incorporates injected image conditions with only ~0.1% additional\nparameters, and (2) addresses a wide range of image conditioning tasks in a\nunified manner, including subject-driven generation and spatially-aligned\nconditions such as edges, depth, and more. Remarkably, these capabilities are\nachieved by training on images generated by the DiT itself, which is\nparticularly beneficial for subject-driven generation. 
Extensive evaluations\ndemonstrate that OminiControl outperforms existing UNet-based and DiT-adapted\nmodels in both subject-driven and spatially-aligned conditional generation.\nAdditionally, we release our training dataset, Subjects200K, a diverse\ncollection of over 200,000 identity-consistent images, along with an efficient\ndata synthesis pipeline to advance research in subject-consistent generation.\n","authors":["Zhenxiong Tan","Songhua Liu","Xingyi Yang","Qiaochu Xue","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2411.15098v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.19895v2","updated":"2024-12-02T17:44:52Z","published":"2024-11-29T17:59:03Z","title":"GuardSplat: Efficient and Robust Watermarking for 3D Gaussian Splatting","summary":" 3D Gaussian Splatting (3DGS) has recently created impressive assets for\nvarious applications. However, the copyright of these assets is not well\nprotected as existing watermarking methods are not suited for 3DGS considering\nsecurity, capacity, and invisibility. Besides, these methods often require\nhours or even days for optimization, limiting the application scenarios. In\nthis paper, we propose GuardSplat, an innovative and efficient framework that\neffectively protects the copyright of 3DGS assets. Specifically, 1) We first\npropose a CLIP-guided Message Decoupling Optimization module for training the\nmessage decoder, leveraging CLIP's aligning capability and rich representations\nto achieve a high extraction accuracy with minimal optimization costs,\npresenting exceptional capability and efficiency. 2) Then, we propose a\nSpherical-harmonic-aware (SH-aware) Message Embedding module tailored for 3DGS,\nwhich employs a set of SH offsets to seamlessly embed the message into the SH\nfeatures of each 3D Gaussian while maintaining the original 3D structure. 
It\nenables the 3DGS assets to be watermarked with minimal fidelity trade-offs and\nprevents malicious users from removing the messages from the model files,\nmeeting the demands for invisibility and security. 3) We further propose an\nAnti-distortion Message Extraction module to improve robustness against various\nvisual distortions. Extensive experiments demonstrate that GuardSplat\noutperforms the state-of-the-art methods and achieves fast optimization speed.\n","authors":["Zixuan Chen","Guangcong Wang","Jiahao Zhu","Jianhuang Lai","Xiaohua Xie"],"pdf_url":"https://arxiv.org/pdf/2411.19895v2.pdf","comment":"Project page: https://narcissusex.github.io/GuardSplat and Code:\n https://github.com/NarcissusEx/GuardSplat"},{"id":"http://arxiv.org/abs/2411.07118v3","updated":"2024-12-02T17:11:07Z","published":"2024-11-11T16:45:18Z","title":"ConvMixFormer- A Resource-efficient Convolution Mixer for\n Transformer-based Dynamic Hand Gesture Recognition","summary":" Transformer models have demonstrated remarkable success in many domains such\nas natural language processing (NLP) and computer vision. With the growing\ninterest in transformer-based architectures, they are now utilized for gesture\nrecognition. So, we also explore and devise a novel ConvMixFormer architecture\nfor dynamic hand gestures. The transformers use quadratic scaling of the\nattention features with the sequential data, due to which these models are\ncomputationally complex and heavy. We have considered this drawback of the\ntransformer and designed a resource-efficient model that replaces the\nself-attention in the transformer with the simple convolutional layer-based\ntoken mixer. The computational cost and the parameters used for the\nconvolution-based mixer are comparatively less than the quadratic\nself-attention. Convolution-mixer helps the model capture the local spatial\nfeatures that self-attention struggles to capture due to their sequential\nprocessing nature. 
Further, an efficient gate mechanism is employed instead of\na conventional feed-forward network in the transformer to help the model\ncontrol the flow of features within different stages of the proposed model.\nThis design uses fewer learnable parameters which is nearly half the vanilla\ntransformer that helps in fast and efficient training. The proposed method is\nevaluated on NVidia Dynamic Hand Gesture and Briareo datasets and our model has\nachieved state-of-the-art results on single and multimodal inputs. We have also\nshown the parameter efficiency of the proposed ConvMixFormer model compared to\nother methods. The source code is available at\nhttps://github.com/mallikagarg/ConvMixFormer.\n","authors":["Mallika Garg","Debashis Ghosh","Pyari Mohan Pradhan"],"pdf_url":"https://arxiv.org/pdf/2411.07118v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.01493v3","updated":"2024-12-02T17:10:34Z","published":"2024-06-03T16:20:24Z","title":"Learning Temporally Consistent Video Depth from Video Diffusion Priors","summary":" This work addresses the challenge of streamed video depth estimation, which\nexpects not only per-frame accuracy but, more importantly, cross-frame\nconsistency. We argue that sharing contextual information between frames or\nclips is pivotal in fostering temporal consistency. Thus, instead of directly\ndeveloping a depth estimator from scratch, we reformulate this predictive task\ninto a conditional generation problem to provide contextual information within\na clip and across clips. Specifically, we propose a consistent context-aware\ntraining and inference strategy for arbitrarily long videos to provide\ncross-clip context. We sample independent noise levels for each frame within a\nclip during training while using a sliding window strategy and initializing\noverlapping frames with previously predicted frames without adding noise.\nMoreover, we design an effective training strategy to provide context within a\nclip. 
Extensive experimental results validate our design choices and\ndemonstrate the superiority of our approach, dubbed ChronoDepth. Project page:\nhttps://xdimlab.github.io/ChronoDepth/.\n","authors":["Jiahao Shao","Yuanbo Yang","Hongyu Zhou","Youmin Zhang","Yujun Shen","Vitor Guizilini","Yue Wang","Matteo Poggi","Yiyi Liao"],"pdf_url":"https://arxiv.org/pdf/2406.01493v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10789v2","updated":"2024-12-02T17:04:07Z","published":"2024-08-20T12:30:37Z","title":"PartGS:Learning Part-aware 3D Representations by Fusing 2D Gaussians and\n Superquadrics","summary":" Low-level 3D representations, such as point clouds, meshes, NeRFs, and 3D\nGaussians, are commonly used to represent 3D objects or scenes. However, human\nperception typically understands 3D objects at a higher level as a composition\nof parts or structures rather than points or voxels. Representing 3D objects or\nscenes as semantic parts can benefit further understanding and applications. In\nthis paper, we introduce $\\textbf{PartGS}$, $\\textbf{part}$-aware 3D\nreconstruction by a hybrid representation of 2D $\\textbf{G}$aussians and\n$\\textbf{S}$uperquadrics, which parses objects or scenes into semantic parts,\ndigging 3D structural clues from multi-view image inputs. Accurate structured\ngeometry reconstruction and high-quality rendering are achieved at the same\ntime. Our method simultaneously optimizes superquadric meshes and Gaussians by\ncoupling their parameters within our hybrid representation. On one hand, this\nhybrid representation inherits the advantage of superquadrics to represent\ndifferent shape primitives, supporting flexible part decomposition of scenes.\nOn the other hand, 2D Gaussians capture complex texture and geometry details,\nensuring high-quality appearance and geometry reconstruction. 
Our method is\nfully unsupervised and outperforms existing state-of-the-art approaches in\nextensive experiments on DTU, ShapeNet, and real-life datasets.\n","authors":["Zhirui Gao","Renjiao Yi","Yuhang Huang","Wei Chen","Chenyang Zhu","Kai Xu"],"pdf_url":"https://arxiv.org/pdf/2408.10789v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.17251v2","updated":"2024-12-02T16:37:41Z","published":"2024-11-26T09:29:27Z","title":"DGNN-YOLO: Dynamic Graph Neural Networks with YOLO11 for Small Object\n Detection and Tracking in Traffic Surveillance","summary":" Accurate detection and tracking of small objects such as pedestrians,\ncyclists, and motorbikes are critical for traffic surveillance systems, which\nare crucial in improving road safety and decision-making in intelligent\ntransportation systems. However, traditional methods struggle with challenges\nsuch as occlusion, low resolution, and dynamic traffic conditions,\nnecessitating innovative approaches to address these limitations. This paper\nintroduces DGNN-YOLO, a novel framework integrating dynamic graph neural\nnetworks (DGNN) with YOLO11 to enhance small object detection and tracking in\ntraffic surveillance systems. The framework leverages YOLO11's advanced spatial\nfeature extraction capabilities for precise object detection and incorporates\nDGNN to model spatial-temporal relationships for robust real-time tracking\ndynamically. 
By constructing and updating graph structures, DGNN-YOLO\neffectively represents objects as nodes and their interactions as edges,\nensuring adaptive and accurate tracking in complex and dynamic environments.\nExtensive experiments demonstrate that DGNN-YOLO consistently outperforms\nstate-of-the-art methods in detecting and tracking small objects under diverse\ntraffic conditions, achieving the highest precision (0.8382), recall (0.6875),\nand mAP@0.5:0.95 (0.6476), showcasing its robustness and scalability,\nparticularly in challenging scenarios involving small and occluded objects.\nThis work provides a scalable, real-time traffic surveillance and analysis\nsolution, significantly contributing to intelligent transportation systems.\n","authors":["Shahriar Soudeep","M. F. Mridha","Md Abrar Jahin","Nilanjan Dey"],"pdf_url":"https://arxiv.org/pdf/2411.17251v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13983v2","updated":"2024-12-02T16:29:46Z","published":"2024-04-22T08:44:10Z","title":"Structure-Aware Human Body Reshaping with Adaptive Affinity-Graph\n Network","summary":" Given a source portrait, the automatic human body reshaping task aims at\nediting it to an aesthetic body shape. As the technology has been widely used\nin media, several methods have been proposed mainly focusing on generating\noptical flow to warp the body shape. However, those previous works only\nconsider the local transformation of different body parts (arms, torso, and\nlegs), ignoring the global affinity, and limiting the capacity to ensure\nconsistency and quality across the entire body. In this paper, we propose a\nnovel Adaptive Affinity-Graph Network (AAGN), which extracts the global\naffinity between different body parts to enhance the quality of the generated\noptical flow. Specifically, our AAGN primarily introduces the following\ndesigns: (1) we propose an Adaptive Affinity-Graph (AAG) Block that leverages\nthe characteristic of a fully connected graph. 
AAG represents different body\nparts as nodes in an adaptive fully connected graph and captures all the\naffinities between nodes to obtain a global affinity map. The design could\nbetter improve the consistency between body parts. (2) Besides, for\nhigh-frequency details are crucial for photo aesthetics, a Body Shape\nDiscriminator (BSD) is designed to extract information from both high-frequency\nand spatial domain. Particularly, an SRM filter is utilized to extract\nhigh-frequency details, which are combined with spatial features as input to\nthe BSD. With this design, BSD guides the Flow Generator (FG) to pay attention\nto various fine details rather than rigid pixel-level fitting. Extensive\nexperiments conducted on the BR-5K dataset demonstrate that our framework\nsignificantly enhances the aesthetic appeal of reshaped photos, surpassing all\nprevious work to achieve state-of-the-art in all evaluation metrics.\n","authors":["Qiwen Deng","Yangcen Liu","Wen Li","Guoqing Wang"],"pdf_url":"https://arxiv.org/pdf/2404.13983v2.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2406.14539v3","updated":"2024-12-02T16:26:57Z","published":"2024-06-20T17:49:11Z","title":"Invertible Consistency Distillation for Text-Guided Image Editing in\n Around 7 Steps","summary":" Diffusion distillation represents a highly promising direction for achieving\nfaithful text-to-image generation in a few sampling steps. However, despite\nrecent successes, existing distilled models still do not provide the full\nspectrum of diffusion abilities, such as real image inversion, which enables\nmany precise image manipulation methods. This work aims to enrich distilled\ntext-to-image diffusion models with the ability to effectively encode real\nimages into their latent space. 
To this end, we introduce invertible\nConsistency Distillation (iCD), a generalized consistency distillation\nframework that facilitates both high-quality image synthesis and accurate image\nencoding in only 3-4 inference steps. Though the inversion problem for\ntext-to-image diffusion models gets exacerbated by high classifier-free\nguidance scales, we notice that dynamic guidance significantly reduces\nreconstruction errors without noticeable degradation in generation performance.\nAs a result, we demonstrate that iCD equipped with dynamic guidance may serve\nas a highly effective tool for zero-shot text-guided image editing, competing\nwith more expensive state-of-the-art alternatives.\n","authors":["Nikita Starodubcev","Mikhail Khoroshikh","Artem Babenko","Dmitry Baranchuk"],"pdf_url":"https://arxiv.org/pdf/2406.14539v3.pdf","comment":"Project page: https://yandex-research.github.io/invertible-cd/"},{"id":"http://arxiv.org/abs/2411.04630v2","updated":"2024-12-02T15:47:17Z","published":"2024-11-07T11:29:55Z","title":"Brain Tumour Removing and Missing Modality Generation using 3D WDM","summary":" This paper presents the second-placed solution for task 8 and the\nparticipation solution for task 7 of BraTS 2024. The adoption of automated\nbrain analysis algorithms to support clinical practice is increasing. However,\nmany of these algorithms struggle with the presence of brain lesions or the\nabsence of certain MRI modalities. The alterations in the brain's morphology\nleads to high variability and thus poor performance of predictive models that\nwere trained only on healthy brains. The lack of information that is usually\nprovided by some of the missing MRI modalities also reduces the reliability of\nthe prediction models trained with all modalities. In order to improve the\nperformance of these models, we propose the use of conditional 3D wavelet\ndiffusion models. 
The wavelet transform enabled full-resolution image training\nand prediction on a GPU with 48 GB VRAM, without patching or downsampling,\npreserving all information for prediction. The code for these tasks is\navailable at https://github.com/ShadowTwin41/BraTS_2023_2024_solutions.\n","authors":["André Ferreira","Gijs Luijten","Behrus Puladi","Jens Kleesiek","Victor Alves","Jan Egger"],"pdf_url":"https://arxiv.org/pdf/2411.04630v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08085v2","updated":"2024-12-02T15:20:08Z","published":"2024-11-12T16:52:51Z","title":"Deep Learning 2.0: Artificial Neurons That Matter -- Reject Correlation,\n Embrace Orthogonality","summary":" We introduce a yat-product-powered neural network, the Neural Matter Network\n(NMN), a breakthrough in deep learning that achieves non-linear pattern\nrecognition without activation functions. Our key innovation relies on the\nyat-product and yat-product, which naturally induces non-linearity by\nprojecting inputs into a pseudo-metric space, eliminating the need for\ntraditional activation functions while maintaining only a softmax layer for\nfinal class probability distribution. This approach simplifies network\narchitecture and provides unprecedented transparency into the network's\ndecision-making process. Our comprehensive empirical evaluation across\ndifferent datasets demonstrates that NMN consistently outperforms traditional\nMLPs. The results challenge the assumption that separate activation functions\nare necessary for effective deep-learning models. The implications of this work\nextend beyond immediate architectural benefits, by eliminating intermediate\nactivation functions while preserving non-linear capabilities, yat-MLP\nestablishes a new paradigm for neural network design that combines simplicity\nwith effectiveness. 
Most importantly, our approach provides unprecedented\ninsights into the traditionally opaque \"black-box\" nature of neural networks,\noffering a clearer understanding of how these models process and classify\ninformation.\n","authors":["Taha Bouhsine"],"pdf_url":"https://arxiv.org/pdf/2411.08085v2.pdf","comment":"fixed proof, added softermax"},{"id":"http://arxiv.org/abs/2402.06390v2","updated":"2024-12-02T15:04:25Z","published":"2024-02-09T13:11:57Z","title":"Deepfake for the Good: Generating Avatars through Face-Swapping with\n Implicit Deepfake Generation","summary":" Numerous emerging deep-learning techniques have had a substantial impact on\ncomputer graphics. Among the most promising breakthroughs are the rise of\nNeural Radiance Fields (NeRFs) and Gaussian Splatting (GS). NeRFs encode the\nobject's shape and color in neural network weights using a handful of images\nwith known camera positions to generate novel views. In contrast, GS provides\naccelerated training and inference without a decrease in rendering quality by\nencoding the object's characteristics in a collection of Gaussian\ndistributions. These two techniques have found many use cases in spatial\ncomputing and other domains. On the other hand, the emergence of deepfake\nmethods has sparked considerable controversy. Deepfakes refers to artificial\nintelligence-generated videos that closely mimic authentic footage. Using\ngenerative models, they can modify facial features, enabling the creation of\naltered identities or expressions that exhibit a remarkably realistic\nappearance to a real person. Despite these controversies, deepfake can offer a\nnext-generation solution for avatar creation and gaming when of desirable\nquality. To that end, we show how to combine all these emerging technologies to\nobtain a more plausible outcome. Our ImplicitDeepfake uses the classical\ndeepfake algorithm to modify all training images separately and then train NeRF\nand GS on modified faces. 
Such simple strategies can produce plausible 3D\ndeepfake-based avatars.\n","authors":["Georgii Stanishevskii","Jakub Steczkiewicz","Tomasz Szczepanik","Sławomir Tadeja","Jacek Tabor","Przemysław Spurek"],"pdf_url":"https://arxiv.org/pdf/2402.06390v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.17773v2","updated":"2024-12-02T14:55:49Z","published":"2024-11-26T09:36:02Z","title":"Efficient Multi-modal Large Language Models via Visual Token Grouping","summary":" The development of Multi-modal Large Language Models (MLLMs) enhances Large\nLanguage Models (LLMs) with the ability to perceive data formats beyond text,\nsignificantly advancing a range of downstream applications, such as visual\nquestion answering and image captioning. However, the substantial computational\ncosts associated with processing high-resolution images and videos pose a\nbarrier to their broader adoption. To address this challenge, compressing\nvision tokens in MLLMs has emerged as a promising approach to reduce inference\ncosts. While existing methods conduct token reduction in the feature alignment\nphase. In this paper, we introduce VisToG, a novel grouping mechanism that\nleverages the capabilities of pre-trained vision encoders to group similar\nimage segments without the need for segmentation masks. Specifically, we\nconcatenate semantic tokens to represent image semantic segments after the\nlinear projection layer before feeding into the vision encoder. Besides, with\nthe isolated attention we adopt, VisToG can identify and eliminate redundant\nvisual tokens utilizing the prior knowledge in the pre-trained vision encoder,\nwhich effectively reduces computational demands. 
Extensive experiments\ndemonstrate the effectiveness of VisToG, maintaining 98.1% of the original\nperformance while achieving a reduction of over 27\\% inference time.\n","authors":["Minbin Huang","Runhui Huang","Han Shi","Yimeng Chen","Chuanyang Zheng","Xiangguo Sun","Xin Jiang","Zhenguo Li","Hong Cheng"],"pdf_url":"https://arxiv.org/pdf/2411.17773v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.06154v2","updated":"2024-12-02T14:49:55Z","published":"2024-10-08T15:55:40Z","title":"GLOV: Guided Large Language Models as Implicit Optimizers for Vision\n Language Models","summary":" In this work, we propose a novel method (GLOV) enabling Large Language Models\n(LLMs) to act as implicit Optimizers for Vision-Langugage Models (VLMs) to\nenhance downstream vision tasks. Our GLOV meta-prompts an LLM with the\ndownstream task description, querying it for suitable VLM prompts (e.g., for\nzero-shot classification with CLIP). These prompts are ranked according to a\npurity measure obtained through a fitness function. In each respective\noptimization step, the ranked prompts are fed as in-context examples (with\ntheir accuracies) to equip the LLM with the knowledge of the type of text\nprompts preferred by the downstream VLM. Furthermore, we also explicitly steer\nthe LLM generation process in each optimization step by specifically adding an\noffset difference vector of the embeddings from the positive and negative\nsolutions found by the LLM, in previous optimization steps, to the intermediate\nlayer of the network for the next generation step. This offset vector steers\nthe LLM generation toward the type of language preferred by the downstream VLM,\nresulting in enhanced performance on the downstream vision tasks. 
We\ncomprehensively evaluate our GLOV on 16 diverse datasets using two families of\nVLMs, i.e., dual-encoder (e.g., CLIP) and encoder-decoder (e.g., LLaVa) models\n-- showing that the discovered solutions can enhance the recognition\nperformance by up to 15.0% and 57.5% (3.8% and 21.6% on average) for these\nmodels.\n","authors":["M. Jehanzeb Mirza","Mengjie Zhao","Zhuoyuan Mao","Sivan Doveh","Wei Lin","Paul Gavrikov","Michael Dorkenwald","Shiqi Yang","Saurav Jha","Hiromi Wakaki","Yuki Mitsufuji","Horst Possegger","Rogerio Feris","Leonid Karlinsky","James Glass"],"pdf_url":"https://arxiv.org/pdf/2410.06154v2.pdf","comment":"Code: https://github.com/jmiemirza/GLOV"},{"id":"http://arxiv.org/abs/2411.01819v2","updated":"2024-12-02T14:42:09Z","published":"2024-11-04T05:39:01Z","title":"Free-Mask: A Novel Paradigm of Integration Between the Segmentation\n Diffusion Model and Image Editing to Improve Segmentation Ability","summary":" Current semantic segmentation models typically require a substantial amount\nof manually annotated data, a process that is both time-consuming and\nresource-intensive. Alternatively, leveraging advanced text-to-image models\nsuch as Midjourney and Stable Diffusion has emerged as an efficient strategy,\nenabling the automatic generation of synthetic data in place of manual\nannotations. However, previous methods have been limited to generating\nsingle-instance images, as the generation of multiple instances with Stable\nDiffusion has proven unstable. To address this limitation and expand the scope\nand diversity of synthetic datasets, we propose a framework \\textbf{Free-Mask}\nthat combines a Diffusion Model for segmentation with advanced image editing\ncapabilities, allowing for the integration of multiple objects into images via\ntext-to-image models. Our method facilitates the creation of highly realistic\ndatasets that closely emulate open-world environments while generating accurate\nsegmentation masks. 
It reduces the labor associated with manual annotation and\nalso ensures precise mask generation. Experimental results demonstrate that\nsynthetic data generated by \\textbf{Free-Mask} enables segmentation models to\noutperform those trained on real data, especially in zero-shot settings.\nNotably, \\textbf{Free-Mask} achieves new state-of-the-art results on previously\nunseen classes in the VOC 2012 benchmark.\n","authors":["Bo Gao","Fangxu Xing","Daniel Tang"],"pdf_url":"https://arxiv.org/pdf/2411.01819v2.pdf","comment":"16 pages,5 figures,5 tables"},{"id":"http://arxiv.org/abs/2108.11986v2","updated":"2024-12-02T14:25:58Z","published":"2021-08-25T11:45:40Z","title":"Anomaly Detection in Medical Imaging -- A Mini Review","summary":" The increasing digitization of medical imaging enables machine learning based\nimprovements in detecting, visualizing and segmenting lesions, easing the\nworkload for medical experts. However, supervised machine learning requires\nreliable labelled data, which is is often difficult or impossible to collect or\nat least time consuming and thereby costly. Therefore methods requiring only\npartly labeled data (semi-supervised) or no labeling at all (unsupervised\nmethods) have been applied more regularly. Anomaly detection is one possible\nmethodology that is able to leverage semi-supervised and unsupervised methods\nto handle medical imaging tasks like classification and segmentation. This\npaper uses a semi-exhaustive literature review of relevant anomaly detection\npapers in medical imaging to cluster into applications, highlight important\nresults, establish lessons learned and give further advice on how to approach\nanomaly detection in medical imaging. The qualitative analysis is based on\ngoogle scholar and 4 different search terms, resulting in 120 different\nanalysed papers. The main results showed that the current research is mostly\nmotivated by reducing the need for labelled data. 
Also, the successful and\nsubstantial amount of research in the brain MRI domain shows the potential for\napplications in further domains like OCT and chest X-ray.\n","authors":["Maximilian E. Tschuchnig","Michael Gadermayr"],"pdf_url":"https://arxiv.org/pdf/2108.11986v2.pdf","comment":"Accepted and presented at iDSC2021 edit: During work on this\n publication Maximilian Ernst Tschuchnig was affiliated with Salzburg\n University of Applied Sciences and University of Salzburg"},{"id":"http://arxiv.org/abs/2204.10942v2","updated":"2024-12-02T14:12:18Z","published":"2022-04-22T21:48:56Z","title":"Evaluation of Multi-Scale Multiple Instance Learning to Improve Thyroid\n Cancer Classification","summary":" Thyroid cancer is currently the fifth most common malignancy diagnosed in\nwomen. Since differentiation of cancer sub-types is important for treatment and\ncurrent, manual methods are time consuming and subjective, automatic\ncomputer-aided differentiation of cancer types is crucial. Manual\ndifferentiation of thyroid cancer is based on tissue sections, analysed by\npathologists using histological features. Due to the enormous size of gigapixel\nwhole slide images, holistic classification using deep learning methods is not\nfeasible. Patch based multiple instance learning approaches, combined with\naggregations such as bag-of-words, is a common approach. This work's\ncontribution is to extend a patch based state-of-the-art method by generating\nand combining feature vectors of three different patch resolutions and\nanalysing three distinct ways of combining them. The results showed\nimprovements in one of the three multi-scale approaches, while the others led\nto decreased scores. This provides motivation for analysis and discussion of\nthe individual approaches.\n","authors":["Maximilian E. Tschuchnig","Philipp Grubmüller","Lea M. Stangassinger","Christina Kreutzer","Sébastien Couillard-Després","Gertie J. 
Oostingh","Anton Hittmair","Michael Gadermayr"],"pdf_url":"https://arxiv.org/pdf/2204.10942v2.pdf","comment":"Accepted and presented at IPTA 2022 (Best Paper) edit: During work on\n this publication Maximilian Ernst Tschuchnig was affiliated with Salzburg\n University of Applied Sciences and University of Salzburg"},{"id":"http://arxiv.org/abs/2411.12440v3","updated":"2024-12-02T13:44:39Z","published":"2024-11-19T11:59:54Z","title":"Beyond Gaussians: Fast and High-Fidelity 3D Splatting with Linear\n Kernels","summary":" Recent advancements in 3D Gaussian Splatting (3DGS) have substantially\nimproved novel view synthesis, enabling high-quality reconstruction and\nreal-time rendering. However, blurring artifacts, such as floating primitives\nand over-reconstruction, remain challenging. Current methods address these\nissues by refining scene structure, enhancing geometric representations,\naddressing blur in training images, improving rendering consistency, and\noptimizing density control, yet the role of kernel design remains\nunderexplored. We identify the soft boundaries of Gaussian ellipsoids as one of\nthe causes of these artifacts, limiting detail capture in high-frequency\nregions. To bridge this gap, we introduce 3D Linear Splatting (3DLS), which\nreplaces Gaussian kernels with linear kernels to achieve sharper and more\nprecise results, particularly in high-frequency regions. Through evaluations on\nthree datasets, 3DLS demonstrates state-of-the-art fidelity and accuracy, along\nwith a 30% FPS improvement over baseline 3DGS. 
The implementation will be made\npublicly available upon acceptance.\n","authors":["Haodong Chen","Runnan Chen","Qiang Qu","Zhaoqing Wang","Tongliang Liu","Xiaoming Chen","Yuk Ying Chung"],"pdf_url":"https://arxiv.org/pdf/2411.12440v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12990v2","updated":"2024-12-02T13:40:59Z","published":"2023-12-20T12:48:18Z","title":"Multi-task Learning To Improve Semantic Segmentation Of CBCT Scans Using\n Image Reconstruction","summary":" Semantic segmentation is a crucial task in medical image processing,\nessential for segmenting organs or lesions such as tumors. In this study we aim\nto improve automated segmentation in CBCTs through multi-task learning. To\nevaluate effects on different volume qualities, a CBCT dataset is synthesised\nfrom the CT Liver Tumor Segmentation Benchmark (LiTS) dataset. To improve\nsegmentation, two approaches are investigated. First, we perform multi-task\nlearning to add morphology based regularization through a volume reconstruction\ntask. Second, we use this reconstruction task to reconstruct the best quality\nCBCT (most similar to the original CT), facilitating denoising effects. We\nexplore both holistic and patch-based approaches. 
Our findings reveal that,\nespecially using a patch-based approach, multi-task learning improves\nsegmentation in most cases and that these results can further be improved by\nour denoising approach.\n","authors":["Maximilian Ernst Tschuchnig","Julia Coste-Marin","Philipp Steininger","Michael Gadermayr"],"pdf_url":"https://arxiv.org/pdf/2312.12990v2.pdf","comment":"Accepted and presented at German Conference on Medical Image\n Computing (BVM) 2024 edit: During work on this publication Maximilian Ernst\n Tschuchnig was affiliated with Salzburg University of Applied Sciences and\n University of Salzburg"},{"id":"http://arxiv.org/abs/2406.11536v2","updated":"2024-12-02T13:20:23Z","published":"2024-06-17T13:38:57Z","title":"RO-SVD: A Reconfigurable Hardware Copyright Protection Framework for\n AIGC Applications","summary":" The dramatic surge in the utilisation of generative artificial intelligence\n(GenAI) underscores the need for a secure and efficient mechanism to\nresponsibly manage, use and disseminate multi-dimensional data generated by\nartificial intelligence (AI). In this paper, we propose a blockchain-based\ncopyright traceability framework called ring oscillator-singular value\ndecomposition (RO-SVD), which introduces decomposition computing to approximate\nlow-rank matrices generated from hardware entropy sources and establishes an\nAI-generated content (AIGC) copyright traceability mechanism at the device\nlevel. By leveraging the parallelism and reconfigurability of\nfield-programmable gate arrays (FPGAs), our framework can be easily constructed\non existing AI-accelerated devices and provide a low-cost solution to emerging\ncopyright issues of AIGC. We developed a hardware-software (HW/SW) co-design\nprototype based on comprehensive analysis and on-board experiments with\nmultiple AI-applicable FPGAs. 
Using AI-generated images as a case study, our\nframework demonstrated effectiveness and emphasised customisation,\nunpredictability, efficiency, management and reconfigurability. To the best of\nour knowledge, this is the first practical hardware study discussing and\nimplementing copyright traceability specifically for AI-generated content.\n","authors":["Zhuoheng Ran","Muhammad A. A. Abdelgawad","Zekai Zhang","Ray C. C. Cheung","Hong Yan"],"pdf_url":"https://arxiv.org/pdf/2406.11536v2.pdf","comment":"Accepted on 20 May 2024 as a full paper at ASAP 2024"},{"id":"http://arxiv.org/abs/2410.07926v2","updated":"2024-12-02T13:17:25Z","published":"2024-10-10T13:53:42Z","title":"Multimodal Perception System for Real Open Environment","summary":" This paper presents a novel multimodal perception system for a real open\nenvironment. The proposed system includes an embedded computation platform,\ncameras, ultrasonic sensors, GPS, and IMU devices. Unlike the traditional\nframeworks, our system integrates multiple sensors with advanced computer\nvision algorithms to help users walk outside reliably. The system can\nefficiently complete various tasks, including navigating to specific locations,\npassing through obstacle regions, and crossing intersections. Specifically, we\nalso use ultrasonic sensors and depth cameras to enhance obstacle avoidance\nperformance. The path planning module is designed to find the locally optimal\nroute based on various feedback and the user's current state. To evaluate the\nperformance of the proposed system, we design several experiments under\ndifferent scenarios. 
The results show that the system can help users walk\nefficiently and independently in complex situations.\n","authors":["Yuyang Sha"],"pdf_url":"https://arxiv.org/pdf/2410.07926v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18328v3","updated":"2024-12-02T13:04:47Z","published":"2023-11-30T07:58:54Z","title":"Advances in 3D Neural Stylization: A Survey","summary":" Modern artificial intelligence offers a novel and transformative approach to\ncreating digital art across diverse styles and modalities like images, videos\nand 3D data, unleashing the power of creativity and revolutionizing the way\nthat we perceive and interact with visual content. This paper reports on recent\nadvances in stylized 3D asset creation and manipulation with the expressive\npower of neural networks. We establish a taxonomy for neural stylization,\nconsidering crucial design choices such as scene representation, guidance data,\noptimization strategies, and output styles. Building on such taxonomy, our\nsurvey first revisits the background of neural stylization on 2D images, and\nthen presents in-depth discussions on recent neural stylization methods for 3D\ndata, accompanied by a benchmark evaluating selected mesh and neural field\nstylization methods. 
Based on the insights gained from the survey, we highlight\nthe practical significance, open challenges, future research, and potential\nimpacts of neural stylization, which facilitates researchers and practitioners\nto navigate the rapidly evolving landscape of 3D content creation using modern\nartificial intelligence.\n","authors":["Yingshu Chen","Guocheng Shao","Ka Chun Shum","Binh-Son Hua","Sai-Kit Yeung"],"pdf_url":"https://arxiv.org/pdf/2311.18328v3.pdf","comment":"curated list of papers:\n https://github.com/chenyingshu/advances_3d_neural_stylization"},{"id":"http://arxiv.org/abs/2312.14132v3","updated":"2024-12-02T13:00:56Z","published":"2023-12-21T18:52:14Z","title":"DUSt3R: Geometric 3D Vision Made Easy","summary":" Multi-view stereo reconstruction (MVS) in the wild requires to first estimate\nthe camera parameters e.g. intrinsic and extrinsic parameters. These are\nusually tedious and cumbersome to obtain, yet they are mandatory to triangulate\ncorresponding pixels in 3D space, which is the core of all best performing MVS\nalgorithms. In this work, we take an opposite stance and introduce DUSt3R, a\nradically novel paradigm for Dense and Unconstrained Stereo 3D Reconstruction\nof arbitrary image collections, i.e. operating without prior information about\ncamera calibration nor viewpoint poses. We cast the pairwise reconstruction\nproblem as a regression of pointmaps, relaxing the hard constraints of usual\nprojective camera models. We show that this formulation smoothly unifies the\nmonocular and binocular reconstruction cases. In the case where more than two\nimages are provided, we further propose a simple yet effective global alignment\nstrategy that expresses all pairwise pointmaps in a common reference frame. We\nbase our network architecture on standard Transformer encoders and decoders,\nallowing us to leverage powerful pretrained models. 
Our formulation directly\nprovides a 3D model of the scene as well as depth information, but\ninterestingly, we can seamlessly recover from it, pixel matches, relative and\nabsolute camera. Exhaustive experiments on all these tasks showcase that the\nproposed DUSt3R can unify various 3D vision tasks and set new SoTAs on\nmonocular/multi-view depth estimation as well as relative pose estimation. In\nsummary, DUSt3R makes many geometric 3D vision tasks easy.\n","authors":["Shuzhe Wang","Vincent Leroy","Yohann Cabon","Boris Chidlovskii","Jerome Revaud"],"pdf_url":"https://arxiv.org/pdf/2312.14132v3.pdf","comment":"fixing the ref for StaticThings3D dataset"},{"id":"http://arxiv.org/abs/2411.18025v2","updated":"2024-12-02T12:42:28Z","published":"2024-11-27T03:44:21Z","title":"Pixel-aligned RGB-NIR Stereo Imaging and Dataset for Robot Vision","summary":" Integrating RGB and NIR stereo imaging provides complementary spectral\ninformation, potentially enhancing robotic 3D vision in challenging lighting\nconditions. However, existing datasets and imaging systems lack pixel-level\nalignment between RGB and NIR images, posing challenges for downstream vision\ntasks. In this paper, we introduce a robotic vision system equipped with\npixel-aligned RGB-NIR stereo cameras and a LiDAR sensor mounted on a mobile\nrobot. The system simultaneously captures pixel-aligned pairs of RGB stereo\nimages, NIR stereo images, and temporally synchronized LiDAR points. Utilizing\nthe mobility of the robot, we present a dataset containing continuous video\nframes under diverse lighting conditions. We then introduce two methods that\nutilize the pixel-aligned RGB-NIR images: an RGB-NIR image fusion method and a\nfeature fusion method. The first approach enables existing RGB-pretrained\nvision models to directly utilize RGB-NIR information without fine-tuning. The\nsecond approach fine-tunes existing vision models to more effectively utilize\nRGB-NIR information. 
Experimental results demonstrate the effectiveness of\nusing pixel-aligned RGB-NIR images across diverse lighting conditions.\n","authors":["Jinnyeong Kim","Seung-Hwan Baek"],"pdf_url":"https://arxiv.org/pdf/2411.18025v2.pdf","comment":"8 pages for main article, 32 pages for supplemental document. Fix\n typos"},{"id":"http://arxiv.org/abs/2408.16886v3","updated":"2024-12-02T12:39:07Z","published":"2024-08-29T20:19:10Z","title":"LV-UNet: A Lightweight and Vanilla Model for Medical Image Segmentation","summary":" While large models have achieved significant progress in computer vision,\nchallenges such as optimization complexity, the intricacy of transformer\narchitectures, computational constraints, and practical application demands\nhighlight the importance of simpler model designs in medical image\nsegmentation. This need is particularly pronounced in mobile medical devices,\nwhich require lightweight, deployable models with real-time performance.\nHowever, existing lightweight models often suffer from poor robustness across\ndatasets, limiting their widespread adoption. To address these challenges, this\npaper introduces LV-UNet, a lightweight and vanilla model that leverages\npre-trained MobileNetv3-Large backbones and incorporates fusible modules.\nLV-UNet employs an enhanced deep training strategy and switches to a deployment\nmode during inference by re-parametrization, significantly reducing parameter\ncount and computational overhead. Experimental results on ISIC 2016, BUSI,\nCVC-ClinicDB, CVC-ColonDB, and Kvair-SEG datasets demonstrate a better\ntrade-off between performance and the computational load. 
The code will be\nreleased at \\url{https://github.com/juntaoJianggavin/LV-UNet}.\n","authors":["Juntao Jiang","Mengmeng Wang","Huizhong Tian","Lingbo Cheng","Yong Liu"],"pdf_url":"https://arxiv.org/pdf/2408.16886v3.pdf","comment":"Accepted by IEEE BIBM2024 ML4BMI workshop"},{"id":"http://arxiv.org/abs/2411.14951v2","updated":"2024-12-02T12:38:39Z","published":"2024-11-22T14:09:56Z","title":"Morph: A Motion-free Physics Optimization Framework for Human Motion\n Generation","summary":" Human motion generation plays a vital role in applications such as digital\nhumans and humanoid robot control. However, most existing approaches disregard\nphysics constraints, leading to the frequent production of physically\nimplausible motions with pronounced artifacts such as floating and foot\nsliding. In this paper, we propose \\textbf{Morph}, a\n\\textbf{Mo}tion-f\\textbf{r}ee \\textbf{ph}ysics optimization framework,\ncomprising a Motion Generator and a Motion Physics Refinement module, for\nenhancing physical plausibility without relying on costly real-world motion\ndata. Specifically, the Motion Generator is responsible for providing\nlarge-scale synthetic motion data, while the Motion Physics Refinement Module\nutilizes these synthetic data to train a motion imitator within a physics\nsimulator, enforcing physical constraints to project the noisy motions into a\nphysically-plausible space. 
These physically refined motions, in turn, are used\nto fine-tune the Motion Generator, further enhancing its capability.\nExperiments on both text-to-motion and music-to-dance generation tasks\ndemonstrate that our framework achieves state-of-the-art motion generation\nquality while improving physical plausibility drastically.\n","authors":["Zhuo Li","Mingshuang Luo","Ruibing Hou","Xin Zhao","Hao Liu","Hong Chang","Zimo Liu","Chen Li"],"pdf_url":"https://arxiv.org/pdf/2411.14951v2.pdf","comment":"15 pages, 6 figures"},{"id":"http://arxiv.org/abs/2406.07648v2","updated":"2024-12-02T12:23:47Z","published":"2024-06-11T18:29:13Z","title":"Multi-View Large Reconstruction Model via Geometry-Aware Positional\n Encoding and Attention","summary":" Despite recent advancements in the Large Reconstruction Model (LRM)\ndemonstrating impressive results, when extending its input from single image to\nmultiple images, it exhibits inefficiencies, subpar geometric and texture\nquality, as well as slower convergence speed than expected. It is attributed to\nthat, LRM formulates 3D reconstruction as a naive images-to-3D translation\nproblem, ignoring the strong 3D coherence among the input images. In this\npaper, we propose a Multi-view Large Reconstruction Model (M-LRM) designed to\nreconstruct high-quality 3D shapes from multi-views in a 3D-aware manner.\nSpecifically, we introduce a multi-view consistent cross-attention scheme to\nenable M-LRM to accurately query information from the input images. Moreover,\nwe employ the 3D priors of the input multi-view images to initialize the\ntriplane tokens. Compared to previous methods, the proposed M-LRM can generate\n3D shapes of high fidelity. 
Experimental studies demonstrate that our model\nachieves a significant performance gain and faster training convergence.\nProject page: \\url{https://murphylmf.github.io/M-LRM/}.\n","authors":["Mengfei Li","Xiaoxiao Long","Yixun Liang","Weiyu Li","Yuan Liu","Peng Li","Wenhan Luo","Wenping Wang","Yike Guo"],"pdf_url":"https://arxiv.org/pdf/2406.07648v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.13272v2","updated":"2024-12-02T12:18:12Z","published":"2024-06-19T07:08:48Z","title":"AniFaceDiff: Animating Stylized Avatars via Parametric Conditioned\n Diffusion Models","summary":" Animating stylized avatars with dynamic poses and expressions has attracted\nincreasing attention for its broad range of applications. Previous research has\nmade significant progress by training controllable generative models to\nsynthesize animations based on reference characteristics, pose, and expression\nconditions. However, the mechanisms used in these methods to control pose and\nexpression often inadvertently introduce unintended features from the target\nmotion, while also causing a loss of expression-related details, particularly\nwhen applied to stylized animation. This paper proposes a new method based on\nStable Diffusion, called AniFaceDiff, incorporating a new conditioning module\nfor animating stylized avatars. First, we propose a refined spatial\nconditioning approach by Facial Alignment to prevent the inclusion of identity\ncharacteristics from the target motion. Then, we introduce an Expression\nAdapter that incorporates additional cross-attention layers to address the\npotential loss of expression-related information. Our approach effectively\npreserves pose and expression from the target video while maintaining input\nimage consistency. 
Extensive experiments demonstrate that our method achieves\nstate-of-the-art results, showcasing superior image quality, preservation of\nreference features, and expression accuracy, particularly for out-of-domain\nanimation across diverse styles, highlighting its versatility and strong\ngeneralization capabilities. This work aims to enhance the quality of virtual\nstylized animation for positive applications. To promote responsible use in\nvirtual environments, we contribute to the advancement of detection for\ngenerative content by evaluating state-of-the-art detectors, highlighting\npotential areas for improvement, and suggesting solutions.\n","authors":["Ken Chen","Sachith Seneviratne","Wei Wang","Dongting Hu","Sanjay Saha","Md. Tarek Hasan","Sanka Rasnayaka","Tamasha Malepathirana","Mingming Gong","Saman Halgamuge"],"pdf_url":"https://arxiv.org/pdf/2406.13272v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.13656v4","updated":"2024-12-02T12:11:13Z","published":"2023-01-31T14:18:19Z","title":"A Survey and Benchmark of Automatic Surface Reconstruction from Point\n Clouds","summary":" We present a comprehensive survey and benchmark of both traditional and\nlearning-based methods for surface reconstruction from point clouds. This task\nis particularly challenging for real-world acquisitions due to factors such as\nnoise, outliers, non-uniform sampling, and missing data. Traditional approaches\noften simplify the problem by imposing handcrafted priors on either the input\npoint clouds or the resulting surface, a process that can require tedious\nhyperparameter tuning. In contrast, deep learning models have the capability to\ndirectly learn the properties of input point clouds and desired surfaces from\ndata. We study the influence of handcrafted and learned priors on the precision\nand robustness of surface reconstruction techniques. We evaluate various\ntime-tested and contemporary methods in a standardized manner. 
When both\ntrained and evaluated on point clouds with identical characteristics, the\nlearning-based models consistently produce higher-quality surfaces compared to\ntheir traditional counterparts -- even in scenarios involving novel shape\ncategories. However, traditional methods demonstrate greater resilience to the\ndiverse anomalies commonly found in real-world 3D acquisitions. For the benefit\nof the research community, we make our code and datasets available, inviting\nfurther enhancements to learning-based surface reconstruction. This can be\naccessed at https://github.com/raphaelsulzer/dsr-benchmark .\n","authors":["Raphael Sulzer","Renaud Marlet","Bruno Vallet","Loic Landrieu"],"pdf_url":"https://arxiv.org/pdf/2301.13656v4.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2411.03795v4","updated":"2024-12-02T12:09:39Z","published":"2024-11-06T09:39:52Z","title":"VQA$^2$: Visual Question Answering for Video Quality Assessment","summary":" The advent and proliferation of large multi-modal models (LMMs) have\nintroduced new paradigms to computer vision, transforming various tasks into a\nunified visual question answering framework. Video Quality Assessment (VQA), a\nclassic field in low-level visual perception, focused initially on quantitative\nvideo quality scoring. However, driven by advances in LMMs, it is now\nprogressing toward more holistic visual quality understanding tasks. Recent\nstudies in the image domain have demonstrated that Visual Question Answering\n(VQA) can markedly enhance low-level visual quality evaluation. Nevertheless,\nrelated work has not been explored in the video domain, leaving substantial\nroom for improvement. To address this gap, we introduce the VQA2 Instruction\nDataset - the first visual question answering instruction dataset that focuses\non video quality assessment. 
This dataset consists of 3 subsets and covers\nvarious video types, containing 157,755 instruction question-answer pairs.\nThen, leveraging this foundation, we present the VQA2 series models. The VQA2\nseries models interleave visual and motion tokens to enhance the perception of\nspatial-temporal quality details in videos. We conduct extensive experiments on\nvideo quality scoring and understanding tasks, and results demonstrate that the\nVQA2series models achieve excellent performance in both tasks. Notably, our\nfinal model, the VQA2-Assistant, exceeds the renowned GPT-4o in visual quality\nunderstanding tasks while maintaining strong competitiveness in quality scoring\ntasks. Our work provides a foundation and feasible approach for integrating\nlow-level video quality assessment and understanding with LMMs.\n","authors":["Ziheng Jia","Zicheng Zhang","Jiaying Qian","Haoning Wu","Wei Sun","Chunyi Li","Xiaohong Liu","Weisi Lin","Guangtao Zhai","Xiongkuo Min"],"pdf_url":"https://arxiv.org/pdf/2411.03795v4.pdf","comment":"23 pages 12 figures"},{"id":"http://arxiv.org/abs/2410.23132v2","updated":"2024-12-02T12:05:29Z","published":"2024-10-30T15:42:59Z","title":"Revisiting MAE pre-training for 3D medical image segmentation","summary":" Self-Supervised Learning (SSL) presents an exciting opportunity to unlock the\npotential of vast, untapped clinical datasets, for various downstream\napplications that suffer from the scarcity of labeled data. While SSL has\nrevolutionized fields like natural language processing and computer vision, its\nadoption in 3D medical image computing has been limited by three key pitfalls:\nSmall pre-training dataset sizes, architectures inadequate for 3D medical image\nanalysis, and insufficient evaluation practices. In this paper, we address\nthese issues by i) leveraging a large-scale dataset of 39k 3D brain MRI volumes\nand ii) using a Residual Encoder U-Net architecture within the state-of-the-art\nnnU-Net framework. 
iii) A robust development framework, incorporating 5\ndevelopment and 8 testing brain MRI segmentation datasets, allowed\nperformance-driven design decisions to optimize the simple concept of Masked\nAuto Encoders (MAEs) for 3D CNNs. The resulting model not only surpasses\nprevious SSL methods but also outperforms the strong nnU-Net baseline by an\naverage of approximately 3 Dice points setting a new state-of-the-art. Our code\nand models are made available here.\n","authors":["Tassilo Wald","Constantin Ulrich","Stanislav Lukyanenko","Andrei Goncharov","Alberto Paderno","Leander Maerkisch","Paul F. Jäger","Klaus Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2410.23132v2.pdf","comment":"Arxiv Preprint. Revised and under review"},{"id":"http://arxiv.org/abs/2411.06098v3","updated":"2024-12-02T11:49:05Z","published":"2024-11-09T07:19:56Z","title":"An Architectural Approach to Enhance Deep Long-Tailed Learning","summary":" Deep long-tailed recognition has been widely studied to address the issue of\nimbalanced data distributions in real-world scenarios. However, there has been\ninsufficient focus on the design of neural architectures, despite empirical\nevidence suggesting that architecture can significantly impact performance. In\nthis paper, we attempt to mitigate long-tailed issues through architectural\nimprovements. To simplify the design process, we utilize Differential\nArchitecture Search (DARTS) to achieve this goal. Unfortunately, existing DARTS\nmethods struggle to perform well in long-tailed scenarios. To tackle this\nchallenge, we introduce Long-Tailed Differential Architecture Search (LTDAS).\nSpecifically, we conduct extensive experiments to explore architectural\ncomponents that demonstrate better performance on long-tailed data and propose\na new search space based on our observations. This ensures that the\narchitecture obtained through our search process incorporates superior\ncomponents. 
Additionally, we propose replacing the learnable linear classifier\nwith an Equiangular Tight Frame (ETF) classifier to further enhance our method.\nThis classifier effectively alleviates the biased search process and prevents\nperformance collapse. Extensive experimental evaluations demonstrate that our\napproach consistently improves upon existing methods from an orthogonal\nperspective and achieves state-of-the-art results with simple enhancements.\n","authors":["Yuhan Pan","Yanan Sun","Wei Gong"],"pdf_url":"https://arxiv.org/pdf/2411.06098v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06892v2","updated":"2024-12-02T11:24:20Z","published":"2024-03-11T16:48:25Z","title":"Real-time Transformer-based Open-Vocabulary Detection with Efficient\n Fusion Head","summary":" End-to-end transformer-based detectors (DETRs) have shown exceptional\nperformance in both closed-set and open-vocabulary object detection (OVD) tasks\nthrough the integration of language modalities. However, their demanding\ncomputational requirements have hindered their practical application in\nreal-time object detection (OD) scenarios. In this paper, we scrutinize the\nlimitations of two leading models in the OVDEval benchmark, OmDet and\nGrounding-DINO, and introduce OmDet-Turbo. This novel transformer-based\nreal-time OVD model features an innovative Efficient Fusion Head (EFH) module\ndesigned to alleviate the bottlenecks observed in OmDet and Grounding-DINO.\nNotably, OmDet-Turbo-Base achieves a 100.2 frames per second (FPS) with\nTensorRT and language cache techniques applied. Notably, in zero-shot scenarios\non COCO and LVIS datasets, OmDet-Turbo achieves performance levels nearly on\npar with current state-of-the-art supervised models. Furthermore, it\nestablishes new state-of-the-art benchmarks on ODinW and OVDEval, boasting an\nAP of 30.1 and an NMS-AP of 26.86, respectively. 
The practicality of\nOmDet-Turbo in industrial applications is underscored by its exceptional\nperformance on benchmark datasets and superior inference speed, positioning it\nas a compelling choice for real-time object detection tasks. Code:\n\\url{https://github.com/om-ai-lab/OmDet}\n","authors":["Tiancheng Zhao","Peng Liu","Xuan He","Lu Zhang","Kyusong Lee"],"pdf_url":"https://arxiv.org/pdf/2403.06892v2.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2405.12789v3","updated":"2024-12-02T11:16:09Z","published":"2024-05-21T13:40:30Z","title":"Anticipating Object State Changes in Long Procedural Videos","summary":" In this work, we introduce (a) the new problem of anticipating object state\nchanges in images and videos during procedural activities, (b) new curated\nannotation data for object state change classification based on the Ego4D\ndataset, and (c) the first method for addressing this challenging problem.\nSolutions to this new task have important implications in vision-based scene\nunderstanding, automated monitoring systems, and action planning. The proposed\nnovel framework predicts object state changes that will occur in the near\nfuture due to yet unseen human actions by integrating learned visual features\nthat represent recent visual information with natural language (NLP) features\nthat represent past object state changes and actions. Leveraging the extensive\nand challenging Ego4D dataset which provides a large-scale collection of\nfirst-person perspective videos across numerous interaction scenarios, we\nintroduce an extension noted Ego4D-OSCA that provides new curated annotation\ndata for the object state change anticipation task (OSCA). An extensive\nexperimental evaluation is presented demonstrating the proposed method's\nefficacy in predicting object state changes in dynamic scenarios. 
The\nperformance of the proposed approach also underscores the potential of\nintegrating video and linguistic cues to enhance the predictive performance of\nvideo understanding systems and lays the groundwork for future research on the\nnew task of object state change anticipation. The source code and the new\nannotation data (Ego4D-OSCA) will be made publicly available.\n","authors":["Victoria Manousaki","Konstantinos Bacharidis","Filippos Gouidis","Konstantinos Papoutsakis","Dimitris Plexousakis","Antonis Argyros"],"pdf_url":"https://arxiv.org/pdf/2405.12789v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04632v2","updated":"2024-12-02T10:48:28Z","published":"2024-11-07T11:35:31Z","title":"Improved Multi-Task Brain Tumour Segmentation with Synthetic Data\n Augmentation","summary":" This paper presents the winning solution of task 1 and the third-placed\nsolution of task 3 of the BraTS challenge. The use of automated tools in\nclinical practice has increased due to the development of more and more\nsophisticated and reliable algorithms. However, achieving clinical standards\nand developing tools for real-life scenarios is a major challenge. To this end,\nBraTS has organised tasks to find the most advanced solutions for specific\npurposes. In this paper, we propose the use of synthetic data to train\nstate-of-the-art frameworks in order to improve the segmentation of adult\ngliomas in a post-treatment scenario, and the segmentation of meningioma for\nradiotherapy planning. Our results suggest that the use of synthetic data leads\nto more robust algorithms, although the synthetic data generation pipeline is\nnot directly suited to the meningioma task. 
In task 1, we achieved a DSC of\n0.7900, 0.8076, 0.7760, 0.8926, 0.7874, 0.8938 and a HD95 of 35.63, 30.35,\n44.58, 16.87, 38.19, 17.95 for ET, NETC, RC, SNFH, TC and WT, respectively and,\nin task 3, we achieved a DSC of 0.801 and HD95 of 38.26, in the testing phase.\nThe code for these tasks is available at\nhttps://github.com/ShadowTwin41/BraTS_2023_2024_solutions.\n","authors":["André Ferreira","Tiago Jesus","Behrus Puladi","Jens Kleesiek","Victor Alves","Jan Egger"],"pdf_url":"https://arxiv.org/pdf/2411.04632v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.09874v4","updated":"2024-12-02T10:33:19Z","published":"2023-03-17T10:38:27Z","title":"Image Statistics Predict the Sensitivity of Perceptual Quality Metrics","summary":" Previously, Barlow and Attneave hypothesised a link between biological vision\nand information maximisation. Following Shannon, information was defined using\nthe probability of natural images. Several physiological and psychophysical\nphenomena have been derived from principles like info-max, efficient coding, or\noptimal denoising. However, it remains unclear how this link is expressed in\nmathematical terms from image probability. Classical derivations were subjected\nto strong assumptions on the probability models and on the behaviour of the\nsensors. Moreover, the direct evaluation of the hypothesis was limited by the\ninability of classical image models to deliver accurate estimates of the\nprobability. Here, we directly evaluate image probabilities using a generative\nmodel for natural images, and analyse how probability-related factors can be\ncombined to predict the sensitivity of state-of-the-art subjective image\nquality metrics, a proxy for human perception. 
We use information theory and\nregression analysis to find a simple model that when combining just two\nprobability-related factors achieves 0.77 correlation with subjective metrics.\nThis probability-based model is validated in two ways: through direct\ncomparison with the opinion of real observers in a subjective quality\nexperiment, and by reproducing basic trends of classical psychophysical facts\nsuch as the Contrast Sensitivity Function, the Weber-law, and contrast masking.\n","authors":["Alexander Hepburn","Valero Laparra","Raúl Santos-Rodriguez","Jesús Malo"],"pdf_url":"https://arxiv.org/pdf/2303.09874v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01574v4","updated":"2024-12-02T10:26:39Z","published":"2023-09-04T12:53:54Z","title":"Object-Size-Driven Design of Convolutional Neural Networks: Virtual Axle\n Detection based on Raw Data","summary":" As infrastructure ages, the need for efficient monitoring methods becomes\nincreasingly critical. Bridge Weigh-In-Motion (BWIM) systems are crucial for\ncost-effective determination of loads and, consequently, the residual service\nlife of road and railway infrastructure. However, conventional BWIM systems\nrequire additional sensors for axle detection, which must be installed in\npotentially inaccessible locations or places that interfere with bridge\noperation.\n This study presents a novel approach for real-time detection of train axles\nusing sensors arbitrarily placed on bridges, providing an alternative to\ndedicated axle detectors. The developed Virtual Axle Detector with Enhanced\nReceptive Field (VADER) has been validated on a single-track railway bridge\nusing only acceleration measurements, detecting 99.9% of axles with a spatial\nerror of 3.69cm. 
Using raw data as input outperformed the state-of-the-art\nspectrogram-based method in both speed and memory usage by 99%, thereby making\nreal-time application feasible for the first time.\n Additionally, we introduce the Maximum Receptive Field (MRF) rule, a novel\napproach to optimise hyperparameters of Convolutional Neural Networks (CNNs)\nbased on the size of objects. In this context, the object size relates to the\nfundamental frequency of a bridge. The MRF rule effectively narrows the\nhyperparameter search space, overcoming the need for extensive hyperparameter\ntuning. Since the MRF rule can theoretically be applied to all unstructured\ndata, it could have implications for a wide range of deep learning problems,\nfrom earthquake prediction to object recognition.\n","authors":["Henik Riedel","Robert Steven Lorenzen","Clemens Hübler"],"pdf_url":"https://arxiv.org/pdf/2309.01574v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.12044v3","updated":"2024-12-02T10:17:48Z","published":"2024-06-17T19:31:24Z","title":"ARTIST: Improving the Generation of Text-rich Images with Disentangled\n Diffusion Models and Large Language Models","summary":" Diffusion models have demonstrated exceptional capabilities in generating a\nbroad spectrum of visual content, yet their proficiency in rendering text is\nstill limited: they often generate inaccurate characters or words that fail to\nblend well with the underlying image. To address these shortcomings, we\nintroduce a novel framework named, ARTIST, which incorporates a dedicated\ntextual diffusion model to focus on the learning of text structures\nspecifically. Initially, we pretrain this textual model to capture the\nintricacies of text representation. Subsequently, we finetune a visual\ndiffusion model, enabling it to assimilate textual structure information from\nthe pretrained textual model. 
This disentangled architecture design and\ntraining strategy significantly enhance the text rendering ability of the\ndiffusion models for text-rich image generation. Additionally, we leverage the\ncapabilities of pretrained large language models to interpret user intentions\nbetter, contributing to improved generation quality. Empirical results on the\nMARIO-Eval benchmark underscore the effectiveness of the proposed method,\nshowing an improvement of up to 15% in various metrics.\n","authors":["Jianyi Zhang","Yufan Zhou","Jiuxiang Gu","Curtis Wigington","Tong Yu","Yiran Chen","Tong Sun","Ruiyi Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.12044v3.pdf","comment":"Accepted to WACV 2025"},{"id":"http://arxiv.org/abs/2303.17550v6","updated":"2024-12-02T10:06:28Z","published":"2023-03-30T17:18:31Z","title":"DAE-Talker: High Fidelity Speech-Driven Talking Face Generation with\n Diffusion Autoencoder","summary":" While recent research has made significant progress in speech-driven talking\nface generation, the quality of the generated video still lags behind that of\nreal recordings. One reason for this is the use of handcrafted intermediate\nrepresentations like facial landmarks and 3DMM coefficients, which are designed\nbased on human knowledge and are insufficient to precisely describe facial\nmovements. Additionally, these methods require an external pretrained model for\nextracting these representations, whose performance sets an upper bound on\ntalking face generation. To address these limitations, we propose a novel\nmethod called DAE-Talker that leverages data-driven latent representations\nobtained from a diffusion autoencoder (DAE). DAE contains an image encoder that\nencodes an image into a latent vector and a DDIM image decoder that\nreconstructs the image from it. We train our DAE on talking face video frames\nand then extract their latent representations as the training target for a\nConformer-based speech2latent model. 
This allows DAE-Talker to synthesize full\nvideo frames and produce natural head movements that align with the content of\nspeech, rather than relying on a predetermined head pose from a template video.\nWe also introduce pose modelling in speech2latent for pose controllability.\nAdditionally, we propose a novel method for generating continuous video frames\nwith the DDIM image decoder trained on individual frames, eliminating the need\nfor modelling the joint distribution of consecutive frames directly. Our\nexperiments show that DAE-Talker outperforms existing popular methods in\nlip-sync, video fidelity, and pose naturalness. We also conduct ablation\nstudies to analyze the effectiveness of the proposed techniques and demonstrate\nthe pose controllability of DAE-Talker.\n","authors":["Chenpeng Du","Qi Chen","Tianyu He","Xu Tan","Xie Chen","Kai Yu","Sheng Zhao","Jiang Bian"],"pdf_url":"https://arxiv.org/pdf/2303.17550v6.pdf","comment":"Accepted to ACM Multimedia 2023"},{"id":"http://arxiv.org/abs/2409.15344v2","updated":"2024-12-02T09:45:07Z","published":"2024-09-10T07:04:48Z","title":"Video-Driven Graph Network-Based Simulators","summary":" Lifelike visualizations in design, cinematography, and gaming rely on precise\nphysics simulations, typically requiring extensive computational resources and\ndetailed physical input. This paper presents a method that can infer a system's\nphysical properties from a short video, eliminating the need for explicit\nparameter input, provided it is close to the training condition. The learned\nrepresentation is then used within a Graph Network-based Simulator to emulate\nthe trajectories of physical systems. 
We demonstrate that the video-derived\nencodings effectively capture the physical properties of the system and\nshowcase a linear dependence between some of the encodings and the system's\nmotion.\n","authors":["Franciszek Szewczyk","Gilles Louppe","Matthia Sabatelli"],"pdf_url":"https://arxiv.org/pdf/2409.15344v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.18810v2","updated":"2024-12-02T09:10:34Z","published":"2024-11-27T23:32:54Z","title":"Enhancing Compositional Text-to-Image Generation with Reliable Random\n Seeds","summary":" Text-to-image diffusion models have demonstrated remarkable capability in\ngenerating realistic images from arbitrary text prompts. However, they often\nproduce inconsistent results for compositional prompts such as \"two dogs\" or \"a\npenguin on the right of a bowl\". Understanding these inconsistencies is crucial\nfor reliable image generation. In this paper, we highlight the significant role\nof initial noise in these inconsistencies, where certain noise patterns are\nmore reliable for compositional prompts than others. Our analyses reveal that\ndifferent initial random seeds tend to guide the model to place objects in\ndistinct image areas, potentially adhering to specific patterns of camera\nangles and image composition associated with the seed. To improve the model's\ncompositional ability, we propose a method for mining these reliable cases,\nresulting in a curated training set of generated images without requiring any\nmanual annotation. By fine-tuning text-to-image models on these generated\nimages, we significantly enhance their compositional capabilities. For\nnumerical composition, we observe relative increases of 29.3% and 19.5% for\nStable Diffusion and PixArt-{\\alpha}, respectively. 
Spatial composition sees\neven larger gains, with 60.7% for Stable Diffusion and 21.1% for\nPixArt-{\\alpha}.\n","authors":["Shuangqi Li","Hieu Le","Jingyi Xu","Mathieu Salzmann"],"pdf_url":"https://arxiv.org/pdf/2411.18810v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.15778v2","updated":"2024-12-02T09:06:32Z","published":"2024-11-24T10:58:48Z","title":"Enhancing the automatic segmentation and analysis of 3D liver\n vasculature models","summary":" Surgical assessment of liver cancer patients requires identification of the\nvessel trees from medical images. Specifically, the venous trees - the portal\n(perfusing) and the hepatic (draining) trees are important for understanding\nthe liver anatomy and disease state, and perform surgery planning. This\nresearch aims to improve the 3D segmentation, skeletonization, and subsequent\nanalysis of vessel trees, by creating an automatic pipeline based on deep\nlearning and image processing techniques.\n The first part of this work explores the impact of differentiable\nskeletonization methods such as ClDice and morphological skeletonization loss,\non the overall liver vessel segmentation performance. To this aim, it studies\nhow to improve vessel tree connectivity.\n The second part of this study converts a single class vessel segmentation\ninto multi-class ones, separating the two venous trees. It builds on the\nprevious two-class vessel segmentation model, which vessel tree outputs might\nbe entangled, and on connected components and skeleton analyses of the trees.\n After providing sub-labeling of the specific anatomical branches of each\nvenous tree, these algorithms also enable a morphometric analysis of the vessel\ntrees by extracting various geometrical markers.\n In conclusion, we propose a method that successfully improves current\nskeletonization methods, for extensive vascular trees that contain vessels of\ndifferent calibers. 
The separation algorithm creates a clean multi-class\nsegmentation of the vessels, validated by surgeons to provide low error. A new,\npublicly shared high-quality liver vessel dataset of 77 cases is thus created.\nFinally a method to annotate vessel trees according to anatomy is provided,\nenabling a unique liver vessel morphometry analysis.\n","authors":["Yassine Machta","Omar Ali","Kevin Hakkakian","Ana Vlasceanu","Amaury Facque","Nicolas Golse","Irene Vignon-Clementel"],"pdf_url":"https://arxiv.org/pdf/2411.15778v2.pdf","comment":"Internship at Simbiotx"},{"id":"http://arxiv.org/abs/2411.17772v2","updated":"2024-12-02T09:04:20Z","published":"2024-11-26T08:55:20Z","title":"MVBoost: Boost 3D Reconstruction with Multi-View Refinement","summary":" Recent advancements in 3D object reconstruction have been remarkable, yet\nmost current 3D models rely heavily on existing 3D datasets. The scarcity of\ndiverse 3D datasets results in limited generalization capabilities of 3D\nreconstruction models. In this paper, we propose a novel framework for boosting\n3D reconstruction with multi-view refinement (MVBoost) by generating pseudo-GT\ndata. The key of MVBoost is combining the advantages of the high accuracy of\nthe multi-view generation model and the consistency of the 3D reconstruction\nmodel to create a reliable data source. Specifically, given a single-view input\nimage, we employ a multi-view diffusion model to generate multiple views,\nfollowed by a large 3D reconstruction model to produce consistent 3D data.\nMVBoost then adaptively refines these multi-view images, rendered from the\nconsistent 3D data, to build a large-scale multi-view dataset for training a\nfeed-forward 3D reconstruction model. Additionally, the input view optimization\nis designed to optimize the corresponding viewpoints based on the user's input\nimage, ensuring that the most important viewpoint is accurately tailored to the\nuser's needs. 
Extensive evaluations demonstrate that our method achieves\nsuperior reconstruction results and robust generalization compared to prior\nworks.\n","authors":["Xiangyu Liu","Xiaomei Zhang","Zhiyuan Ma","Xiangyu Zhu","Zhen Lei"],"pdf_url":"https://arxiv.org/pdf/2411.17772v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.17213v2","updated":"2024-12-02T08:59:20Z","published":"2024-11-26T08:29:24Z","title":"Scaling nnU-Net for CBCT Segmentation","summary":" This paper presents our approach to scaling the nnU-Net framework for\nmulti-structure segmentation on Cone Beam Computed Tomography (CBCT) images,\nspecifically in the scope of the ToothFairy2 Challenge. We leveraged the\nnnU-Net ResEnc L model, introducing key modifications to patch size, network\ntopology, and data augmentation strategies to address the unique challenges of\ndental CBCT imaging. Our method achieved a mean Dice coefficient of 0.9253 and\nHD95 of 18.472 on the test set, securing a mean rank of 4.6 and with it the\nfirst place in the ToothFairy2 challenge. The source code is publicly\navailable, encouraging further research and development in the field.\n","authors":["Fabian Isensee","Yannick Kirchhoff","Lars Kraemer","Maximilian Rokuss","Constantin Ulrich","Klaus H. Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2411.17213v2.pdf","comment":"Fabian Isensee and Yannick Kirchhoff contributed equally"},{"id":"http://arxiv.org/abs/2409.09318v2","updated":"2024-12-02T08:51:09Z","published":"2024-09-14T05:31:29Z","title":"ODE: Open-Set Evaluation of Hallucinations in Multimodal Large Language\n Models","summary":" Hallucination poses a persistent challenge for multimodal large language\nmodels (MLLMs). 
However, existing benchmarks for evaluating hallucinations are\ngenerally static, which may overlook the potential risk of data contamination.\nTo address this issue, we propose ODE, an open-set, dynamic protocol designed\nto evaluate object hallucinations in MLLMs at both the existence and attribute\nlevels. ODE employs a graph-based structure to represent real-world object\nconcepts, their attributes, and the distributional associations between them.\nThis structure facilitates the extraction of concept combinations based on\ndiverse distributional criteria, generating varied samples for structured\nqueries that evaluate hallucinations in both generative and discriminative\ntasks. Through the generation of new samples, dynamic concept combinations, and\nvaried distribution frequencies, ODE mitigates the risk of data contamination\nand broadens the scope of evaluation. This protocol is applicable to both\ngeneral and specialized scenarios, including those with limited data.\nExperimental results demonstrate the effectiveness of our protocol, revealing\nthat MLLMs exhibit higher hallucination rates when evaluated with ODE-generated\nsamples, which indicates potential data contamination. Furthermore, these\ngenerated samples aid in analyzing hallucination patterns and fine-tuning\nmodels, offering an effective approach to mitigating hallucinations in MLLMs.\n","authors":["Yahan Tu","Rui Hu","Jitao Sang"],"pdf_url":"https://arxiv.org/pdf/2409.09318v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.11522v2","updated":"2024-12-02T08:43:40Z","published":"2024-07-16T09:00:45Z","title":"FIRE: A Dataset for Feedback Integration and Refinement Evaluation of\n Multimodal Models","summary":" Vision language models (VLMs) have achieved impressive progress in diverse\napplications, becoming a prevalent research direction. 
In this paper, we build\nFIRE, a feedback-refinement dataset, consisting of 1.1M multi-turn\nconversations that are derived from 27 source datasets, empowering VLMs to\nspontaneously refine their responses based on user feedback across diverse\ntasks. To scale up the data collection, FIRE is collected in two components:\nFIRE-100K and FIRE-1M, where FIRE-100K is generated by GPT-4V, and FIRE-1M is\nfreely generated via models trained on FIRE-100K. Then, we build FIRE-Bench, a\nbenchmark to comprehensively evaluate the feedback-refining capability of VLMs,\nwhich contains 11K feedback-refinement conversations as the test data, two\nevaluation settings, and a model to provide feedback for VLMs. We develop the\nFIRE-LLaVA model by fine-tuning LLaVA on FIRE-100K and FIRE-1M, which shows\nremarkable feedback-refining capability on FIRE-Bench and outperforms untrained\nVLMs by 50%, making more efficient user-agent interactions and underscoring the\nsignificance of the FIRE dataset.\n","authors":["Pengxiang Li","Zhi Gao","Bofei Zhang","Tao Yuan","Yuwei Wu","Mehrtash Harandi","Yunde Jia","Song-Chun Zhu","Qing Li"],"pdf_url":"https://arxiv.org/pdf/2407.11522v2.pdf","comment":"NeurIPS 2024 Dataset & Benchmark Track"},{"id":"http://arxiv.org/abs/2410.07278v2","updated":"2024-12-02T08:43:33Z","published":"2024-10-09T07:13:22Z","title":"PAR: Prompt-Aware Token Reduction Method for Efficient Large Multimodal\n Models","summary":" Multimodal large language models (MLLMs) demonstrate strong performance\nacross visual tasks, but their efficiency is hindered by significant\ncomputational and memory demands from processing long contexts in multimodal\ninputs. To address this, we introduce PAR (Prompt-Aware Token Reduction), a\nnovel and plug-and-play approach that reduces visual tokens efficiently without\ncompromising model performance. 
Unlike previous methods that rely heavily on\nattention mechanisms and overlooking cross-modal interactions , we uses a\nprompt-aware strategy to adpative identify and cluster essential visual tokens.\nPAR categorizes visual context redundancy into two types: external and\ninternal. External redundancy is minimized through semantic retrieval, while\ninternal redundancy is addressed using a token routing mechanism. This method\nsubstantially reduces computational load without requiring additional training\nor complex architectural modifications. \\textbf{Experimental results\ndemonstrate that across various visual question answering tasks, PAR reduces\nFLOPs by 83\\% with a compression ratio of 89\\%, while retaining 97\\% of\nbaseline accuracy.} The adaptive design of PAR achieves a 2x token reduction\nratio compared to prior approaches, enabling a better balance between\nperformance and efficiency.\n","authors":["Yingen Liu","Fan Wu","Ruihui Li","Zhuo Tang","Kenli Li"],"pdf_url":"https://arxiv.org/pdf/2410.07278v2.pdf","comment":"10 pages, 5 figures,3 tables"},{"id":"http://arxiv.org/abs/2405.16605v2","updated":"2024-12-02T08:41:46Z","published":"2024-05-26T15:31:09Z","title":"Demystify Mamba in Vision: A Linear Attention Perspective","summary":" Mamba is an effective state space model with linear computation complexity.\nIt has recently shown impressive efficiency in dealing with high-resolution\ninputs across various vision tasks. In this paper, we reveal that the powerful\nMamba model shares surprising similarities with linear attention Transformer,\nwhich typically underperform conventional Transformer in practice. By exploring\nthe similarities and disparities between the effective Mamba and subpar linear\nattention Transformer, we provide comprehensive analyses to demystify the key\nfactors behind Mamba's success. 
Specifically, we reformulate the selective\nstate space model and linear attention within a unified formulation, rephrasing\nMamba as a variant of linear attention Transformer with six major distinctions:\ninput gate, forget gate, shortcut, no attention normalization, single-head, and\nmodified block design. For each design, we meticulously analyze its pros and\ncons, and empirically evaluate its impact on model performance in vision tasks.\nInterestingly, the results highlight the forget gate and block design as the\ncore contributors to Mamba's success, while the other four designs are less\ncrucial. Based on these findings, we propose a Mamba-Inspired Linear Attention\n(MILA) model by incorporating the merits of these two key designs into linear\nattention. The resulting model outperforms various vision Mamba models in both\nimage classification and high-resolution dense prediction tasks, while enjoying\nparallelizable computation and fast inference speed. Code is available at\nhttps://github.com/LeapLabTHU/MLLA.\n","authors":["Dongchen Han","Ziyi Wang","Zhuofan Xia","Yizeng Han","Yifan Pu","Chunjiang Ge","Jun Song","Shiji Song","Bo Zheng","Gao Huang"],"pdf_url":"https://arxiv.org/pdf/2405.16605v2.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2303.10211v5","updated":"2024-12-02T08:37:24Z","published":"2023-03-17T19:00:24Z","title":"SITReg: Multi-resolution architecture for symmetric, inverse consistent,\n and topology preserving image registration","summary":" Deep learning has emerged as a strong alternative for classical iterative\nmethods for deformable medical image registration, where the goal is to find a\nmapping between the coordinate systems of two images. Popular classical image\nregistration methods enforce the useful inductive biases of symmetricity,\ninverse consistency, and topology preservation by construction. 
However, while\nmany deep learning registration methods encourage these properties via loss\nfunctions, no earlier methods enforce all of them by construction. Here, we\npropose a novel registration architecture based on extracting multi-resolution\nfeature representations which is by construction symmetric, inverse consistent,\nand topology preserving. We also develop an implicit layer for memory efficient\ninversion of the deformation fields. Our method achieves state-of-the-art\nregistration accuracy on three datasets. The code is available at\nhttps://github.com/honkamj/SITReg.\n","authors":["Joel Honkamaa","Pekka Marttinen"],"pdf_url":"https://arxiv.org/pdf/2303.10211v5.pdf","comment":"Accepted for publication at the Journal of Machine Learning for\n Biomedical Imaging (MELBA) https://melba-journal.org/2024:026"},{"id":"http://arxiv.org/abs/2408.10845v2","updated":"2024-12-02T08:31:40Z","published":"2024-08-19T09:53:49Z","title":"CoVLA: Comprehensive Vision-Language-Action Dataset for Autonomous\n Driving","summary":" Autonomous driving, particularly navigating complex and unanticipated\nscenarios, demands sophisticated reasoning and planning capabilities. While\nMulti-modal Large Language Models (MLLMs) offer a promising avenue for this,\ntheir use has been largely confined to understanding complex environmental\ncontexts or generating high-level driving commands, with few studies extending\ntheir application to end-to-end path planning. A major research bottleneck is\nthe lack of large-scale annotated datasets encompassing vision, language, and\naction. To address this issue, we propose CoVLA (Comprehensive\nVision-Language-Action) Dataset, an extensive dataset comprising real-world\ndriving videos spanning more than 80 hours. 
This dataset leverages a novel,\nscalable approach based on automated data processing and a caption generation\npipeline to generate accurate driving trajectories paired with detailed natural\nlanguage descriptions of driving environments and maneuvers. This approach\nutilizes raw in-vehicle sensor data, allowing it to surpass existing datasets\nin scale and annotation richness. Using CoVLA, we investigate the driving\ncapabilities of MLLMs that can handle vision, language, and action in a variety\nof driving scenarios. Our results illustrate the strong proficiency of our\nmodel in generating coherent language and action outputs, emphasizing the\npotential of Vision-Language-Action (VLA) models in the field of autonomous\ndriving. This dataset establishes a framework for robust, interpretable, and\ndata-driven autonomous driving systems by providing a comprehensive platform\nfor training and evaluating VLA models, contributing to safer and more reliable\nself-driving vehicles. The dataset is released for academic purpose.\n","authors":["Hidehisa Arai","Keita Miwa","Kento Sasaki","Yu Yamaguchi","Kohei Watanabe","Shunsuke Aoki","Issei Yamamoto"],"pdf_url":"https://arxiv.org/pdf/2408.10845v2.pdf","comment":"WACV 2025, Project Page: https://turingmotors.github.io/covla-ad/"},{"id":"http://arxiv.org/abs/2411.12787v2","updated":"2024-12-02T07:41:38Z","published":"2024-11-19T11:03:09Z","title":"Visual Cue Enhancement and Dual Low-Rank Adaptation for Efficient Visual\n Instruction Fine-Tuning","summary":" Parameter-efficient fine-tuning multimodal large language models (MLLMs)\npresents significant challenges, including reliance on high-level visual\nfeatures that limit fine-grained detail comprehension, and data conflicts that\narise from task complexity. To address these issues, we propose an efficient\nfine-tuning framework with two novel approaches: Vision Cue Enhancement (VCE)\nand Dual Low-Rank Adaptation (Dual-LoRA). 
VCE enhances the vision projector by\nintegrating multi-level visual cues, improving the model's ability to capture\nfine-grained visual features. Dual-LoRA introduces a dual low-rank structure\nfor instruction tuning, decoupling learning into skill and task spaces to\nenable precise control and efficient adaptation across diverse tasks. Our\nmethod simplifies implementation, enhances visual comprehension, and improves\nadaptability. Experiments on both downstream tasks and general benchmarks\ndemonstrate the effectiveness of our proposed approach.\n","authors":["Pengkun Jiao","Bin Zhu","Jingjing Chen","Chong-Wah Ngo","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2411.12787v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12027v3","updated":"2024-12-02T07:22:40Z","published":"2024-03-18T17:57:09Z","title":"From Pixels to Insights: A Survey on Automatic Chart Understanding in\n the Era of Large Foundation Models","summary":" Data visualization in the form of charts plays a pivotal role in data\nanalysis, offering critical insights and aiding in informed decision-making.\nAutomatic chart understanding has witnessed significant advancements with the\nrise of large foundation models in recent years. Foundation models, such as\nlarge language models, have revolutionized various natural language processing\ntasks and are increasingly being applied to chart understanding tasks. This\nsurvey paper provides a comprehensive overview of the recent developments,\nchallenges, and future directions in chart understanding within the context of\nthese foundation models. We review fundamental building blocks crucial for\nstudying chart understanding tasks. Additionally, we explore various tasks and\ntheir evaluation metrics and sources of both charts and textual inputs. 
Various\nmodeling strategies are then examined, encompassing both classification-based\nand generation-based approaches, along with tool augmentation techniques that\nenhance chart understanding performance. Furthermore, we discuss the\nstate-of-the-art performance of each task and discuss how we can improve the\nperformance. Challenges and future directions are addressed, highlighting the\nimportance of several topics, such as domain-specific charts, lack of efforts\nin developing evaluation metrics, and agent-oriented settings. This survey\npaper serves as a comprehensive resource for researchers and practitioners in\nthe fields of natural language processing, computer vision, and data analysis,\nproviding valuable insights and directions for future research in chart\nunderstanding leveraging large foundation models. The studies mentioned in this\npaper, along with emerging new research, will be continually updated at:\nhttps://github.com/khuangaf/Awesome-Chart-Understanding.\n","authors":["Kung-Hsiang Huang","Hou Pong Chan","Yi R. Fung","Haoyi Qiu","Mingyang Zhou","Shafiq Joty","Shih-Fu Chang","Heng Ji"],"pdf_url":"https://arxiv.org/pdf/2403.12027v3.pdf","comment":"IEEE Transactions on Knowledge and Data Engineering (TKDE)"},{"id":"http://arxiv.org/abs/2411.16316v3","updated":"2024-12-02T07:05:29Z","published":"2024-11-25T12:09:43Z","title":"Monocular Lane Detection Based on Deep Learning: A Survey","summary":" Lane detection plays an important role in autonomous driving perception\nsystems. As deep learning algorithms gain popularity, monocular lane detection\nmethods based on them have demonstrated superior performance and emerged as a\nkey research direction in autonomous driving perception. 
The core designs of\nthese algorithmic frameworks can be summarized as follows: (1) Task paradigm,\nfocusing on lane instance-level discrimination; (2) Lane modeling, representing\nlanes as a set of learnable parameters in the neural network; (3) Global\ncontext supplementation, enhancing inference on the obscure lanes; (4)\nPerspective effect elimination, providing accurate 3D lanes for downstream\napplications. From these perspectives, this paper presents a comprehensive\noverview of existing methods, encompassing both the increasingly mature 2D lane\ndetection approaches and the developing 3D lane detection works. Besides, this\npaper compares the performance of mainstream methods on different benchmarks\nand investigates their inference speed under a unified setting for fair\ncomparison. Moreover, we present some extended works on lane detection,\nincluding multi-task perception, video lane detection, online high-definition\nmap construction, and lane topology reasoning, to offer readers a comprehensive\nroadmap for the evolution of lane detection. Finally, we point out some\npotential future research directions in this field. We exhaustively collect the\npapers and codes of existing works at\nhttps://github.com/Core9724/Awesome-Lane-Detection and will keep tracing the\nresearch.\n","authors":["Xin He","Haiyun Guo","Kuan Zhu","Bingke Zhu","Xu Zhao","Jianwu Fang","Jinqiao Wang"],"pdf_url":"https://arxiv.org/pdf/2411.16316v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.18363v2","updated":"2024-12-02T07:04:40Z","published":"2024-11-27T14:11:10Z","title":"ChatRex: Taming Multimodal LLM for Joint Perception and Understanding","summary":" Perception and understanding are two pillars of computer vision. While\nmultimodal large language models (MLLM) have demonstrated remarkable visual\nunderstanding capabilities, they arguably lack accurate perception abilities,\ne.g. 
the stage-of-the-art model Qwen2-VL only achieves a 43.9 recall rate on\nthe COCO dataset, limiting many tasks requiring the combination of perception\nand understanding. In this work, we aim to bridge this perception gap from both\nmodel designing and data development perspectives. We first introduce ChatRex,\nan MLLM with a decoupled perception design. Instead of having the LLM directly\npredict box coordinates, we feed the output boxes from a universal proposal\nnetwork into the LLM, allowing it to output the corresponding box indices to\nrepresent its detection results, turning the regression task into a\nretrieval-based task that LLM handles more proficiently. From the data\nperspective, we build a fully automated data engine and construct the\nRexverse-2M dataset which possesses multiple granularities to support the joint\ntraining of perception and understanding. After standard two-stage training,\nChatRex demonstrates strong perception capabilities while preserving multimodal\nunderstanding performance. The combination of these two capabilities\nsimultaneously unlocks many attractive applications, demonstrating the\ncomplementary roles of both perception and understanding in MLLM. Code is\navailable at \\url{https://github.com/IDEA-Research/ChatRex}.\n","authors":["Qing Jiang","Gen Luo","Yuqin Yang","Yuda Xiong","Yihao Chen","Zhaoyang Zeng","Tianhe Ren","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.18363v2.pdf","comment":"35 pages, 19 figures"},{"id":"http://arxiv.org/abs/2411.19951v2","updated":"2024-12-02T06:54:47Z","published":"2024-11-29T18:59:54Z","title":"T2Vid: Translating Long Text into Multi-Image is the Catalyst for\n Video-LLMs","summary":" The success of Multimodal Large Language Models (MLLMs) in the image domain\nhas garnered wide attention from the research community. Drawing on previous\nsuccessful experiences, researchers have recently explored extending the\nsuccess to the video understanding realms. 
Apart from training from scratch, an\nefficient way is to utilize the pre-trained image-LLMs, leading to two\nmainstream approaches, i.e. zero-shot inference and further fine-tuning with\nvideo data. In this work, our study of these approaches harvests an effective\ndata augmentation method. We first make a deeper inspection of the zero-shot\ninference way and identify two limitations, i.e. limited generalization and\nlack of temporal understanding capabilities. Thus, we further investigate the\nfine-tuning approach and find a low learning efficiency when simply using all\nthe video data samples, which can be attributed to a lack of instruction\ndiversity. Aiming at this issue, we develop a method called T2Vid to synthesize\nvideo-like samples to enrich the instruction diversity in the training corpus.\nIntegrating these data enables a simple and efficient training scheme, which\nachieves performance comparable to or even superior to using full video\ndatasets by training with just 15% the sample size. Meanwhile, we find that the\nproposed scheme can boost the performance of long video understanding without\ntraining with long video samples. 
We hope our study will spark more thinking\nabout using MLLMs for video understanding and curation of high-quality data.\nThe code is released at https://github.com/xjtupanda/T2Vid.\n","authors":["Shukang Yin","Chaoyou Fu","Sirui Zhao","Yunhang Shen","Chunjiang Ge","Yan Yang","Zuwei Long","Yuhan Dai","Tong Xu","Xing Sun","Ran He","Caifeng Shan","Enhong Chen"],"pdf_url":"https://arxiv.org/pdf/2411.19951v2.pdf","comment":"Project page: https://github.com/xjtupanda/T2Vid"},{"id":"http://arxiv.org/abs/2311.06394v3","updated":"2024-12-02T06:22:54Z","published":"2023-11-10T20:50:36Z","title":"A design of Convolutional Neural Network model for the Diagnosis of the\n COVID-19","summary":" With the spread of COVID-19 around the globe over the past year, the usage of\nartificial intelligence (AI) algorithms and image processing methods to analyze\nthe X-ray images of patients' chest with COVID-19 has become essential. The\nCOVID-19 virus recognition in the lung area of a patient is one of the basic\nand essential needs of clicical centers and hospitals. Most research in this\nfield has been devoted to papers on the basis of deep learning methods\nutilizing CNNs (Convolutional Neural Network), which mainly deal with the\nscreening of sick and healthy people.In this study, a new structure of a\n19-layer CNN has been recommended for accurately recognition of the COVID-19\nfrom the X-ray pictures of chest. The offered CNN is developed to serve as a\nprecise diagnosis system for a three class (viral pneumonia, Normal, COVID) and\na four classclassification (Lung opacity, Normal, COVID-19, and pneumonia). A\ncomparison is conducted among the outcomes of the offered procedure and some\npopular pretrained networks, including Inception, Alexnet, ResNet50,\nSqueezenet, and VGG19 and based on Specificity, Accuracy, Precision,\nSensitivity, Confusion Matrix, and F1-score. The experimental results of the\noffered CNN method specify its dominance over the existing published\nprocedures. 
This method can be a useful tool for clinicians in deciding\nproperly about COVID-19.\n","authors":["Xinyuan Song"],"pdf_url":"https://arxiv.org/pdf/2311.06394v3.pdf","comment":"Important mistakes. Also, another author has contributed some to the\n revised version. So it is not appropriate for it to be with only my name"},{"id":"http://arxiv.org/abs/2411.11477v2","updated":"2024-12-02T05:58:49Z","published":"2024-11-18T11:26:11Z","title":"SL-YOLO: A Stronger and Lighter Drone Target Detection Model","summary":" Detecting small objects in complex scenes, such as those captured by drones,\nis a daunting challenge due to the difficulty in capturing the complex features\nof small targets. While the YOLO family has achieved great success in large\ntarget detection, its performance is less than satisfactory when faced with\nsmall targets. Because of this, this paper proposes a revolutionary model\nSL-YOLO (Stronger and Lighter YOLO) that aims to break the bottleneck of small\ntarget detection. We propose the Hierarchical Extended Path Aggregation Network\n(HEPAN), a pioneering cross-scale feature fusion method that can ensure\nunparalleled detection accuracy even in the most challenging environments. At\nthe same time, without sacrificing detection capabilities, we design the C2fDCB\nlightweight module and add the SCDown downsampling module to greatly reduce the\nmodel's parameters and computational complexity. Our experimental results on\nthe VisDrone2019 dataset reveal a significant improvement in performance, with\nmAP@0.5 jumping from 43.0% to 46.9% and mAP@0.5:0.95 increasing from 26.0% to\n28.9%. 
At the same time, the model parameters are reduced from 11.1M to 9.6M,\nand the FPS can reach 132, making it an ideal solution for real-time small\nobject detection in resource-constrained environments.\n","authors":["Defan Chen","Luchan Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.11477v2.pdf","comment":"We are withdrawing this submission to incorporate substantial updates\n and improvements to the manuscript, including additional data and analysis"},{"id":"http://arxiv.org/abs/2411.19454v2","updated":"2024-12-02T05:47:15Z","published":"2024-11-29T03:54:54Z","title":"GausSurf: Geometry-Guided 3D Gaussian Splatting for Surface\n Reconstruction","summary":" 3D Gaussian Splatting has achieved impressive performance in novel view\nsynthesis with real-time rendering capabilities. However, reconstructing\nhigh-quality surfaces with fine details using 3D Gaussians remains a\nchallenging task. In this work, we introduce GausSurf, a novel approach to\nhigh-quality surface reconstruction by employing geometry guidance from\nmulti-view consistency in texture-rich areas and normal priors in texture-less\nareas of a scene. We observe that a scene can be mainly divided into two\nprimary regions: 1) texture-rich and 2) texture-less areas. To enforce\nmulti-view consistency at texture-rich areas, we enhance the reconstruction\nquality by incorporating a traditional patch-match based Multi-View Stereo\n(MVS) approach to guide the geometry optimization in an iterative scheme. This\nscheme allows for mutual reinforcement between the optimization of Gaussians\nand patch-match refinement, which significantly improves the reconstruction\nresults and accelerates the training process. Meanwhile, for the texture-less\nareas, we leverage normal priors from a pre-trained normal estimation model to\nguide optimization. 
Extensive experiments on the DTU and Tanks and Temples\ndatasets demonstrate that our method surpasses state-of-the-art methods in\nterms of reconstruction quality and computation time.\n","authors":["Jiepeng Wang","Yuan Liu","Peng Wang","Cheng Lin","Junhui Hou","Xin Li","Taku Komura","Wenping Wang"],"pdf_url":"https://arxiv.org/pdf/2411.19454v2.pdf","comment":"Project page: https://jiepengwang.github.io/GausSurf/"},{"id":"http://arxiv.org/abs/2411.10831v2","updated":"2024-12-02T05:36:39Z","published":"2024-11-16T16:24:28Z","title":"Neighboring Slice Noise2Noise: Self-Supervised Medical Image Denoising\n from Single Noisy Image Volume","summary":" In the last few years, with the rapid development of deep learning\ntechnologies, supervised methods based on convolutional neural networks have\ngreatly enhanced the performance of medical image denoising. However, these\nmethods require large quantities of noisy-clean image pairs for training, which\ngreatly limits their practicality. Although some researchers have attempted to\ntrain denoising networks using only single noisy images, existing\nself-supervised methods, including blind-spot-based and data-splitting-based\nmethods, heavily rely on the assumption that noise is pixel-wise independent.\nHowever, this assumption often does not hold in real-world medical images.\nTherefore, in the field of medical imaging, there remains a lack of simple and\npractical denoising methods that can achieve high-quality denoising performance\nusing only single noisy images. In this paper, we propose a novel\nself-supervised medical image denoising method, Neighboring Slice Noise2Noise\n(NS-N2N). The proposed method utilizes neighboring slices within a single noisy\nimage volume to construct weighted training data, and then trains the denoising\nnetwork using a self-supervised scheme with regional consistency loss and\ninter-slice continuity loss. 
NS-N2N only requires a single noisy image volume\nobtained from one medical imaging procedure to achieve high-quality denoising\nof the image volume itself. Extensive experiments demonstrate that the proposed\nmethod outperforms state-of-the-art self-supervised denoising methods in both\ndenoising performance and processing efficiency. Furthermore, since NS-N2N\noperates solely in the image domain, it is free from device-specific issues\nsuch as reconstruction geometry, making it easier to apply in various clinical\npractices.\n","authors":["Langrui Zhou","Ziteng Zhou","Xinyu Huang","Xiangyu Zhang","Huiru Wang","Guang Li"],"pdf_url":"https://arxiv.org/pdf/2411.10831v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.09806v2","updated":"2024-12-02T05:18:11Z","published":"2024-07-13T08:52:44Z","title":"Asynchronous Feedback Network for Perceptual Point Cloud Quality\n Assessment","summary":" Recent years have witnessed the success of the deep learning-based technique\nin research of no-reference point cloud quality assessment (NR-PCQA). For a\nmore accurate quality prediction, many previous studies have attempted to\ncapture global and local features in a bottom-up manner, but ignored the\ninteraction and promotion between them. To solve this problem, we propose a\nnovel asynchronous feedback quality prediction network (AFQ-Net). Motivated by\nhuman visual perception mechanisms, AFQ-Net employs a dual-branch structure to\ndeal with global and local features, simulating the left and right hemispheres\nof the human brain, and constructs a feedback module between them.\nSpecifically, the input point clouds are first fed into a transformer-based\nglobal encoder to generate the attention maps that highlight these semantically\nrich regions, followed by being merged into the global feature. Then, we\nutilize the generated attention maps to perform dynamic convolution for\ndifferent semantic regions and obtain the local feature. 
Finally, a\ncoarse-to-fine strategy is adopted to merge the two features into the final\nquality score. We conduct comprehensive experiments on three datasets and\nachieve superior performance over the state-of-the-art approaches on all of\nthese datasets. The code will be available at The code will be available at\nhttps://github.com/zhangyujie-1998/AFQ-Net.\n","authors":["Yujie Zhang","Qi Yang","Ziyu Shan","Yiling Xu"],"pdf_url":"https://arxiv.org/pdf/2407.09806v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.15523v4","updated":"2024-12-02T05:09:06Z","published":"2023-05-24T19:20:59Z","title":"Task-aware Distributed Source Coding under Dynamic Bandwidth","summary":" Efficient compression of correlated data is essential to minimize\ncommunication overload in multi-sensor networks. In such networks, each sensor\nindependently compresses the data and transmits them to a central node due to\nlimited communication bandwidth. A decoder at the central node decompresses and\npasses the data to a pre-trained machine learning-based task to generate the\nfinal output. Thus, it is important to compress the features that are relevant\nto the task. Additionally, the final performance depends heavily on the total\navailable bandwidth. In practice, it is common to encounter varying\navailability in bandwidth, and higher bandwidth results in better performance\nof the task. We design a novel distributed compression framework composed of\nindependent encoders and a joint decoder, which we call neural distributed\nprincipal component analysis (NDPCA). NDPCA flexibly compresses data from\nmultiple sources to any available bandwidth with a single model, reducing\ncomputing and storage overhead. NDPCA achieves this by learning low-rank task\nrepresentations and efficiently distributing bandwidth among sensors, thus\nproviding a graceful trade-off between performance and bandwidth. 
Experiments\nshow that NDPCA improves the success rate of multi-view robotic arm\nmanipulation by 9% and the accuracy of object detection tasks on satellite\nimagery by 14% compared to an autoencoder with uniform bandwidth allocation.\n","authors":["Po-han Li","Sravan Kumar Ankireddy","Ruihan Zhao","Hossein Nourkhiz Mahjoub","Ehsan Moradi-Pari","Ufuk Topcu","Sandeep Chinchali","Hyeji Kim"],"pdf_url":"https://arxiv.org/pdf/2305.15523v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.18203v2","updated":"2024-12-02T05:00:19Z","published":"2024-11-27T10:28:57Z","title":"Critic-V: VLM Critics Help Catch VLM Errors in Multimodal Reasoning","summary":" Vision-language models (VLMs) have shown remarkable advancements in\nmultimodal reasoning tasks. However, they still often generate inaccurate or\nirrelevant responses due to issues like hallucinated image understandings or\nunrefined reasoning paths. To address these challenges, we introduce Critic-V,\na novel framework inspired by the Actor-Critic paradigm to boost the reasoning\ncapability of VLMs. This framework decouples the reasoning process and critic\nprocess by integrating two independent components: the Reasoner, which\ngenerates reasoning paths based on visual and textual inputs, and the Critic,\nwhich provides constructive critique to refine these paths. In this approach,\nthe Reasoner generates reasoning responses according to text prompts, which can\nevolve iteratively as a policy based on feedback from the Critic. This\ninteraction process was theoretically driven by a reinforcement learning\nframework where the Critic offers natural language critiques instead of scalar\nrewards, enabling more nuanced feedback to boost the Reasoner's capability on\ncomplex reasoning tasks. The Critic model is trained using Direct Preference\nOptimization (DPO), leveraging a preference dataset of critiques ranked by\nRule-based Reward~(RBR) to enhance its critic capabilities. 
Evaluation results\nshow that the Critic-V framework significantly outperforms existing methods,\nincluding GPT-4V, on 5 out of 8 benchmarks, especially regarding reasoning\naccuracy and efficiency. Combining a dynamic text-based policy for the Reasoner\nand constructive feedback from the preference-optimized Critic enables a more\nreliable and context-sensitive multimodal reasoning process. Our approach\nprovides a promising solution to enhance the reliability of VLMs, improving\ntheir performance in real-world reasoning-heavy multimodal applications such as\nautonomous driving and embodied intelligence.\n","authors":["Di Zhang","Junxian Li","Jingdi Lei","Xunzhi Wang","Yujie Liu","Zonglin Yang","Jiatong Li","Weida Wang","Suorong Yang","Jianbo Wu","Peng Ye","Wanli Ouyang","Dongzhan Zhou"],"pdf_url":"https://arxiv.org/pdf/2411.18203v2.pdf","comment":"16 pages, 11 figures"},{"id":"http://arxiv.org/abs/2411.18673v2","updated":"2024-12-02T04:43:30Z","published":"2024-11-27T18:49:13Z","title":"AC3D: Analyzing and Improving 3D Camera Control in Video Diffusion\n Transformers","summary":" Numerous works have recently integrated 3D camera control into foundational\ntext-to-video models, but the resulting camera control is often imprecise, and\nvideo generation quality suffers. In this work, we analyze camera motion from a\nfirst principles perspective, uncovering insights that enable precise 3D camera\nmanipulation without compromising synthesis quality. First, we determine that\nmotion induced by camera movements in videos is low-frequency in nature. This\nmotivates us to adjust train and test pose conditioning schedules, accelerating\ntraining convergence while improving visual and motion quality. Then, by\nprobing the representations of an unconditional video diffusion transformer, we\nobserve that they implicitly perform camera pose estimation under the hood, and\nonly a sub-portion of their layers contain the camera information. 
This\nsuggested us to limit the injection of camera conditioning to a subset of the\narchitecture to prevent interference with other video features, leading to 4x\nreduction of training parameters, improved training speed and 10% higher visual\nquality. Finally, we complement the typical dataset for camera control learning\nwith a curated dataset of 20K diverse dynamic videos with stationary cameras.\nThis helps the model disambiguate the difference between camera and scene\nmotion, and improves the dynamics of generated pose-conditioned videos. We\ncompound these findings to design the Advanced 3D Camera Control (AC3D)\narchitecture, the new state-of-the-art model for generative video modeling with\ncamera control.\n","authors":["Sherwin Bahmani","Ivan Skorokhodov","Guocheng Qian","Aliaksandr Siarohin","Willi Menapace","Andrea Tagliasacchi","David B. Lindell","Sergey Tulyakov"],"pdf_url":"https://arxiv.org/pdf/2411.18673v2.pdf","comment":"Project Page: https://snap-research.github.io/ac3d/"},{"id":"http://arxiv.org/abs/2410.10269v2","updated":"2024-12-02T04:27:39Z","published":"2024-10-14T08:21:08Z","title":"Two-Stage Approach for Brain MR Image Synthesis: 2D Image Synthesis and\n 3D Refinement","summary":" Despite significant advancements in automatic brain tumor segmentation\nmethods, their performance is not guaranteed when certain MR sequences are\nmissing. Addressing this issue, it is crucial to synthesize the missing MR\nimages that reflect the unique characteristics of the absent modality with\nprecise tumor representation. Typically, MRI synthesis methods generate partial\nimages rather than full-sized volumes due to computational constraints. This\nlimitation can lead to a lack of comprehensive 3D volumetric information and\nresult in image artifacts during the merging process. In this paper, we propose\na two-stage approach that first synthesizes MR images from 2D slices using a\nnovel intensity encoding method and then refines the synthesized MRI. 
The\nproposed intensity encoding reduces artifacts when synthesizing MRI on a 2D\nslice basis. Then, the \\textit{Refiner}, which leverages complete 3D volume\ninformation, further improves the quality of the synthesized images and\nenhances their applicability to segmentation methods. Experimental results\ndemonstrate that the intensity encoding effectively minimizes artifacts in the\nsynthesized MRI and improves perceptual quality. Furthermore, using the\n\\textit{Refiner} on synthesized MRI significantly improves brain tumor\nsegmentation results, highlighting the potential of our approach in practical\napplications.\n","authors":["Jihoon Cho","Seunghyuck Park","Jinah Park"],"pdf_url":"https://arxiv.org/pdf/2410.10269v2.pdf","comment":"MICCAI 2024 BraSyn Challenge 1st place"},{"id":"http://arxiv.org/abs/2411.16752v2","updated":"2024-12-02T04:12:35Z","published":"2024-11-24T05:27:21Z","title":"Imagine and Seek: Improving Composed Image Retrieval with an Imagined\n Proxy","summary":" The Zero-shot Composed Image Retrieval (ZSCIR) requires retrieving images\nthat match the query image and the relative captions. Current methods focus on\nprojecting the query image into the text feature space, subsequently combining\nthem with features of query texts for retrieval. However, retrieving images\nonly with the text features cannot guarantee detailed alignment due to the\nnatural gap between images and text. In this paper, we introduce Imagined Proxy\nfor CIR (IP-CIR), a training-free method that creates a proxy image aligned\nwith the query image and text description, enhancing query representation in\nthe retrieval process. We first leverage the large language model's\ngeneralization capability to generate an image layout, and then apply both the\nquery text and image for conditional generation. The robust query features are\nenhanced by merging the proxy image, query image, and text semantic\nperturbation. 
Our newly proposed balancing metric integrates text-based and\nproxy retrieval similarities, allowing for more accurate retrieval of the\ntarget image while incorporating image-side information into the process.\nExperiments on three public datasets demonstrate that our method significantly\nimproves retrieval performances. We achieve state-of-the-art (SOTA) results on\nthe CIRR dataset with a Recall@K of 70.07 at K=10. Additionally, we achieved an\nimprovement in Recall@10 on the FashionIQ dataset, rising from 45.11 to 45.74,\nand improved the baseline performance in CIRCO with a mAPK@10 score, increasing\nfrom 32.24 to 34.26.\n","authors":["You Li","Fan Ma","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2411.16752v2.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2309.10625v4","updated":"2024-12-02T04:10:31Z","published":"2023-09-19T14:04:04Z","title":"NoisyNN: Exploring the Impact of Information Entropy Change in Learning\n Systems","summary":" We investigate the impact of entropy change in deep learning systems by noise\ninjection at different levels, including the embedding space and the image. The\nseries of models that employ our methodology are collectively known as Noisy\nNeural Networks (NoisyNN), with examples such as NoisyViT and NoisyCNN. Noise\nis conventionally viewed as a harmful perturbation in various deep learning\narchitectures, such as convolutional neural networks (CNNs) and vision\ntransformers (ViTs), as well as different learning tasks like image\nclassification and transfer learning. However, this work shows noise can be an\neffective way to change the entropy of the learning system. We demonstrate that\nspecific noise can boost the performance of various deep models under certain\nconditions. We theoretically prove the enhancement gained from positive noise\nby reducing the task complexity defined by information entropy and\nexperimentally show the significant performance gain in large image datasets,\nsuch as the ImageNet. 
Herein, we use the information entropy to define the\ncomplexity of the task. We categorize the noise into two types, positive noise\n(PN) and harmful noise (HN), based on whether the noise can help reduce the\ntask complexity. Extensive experiments of CNNs and ViTs have shown performance\nimprovements by proactively injecting positive noise, where we achieved an\nunprecedented top 1 accuracy of 95$\\%$ on ImageNet. Both theoretical analysis\nand empirical evidence have confirmed that the presence of positive noise, can\nbenefit the learning process, while the traditionally perceived harmful noise\nindeed impairs deep learning models. The different roles of noise offer new\nexplanations for deep models on specific tasks and provide a new paradigm for\nimproving model performance. Moreover, it reminds us that we can influence the\nperformance of learning systems via information entropy change.\n","authors":["Xiaowei Yu","Zhe Huang","Minheng Chen","Yao Xue","Tianming Liu","Dajiang Zhu"],"pdf_url":"https://arxiv.org/pdf/2309.10625v4.pdf","comment":"Task Entropy, NoisyViT, NoisyCNN"},{"id":"http://arxiv.org/abs/2404.13873v4","updated":"2024-12-02T04:06:47Z","published":"2024-04-22T04:47:52Z","title":"Texture, Shape and Order Matter: A New Transformer Design for Sequential\n DeepFake Detection","summary":" Sequential DeepFake detection is an emerging task that predicts the\nmanipulation sequence in order. Existing methods typically formulate it as an\nimage-to-sequence problem, employing conventional Transformer architectures.\nHowever, these methods lack dedicated design and consequently result in limited\nperformance. As such, this paper describes a new Transformer design, called\nTSOM, by exploring three perspectives: Texture, Shape, and Order of\nManipulations. 
Our method features four major improvements: \\ding{182} we\ndescribe a new texture-aware branch that effectively captures subtle\nmanipulation traces with a Diversiform Pixel Difference Attention module.\n\\ding{183} Then we introduce a Multi-source Cross-attention module to seek deep\ncorrelations among spatial and sequential features, enabling effective modeling\nof complex manipulation traces. \\ding{184} To further enhance the\ncross-attention, we describe a Shape-guided Gaussian mapping strategy,\nproviding initial priors of the manipulation shape. \\ding{185} Finally,\nobserving that the subsequent manipulation in a sequence may influence traces\nleft in the preceding one, we intriguingly invert the prediction order from\nforward to backward, leading to notable gains as expected. Extensive\nexperimental results demonstrate that our method outperforms others by a large\nmargin, highlighting the superiority of our method.\n","authors":["Yunfei Li","Yuezun Li","Xin Wang","Baoyuan Wu","Jiaran Zhou","Junyu Dong"],"pdf_url":"https://arxiv.org/pdf/2404.13873v4.pdf","comment":"WACV 2025"},{"id":"http://arxiv.org/abs/2411.16749v2","updated":"2024-12-02T04:05:34Z","published":"2024-11-24T04:49:07Z","title":"AnySynth: Harnessing the Power of Image Synthetic Data Generation for\n Generalized Vision-Language Tasks","summary":" Diffusion models have recently been employed to generate high-quality images,\nreducing the need for manual data collection and improving model generalization\nin tasks such as object detection, instance segmentation, and image perception.\nHowever, the synthetic framework is usually designed with meticulous human\neffort for each task due to various requirements on image layout, content, and\nannotation formats, restricting the application of synthetic data on more\ngeneral scenarios. 
In this paper, we propose AnySynth, a unified framework\nintegrating adaptable, comprehensive, and highly controllable components\ncapable of generating an arbitrary type of synthetic data given diverse\nrequirements. Specifically, the Task-Specific Layout Generation Module is first\nintroduced to produce reasonable layouts for different tasks by leveraging the\ngeneration ability of large language models and layout priors of real-world\nimages. A Uni-Controlled Image Generation Module is then developed to create\nhigh-quality synthetic images that are controllable and based on the generated\nlayouts. In addition, user specific reference images, and style images can be\nincorporated into the generation to task requirements. Finally, the\nTask-Oriented Annotation Module offers precise and detailed annotations for the\ngenerated images across different tasks. We have validated our framework's\nperformance across various tasks, including Few-shot Object Detection,\nCross-domain Object Detection, Zero-shot Composed Image Retrieval, and\nMulti-modal Image Perception and Grounding. The specific data synthesized by\nour framework significantly improves model performance in these tasks,\ndemonstrating the generality and effectiveness of our framework.\n","authors":["You Li","Fan Ma","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2411.16749v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.13800v5","updated":"2024-12-02T03:55:49Z","published":"2023-03-24T04:45:45Z","title":"Aligning Step-by-Step Instructional Diagrams to Video Demonstrations","summary":" Multimodal alignment facilitates the retrieval of instances from one modality\nwhen queried using another. In this paper, we consider a novel setting where\nsuch an alignment is between (i) instruction steps that are depicted as\nassembly diagrams (commonly seen in Ikea assembly manuals) and (ii) video\nsegments from in-the-wild videos; these videos comprising an enactment of the\nassembly actions in the real world. 
To learn this alignment, we introduce a\nnovel supervised contrastive learning method that learns to align videos with\nthe subtle details in the assembly diagrams, guided by a set of novel losses.\nTo study this problem and demonstrate the effectiveness of our method, we\nintroduce a novel dataset: IAW for Ikea assembly in the wild consisting of 183\nhours of videos from diverse furniture assembly collections and nearly 8,300\nillustrations from their associated instruction manuals and annotated for their\nground truth alignments. We define two tasks on this dataset: First, nearest\nneighbor retrieval between video segments and illustrations, and, second,\nalignment of instruction steps and the segments for each video. Extensive\nexperiments on IAW demonstrate superior performances of our approach against\nalternatives.\n","authors":["Jiahao Zhang","Anoop Cherian","Yanbin Liu","Yizhak Ben-Shabat","Cristian Rodriguez","Stephen Gould"],"pdf_url":"https://arxiv.org/pdf/2303.13800v5.pdf","comment":"Accepted to CVPR'23"},{"id":"http://arxiv.org/abs/2411.00769v2","updated":"2024-12-02T03:54:23Z","published":"2024-11-01T17:59:17Z","title":"GameGen-X: Interactive Open-world Game Video Generation","summary":" We introduce GameGen-X, the first diffusion transformer model specifically\ndesigned for both generating and interactively controlling open-world game\nvideos. This model facilitates high-quality, open-domain generation by\nsimulating an extensive array of game engine features, such as innovative\ncharacters, dynamic environments, complex actions, and diverse events.\nAdditionally, it provides interactive controllability, predicting and altering\nfuture content based on the current clip, thus allowing for gameplay\nsimulation. To realize this vision, we first collected and built an Open-World\nVideo Game Dataset from scratch. 
It is the first and largest dataset for\nopen-world game video generation and control, which comprises over a million\ndiverse gameplay video clips sampling from over 150 games with informative\ncaptions from GPT-4o. GameGen-X undergoes a two-stage training process,\nconsisting of foundation model pre-training and instruction tuning. Firstly,\nthe model was pre-trained via text-to-video generation and video continuation,\nendowing it with the capability for long-sequence, high-quality open-domain\ngame video generation. Further, to achieve interactive controllability, we\ndesigned InstructNet to incorporate game-related multi-modal control signal\nexperts. This allows the model to adjust latent representations based on user\ninputs, unifying character interaction and scene content control for the first\ntime in video generation. During instruction tuning, only the InstructNet is\nupdated while the pre-trained foundation model is frozen, enabling the\nintegration of interactive controllability without loss of diversity and\nquality of generated video content.\n","authors":["Haoxuan Che","Xuanhua He","Quande Liu","Cheng Jin","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2411.00769v2.pdf","comment":"Homepage: https://gamegen-x.github.io/ Github:\n https://github.com/GameGen-X/GameGen-X"},{"id":"http://arxiv.org/abs/2407.12066v4","updated":"2024-12-02T03:39:10Z","published":"2024-07-16T05:44:30Z","title":"Temporally Grounding Instructional Diagrams in Unconstrained Videos","summary":" We study the challenging problem of simultaneously localizing a sequence of\nqueries in the form of instructional diagrams in a video. This requires\nunderstanding not only the individual queries but also their\ninterrelationships. However, most existing methods focus on grounding one query\nat a time, ignoring the inherent structures among queries such as the general\nmutual exclusiveness and the temporal order. 
Consequently, the predicted\ntimespans of different step diagrams may overlap considerably or violate the\ntemporal order, thus harming the accuracy. In this paper, we tackle this issue\nby simultaneously grounding a sequence of step diagrams. Specifically, we\npropose composite queries, constructed by exhaustively pairing up the visual\ncontent features of the step diagrams and a fixed number of learnable\npositional embeddings. Our insight is that self-attention among composite\nqueries carrying different content features suppress each other to reduce\ntimespan overlaps in predictions, while the cross-attention corrects the\ntemporal misalignment via content and position joint guidance. We demonstrate\nthe effectiveness of our approach on the IAW dataset for grounding step\ndiagrams and the YouCook2 benchmark for grounding natural language queries,\nsignificantly outperforming existing methods while simultaneously grounding\nmultiple queries.\n","authors":["Jiahao Zhang","Frederic Z. Zhang","Cristian Rodriguez","Yizhak Ben-Shabat","Anoop Cherian","Stephen Gould"],"pdf_url":"https://arxiv.org/pdf/2407.12066v4.pdf","comment":"Accepted to WACV'25"},{"id":"http://arxiv.org/abs/2411.19527v2","updated":"2024-12-02T03:34:45Z","published":"2024-11-29T07:54:56Z","title":"DisCoRD: Discrete Tokens to Continuous Motion via Rectified Flow\n Decoding","summary":" Human motion, inherently continuous and dynamic, presents significant\nchallenges for generative models. Despite their dominance, discrete\nquantization methods, such as VQ-VAEs, suffer from inherent limitations,\nincluding restricted expressiveness and frame-wise noise artifacts. Continuous\napproaches, while producing smoother and more natural motions, often falter due\nto high-dimensional complexity and limited training data. 
To resolve this\n\"discord\" between discrete and continuous representations, we introduce\nDisCoRD: Discrete Tokens to Continuous Motion via Rectified Flow Decoding, a\nnovel method that decodes discrete motion tokens into continuous motion through\nrectified flow. By employing an iterative refinement process in the continuous\nspace, DisCoRD captures fine-grained dynamics and ensures smoother and more\nnatural motions. Compatible with any discrete-based framework, our method\nenhances naturalness without compromising faithfulness to the conditioning\nsignals. Extensive evaluations demonstrate that DisCoRD achieves\nstate-of-the-art performance, with FID of 0.032 on HumanML3D and 0.169 on\nKIT-ML. These results solidify DisCoRD as a robust solution for bridging the\ndivide between discrete efficiency and continuous realism. Our project page is\navailable at: https://whwjdqls.github.io/discord.github.io/.\n","authors":["Jungbin Cho","Junwan Kim","Jisoo Kim","Minseo Kim","Mingu Kang","Sungeun Hong","Tae-Hyun Oh","Youngjae Yu"],"pdf_url":"https://arxiv.org/pdf/2411.19527v2.pdf","comment":"20 pages 18 figures"},{"id":"http://arxiv.org/abs/2411.17606v2","updated":"2024-12-02T03:19:04Z","published":"2024-11-26T17:18:20Z","title":"HyperSeg: Towards Universal Visual Segmentation with Large Language\n Model","summary":" This paper aims to address universal segmentation for image and video\nperception with the strong reasoning ability empowered by Visual Large Language\nModels (VLLMs). Despite significant progress in current unified segmentation\nmethods, limitations in adaptation to both image and video scenarios, as well\nas the complex reasoning segmentation, make it difficult for them to handle\nvarious challenging instructions and achieve an accurate understanding of\nfine-grained vision-language correlations. 
We propose HyperSeg, the first\nVLLM-based universal segmentation model for pixel-level image and video\nperception, encompassing generic segmentation tasks and more complex reasoning\nperception tasks requiring powerful reasoning abilities and world knowledge.\nBesides, to fully leverage the recognition capabilities of VLLMs and the\nfine-grained visual information, HyperSeg incorporates hybrid entity\nrecognition and fine-grained visual perceiver modules for various segmentation\ntasks. Combined with the temporal adapter, HyperSeg achieves a comprehensive\nunderstanding of temporal information. Experimental results validate the\neffectiveness of our insights in resolving universal image and video\nsegmentation tasks, including the more complex reasoning perception tasks. Our\ncode is available.\n","authors":["Cong Wei","Yujie Zhong","Haoxian Tan","Yong Liu","Zheng Zhao","Jie Hu","Yujiu Yang"],"pdf_url":"https://arxiv.org/pdf/2411.17606v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.12307v7","updated":"2024-12-02T03:18:41Z","published":"2023-03-22T04:49:23Z","title":"Predicting and Enhancing the Fairness of DNNs with the Curvature of\n Perceptual Manifolds","summary":" To address the challenges of long-tailed classification, researchers have\nproposed several approaches to reduce model bias, most of which assume that\nclasses with few samples are weak classes. However, recent studies have shown\nthat tail classes are not always hard to learn, and model bias has been\nobserved on sample-balanced datasets, suggesting the existence of other factors\nthat affect model bias. In this work, we first establish a geometric\nperspective for analyzing model fairness and then systematically propose a\nseries of geometric measurements for perceptual manifolds in deep neural\nnetworks. 
Subsequently, we comprehensively explore the effect of the geometric\ncharacteristics of perceptual manifolds on classification difficulty and how\nlearning shapes the geometric characteristics of perceptual manifolds. An\nunanticipated finding is that the correlation between the class accuracy and\nthe separation degree of perceptual manifolds gradually decreases during\ntraining, while the negative correlation with the curvature gradually\nincreases, implying that curvature imbalance leads to model bias.Building upon\nthese observations, we propose curvature regularization to facilitate the model\nto learn curvature-balanced and flatter perceptual manifolds. Evaluations on\nmultiple long-tailed and non-long-tailed datasets show the excellent\nperformance and exciting generality of our approach, especially in achieving\nsignificant performance improvements based on current state-of-the-art\ntechniques. Our work opens up a geometric analysis perspective on model bias\nand reminds researchers to pay attention to model bias on non-long-tailed and\neven sample-balanced datasets.\n","authors":["Yanbiao Ma","Licheng Jiao","Fang Liu","Maoji Wen","Lingling Li","Wenping Ma","Shuyuan Yang","Xu Liu","Puhua Chen"],"pdf_url":"https://arxiv.org/pdf/2303.12307v7.pdf","comment":"17pages, Accepted by CVPR 2023, Submitted to TPAMI"},{"id":"http://arxiv.org/abs/2404.16323v2","updated":"2024-12-02T03:11:06Z","published":"2024-04-25T04:18:59Z","title":"LeanGaussian: Breaking Pixel or Point Cloud Correspondence in Modeling\n 3D Gaussians","summary":" Rencently, Gaussian splatting has demonstrated significant success in novel\nview synthesis. Current methods often regress Gaussians with pixel or point\ncloud correspondence, linking each Gaussian with a pixel or a 3D point. 
This\nleads to the redundancy of Gaussians being used to overfit the correspondence\nrather than the objects represented by the 3D Gaussians themselves,\nconsequently wasting resources and lacking accurate geometries or textures. In\nthis paper, we introduce LeanGaussian, a novel approach that treats each query\nin deformable Transformer as one 3D Gaussian ellipsoid, breaking the pixel or\npoint cloud correspondence constraints. We leverage deformable decoder to\niteratively refine the Gaussians layer-by-layer with the image features as keys\nand values. Notably, the center of each 3D Gaussian is defined as 3D reference\npoints, which are then projected onto the image for deformable attention in 2D\nspace. On both the ShapeNet SRN dataset (category level) and the Google Scanned\nObjects dataset (open-category level, trained with the Objaverse dataset), our\napproach, outperforms prior methods by approximately 6.1\\%, achieving a PSNR of\n25.44 and 22.36, respectively. Additionally, our method achieves a 3D\nreconstruction speed of 7.2 FPS and rendering speed 500 FPS. The code will be\nreleased at https://github.com/jwubz123/DIG3D.\n","authors":["Jiamin Wu","Kenkun Liu","Han Gao","Xiaoke Jiang","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.16323v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.13174v2","updated":"2024-12-02T02:55:29Z","published":"2024-01-24T01:41:26Z","title":"Towards Complementary Knowledge Distillation for Efficient Dense Image\n Prediction","summary":" It has been revealed that small efficient dense image prediction (EDIP)\nmodels, trained using the knowledge distillation (KD) framework, encounter two\nkey challenges, including maintaining boundary region completeness and\npreserving target region connectivity, despite their favorable capacity to\nrecognize main object regions. 
In this work, we propose a complementary\nboundary and context distillation (BCD) method within the KD framework for\nEDIPs, which facilitates the targeted knowledge transfer from large accurate\nteacher models to compact efficient student models. Specifically, the boundary\ndistillation component focuses on extracting explicit object-level semantic\nboundaries from the hierarchical feature maps of the backbone network to\nenhance the student model's mask quality in boundary regions. Concurrently, the\ncontext distillation component leverages self-relations as a bridge to transfer\nimplicit pixel-level contexts from the teacher model to the student model,\nensuring strong connectivity in target regions. Our proposed BCD method is\nspecifically designed for EDIP tasks and is characterized by its simplicity and\nefficiency. Extensive experimental results across semantic segmentation, object\ndetection, and instance segmentation on various representative datasets\ndemonstrate that our method can outperform existing methods without requiring\nextra supervisions or incurring increased inference costs, resulting in\nwell-defined object boundaries and smooth connecting regions.\n","authors":["Dong Zhang","Pingcheng Dong","Xinting Hu","Long Chen","Kwang-Ting Cheng"],"pdf_url":"https://arxiv.org/pdf/2401.13174v2.pdf","comment":"under submission"},{"id":"http://arxiv.org/abs/2407.12735v4","updated":"2024-12-02T02:34:47Z","published":"2024-07-17T16:55:42Z","title":"EchoSight: Advancing Visual-Language Models with Wiki Knowledge","summary":" Knowledge-based Visual Question Answering (KVQA) tasks require answering\nquestions about images using extensive background knowledge. Despite\nsignificant advancements, generative models often struggle with these tasks due\nto the limited integration of external knowledge. 
In this paper, we introduce\nEchoSight, a novel multimodal Retrieval-Augmented Generation (RAG) framework\nthat enables large language models (LLMs) to answer visual questions requiring\nfine-grained encyclopedic knowledge. To strive for high-performing retrieval,\nEchoSight first searches wiki articles by using visual-only information,\nsubsequently, these candidate articles are further reranked according to their\nrelevance to the combined text-image query. This approach significantly\nimproves the integration of multimodal knowledge, leading to enhanced retrieval\noutcomes and more accurate VQA responses. Our experimental results on the\nEncyclopedic VQA and InfoSeek datasets demonstrate that EchoSight establishes\nnew state-of-the-art results in knowledge-based VQA, achieving an accuracy of\n41.8% on Encyclopedic VQA and 31.3% on InfoSeek.\n","authors":["Yibin Yan","Weidi Xie"],"pdf_url":"https://arxiv.org/pdf/2407.12735v4.pdf","comment":"Accepted by EMNLP 2024 findings; Project Page:\n https://go2heart.github.io/echosight"},{"id":"http://arxiv.org/abs/2411.06071v2","updated":"2024-12-02T02:24:08Z","published":"2024-11-09T05:22:13Z","title":"GlocalCLIP: Object-agnostic Global-Local Prompt Learning for Zero-shot\n Anomaly Detection","summary":" Zero-shot anomaly detection (ZSAD) is crucial for detecting anomalous\npatterns in target datasets without using training samples, specifically in\nscenarios where there are distributional differences between the target domain\nand training data or where data scarcity arises because of restricted access.\nAlthough recently pretrained vision-language models demonstrate strong\nzero-shot performance across various visual tasks, they focus on learning class\nsemantics, which makes their direct application to ZSAD challenging. To address\nthis scenario, we propose GlocalCLIP, which uniquely separates global and local\nprompts and jointly optimizes them. 
This approach enables the object-agnostic\nglocal semantic prompt to effectively capture general normal and anomalous\npatterns without dependency on specific objects in the image. We refine the\ntext prompts for more precise adjustments by utilizing deep-text prompt tuning\nin the text encoder. In the vision encoder, we apply V-V attention layers to\ncapture detailed local image features. Finally, we introduce glocal contrastive\nlearning to improve the complementary learning of global and local prompts,\neffectively detecting anomalous patterns across various domains. The\ngeneralization performance of GlocalCLIP in ZSAD was demonstrated on 15\nreal-world datasets from both the industrial and medical domains, achieving\nsuperior performance compared to existing methods. Code will be made available\nat https://github.com/YUL-git/GlocalCLIP.\n","authors":["Jiyul Ham","Yonggon Jung","Jun-Geol Baek"],"pdf_url":"https://arxiv.org/pdf/2411.06071v2.pdf","comment":"29 pages, 36 figures"},{"id":"http://arxiv.org/abs/2411.19224v2","updated":"2024-12-02T02:04:53Z","published":"2024-11-28T15:49:08Z","title":"Differentiable Voxel-based X-ray Rendering Improves Sparse-View 3D CBCT\n Reconstruction","summary":" We present DiffVox, a self-supervised framework for Cone-Beam Computed\nTomography (CBCT) reconstruction by directly optimizing a voxelgrid\nrepresentation using physics-based differentiable X-ray rendering. Further, we\ninvestigate how the different implementations of the X-ray image formation\nmodel in the renderer affect the quality of 3D reconstruction and novel view\nsynthesis. When combined with our regularized voxel-based learning framework,\nwe find that using an exact implementation of the discrete Beer-Lambert law for\nX-ray attenuation in the renderer outperforms both widely used iterative CBCT\nreconstruction algorithms and modern neural field approaches, particularly when\ngiven only a few input views. 
As a result, we reconstruct high-fidelity 3D CBCT\nvolumes from fewer X-rays, potentially reducing ionizing radiation exposure and\nimproving diagnostic utility. Our implementation is available at\nhttps://github.com/hossein-momeni/DiffVox.\n","authors":["Mohammadhossein Momeni","Vivek Gopalakrishnan","Neel Dey","Polina Golland","Sarah Frisken"],"pdf_url":"https://arxiv.org/pdf/2411.19224v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.18977v2","updated":"2024-12-02T02:01:05Z","published":"2024-11-28T07:58:30Z","title":"Det-SAM2:Technical Report on the Self-Prompting Segmentation Framework\n Based on Segment Anything Model 2","summary":" Segment Anything Model 2 (SAM2) demonstrates exceptional performance in video\nsegmentation and refinement of segmentation results. We anticipate that it can\nfurther evolve to achieve higher levels of automation for practical\napplications. Building upon SAM2, we conducted a series of practices that\nultimately led to the development of a fully automated pipeline, termed\nDet-SAM2, in which object prompts are automatically generated by a detection\nmodel to facilitate inference and refinement by SAM2. This pipeline enables\ninference on infinitely long video streams with constant VRAM and RAM usage,\nall while preserving the same efficiency and accuracy as the original SAM2.\n This technical report focuses on the construction of the overall Det-SAM2\nframework and the subsequent engineering optimization applied to SAM2. We\npresent a case demonstrating an application built on the Det-SAM2 framework: AI\nrefereeing in a billiards scenario, derived from our business context. 
The\nproject at \\url{https://github.com/motern88/Det-SAM2}.\n","authors":["Zhiting Wang","Qiangong Zhou","Zongyang Liu"],"pdf_url":"https://arxiv.org/pdf/2411.18977v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14064v3","updated":"2024-12-02T00:42:44Z","published":"2023-11-23T15:42:42Z","title":"HGCLIP: Exploring Vision-Language Models with Graph Representations for\n Hierarchical Understanding","summary":" Object categories are typically organized into a multi-granularity taxonomic\nhierarchy. When classifying categories at different hierarchy levels,\ntraditional uni-modal approaches focus primarily on image features, revealing\nlimitations in complex scenarios. Recent studies integrating Vision-Language\nModels (VLMs) with class hierarchies have shown promise, yet they fall short of\nfully exploiting the hierarchical relationships. These efforts are constrained\nby their inability to perform effectively across varied granularity of\ncategories. To tackle this issue, we propose a novel framework (HGCLIP) that\neffectively combines CLIP with a deeper exploitation of the Hierarchical class\nstructure via Graph representation learning. We explore constructing the class\nhierarchy into a graph, with its nodes representing the textual or image\nfeatures of each category. After passing through a graph encoder, the textual\nfeatures incorporate hierarchical structure information, while the image\nfeatures emphasize class-aware features derived from prototypes through the\nattention mechanism. Our approach demonstrates significant improvements on 11\ndiverse visual recognition benchmarks. 
Our codes are fully available at\nhttps://github.com/richard-peng-xia/HGCLIP.\n","authors":["Peng Xia","Xingtong Yu","Ming Hu","Lie Ju","Zhiyong Wang","Peibo Duan","Zongyuan Ge"],"pdf_url":"https://arxiv.org/pdf/2311.14064v3.pdf","comment":"COLING 2025"},{"id":"http://arxiv.org/abs/2401.05738v3","updated":"2024-12-02T00:04:23Z","published":"2024-01-11T08:40:35Z","title":"Interpreting and Improving Attention From the Perspective of Large\n Kernel Convolution","summary":" Attention mechanisms have significantly advanced visual models by capturing\nglobal context effectively. However, their reliance on large-scale datasets and\nsubstantial computational resources poses challenges in data-scarce and\nresource-constrained scenarios. Moreover, traditional self-attention mechanisms\nlack inherent spatial inductive biases, making them suboptimal for modeling\nlocal features critical to tasks involving smaller datasets. In this work, we\nintroduce Large Kernel Convolutional Attention (LKCA), a novel formulation that\nreinterprets attention operations as a single large-kernel convolution. This\ndesign unifies the strengths of convolutional architectures locality and\ntranslation invariance with the global context modeling capabilities of\nself-attention. By embedding these properties into a computationally efficient\nframework, LKCA addresses key limitations of traditional attention mechanisms.\nThe proposed LKCA achieves competitive performance across various visual tasks,\nparticularly in data-constrained settings. Experimental results on CIFAR-10,\nCIFAR-100, SVHN, and Tiny-ImageNet demonstrate its ability to excel in image\nclassification, outperforming conventional attention mechanisms and vision\ntransformers in compact model settings. 
These findings highlight the\neffectiveness of LKCA in bridging local and global feature modeling, offering a\npractical and robust solution for real-world applications with limited data and\nresources.\n","authors":["Chenghao Li","Chaoning Zhang","Boheng Zeng","Yi Lu","Pengbo Shi","Qingzi Chen","Jirui Liu","Lingyun Zhu","Yang Yang","Heng Tao Shen"],"pdf_url":"https://arxiv.org/pdf/2401.05738v3.pdf","comment":null}],"Artificial Intelligence":[{"id":"http://arxiv.org/abs/2410.16208v3","updated":"2024-12-02T18:59:28Z","published":"2024-10-21T17:11:21Z","title":"Compute-Constrained Data Selection","summary":" Data selection can reduce the amount of training data needed to finetune\nLLMs; however, the efficacy of data selection scales directly with its compute.\nMotivated by the practical challenge of compute-constrained finetuning, we\nconsider the setting in which both the cost of selecting data and training are\nbudgeted for. We first formalize the problem of data selection with a\ncost-aware utility function, and model the data selection problem as trading\noff initial-selection cost for training gain. We run a comprehensive sweep of\nexperiments across multiple tasks, varying compute budget by scaling finetuning\ntokens, model sizes, and data selection compute. Interestingly we find that\nmany powerful data selection methods are almost never compute-optimal, and that\ncheaper data selection alternatives dominate both from a theoretical and\nempirical perspective. For compute-optimal training, we find that perplexity\nand gradient data selection require training-to-selection model size ratios of\n5x and 10x, respectively.\n","authors":["Junjie Oscar Yin","Alexander M. 
Rush"],"pdf_url":"https://arxiv.org/pdf/2410.16208v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.17501v2","updated":"2024-12-02T18:54:28Z","published":"2024-11-26T15:13:06Z","title":"Inference Scaling fLaws: The Limits of LLM Resampling with Imperfect\n Verifiers","summary":" Recent research has generated hope that inference scaling could allow weaker\nlanguage models to match or exceed the accuracy of stronger models, such as by\nrepeatedly sampling solutions to a coding problem until it passes unit tests.\nThe central thesis of this paper is that there is no free lunch for inference\nscaling: indefinite accuracy improvement through resampling can only be\nrealized if the \"verifier\" (in this case, a set of unit tests) is perfect. When\nthe verifier is imperfect, as it almost always is in domains such as reasoning\nor coding (for example, unit tests have imperfect coverage), there is a nonzero\nprobability of false positives: incorrect solutions that pass the verifier.\nResampling cannot decrease this probability, so it imposes an upper bound to\nthe accuracy of resampling-based inference scaling even with an infinite\ncompute budget. We find that there is a very strong correlation between the\nmodel's single-sample accuracy (i.e. accuracy without unit tests) and its false\npositive rate on coding benchmarks HumanEval and MBPP, whose unit tests have\nlimited coverage. Therefore, no amount of inference scaling of weaker models\ncan enable them to match the single-sample accuracy of a sufficiently strong\nmodel (Fig. 1a). When we consider that false positives have a negative utility\ncompared to abstaining from producing a solution, it bends the inference\nscaling curve further downward. Empirically, we find that the optimal number of\nsamples can be less than 10 under realistic assumptions (Fig. 1b). 
Finally, we\nshow that beyond accuracy, false positives may have other undesirable\nqualities, such as poor adherence to coding style conventions.\n","authors":["Benedikt Stroebl","Sayash Kapoor","Arvind Narayanan"],"pdf_url":"https://arxiv.org/pdf/2411.17501v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.00170v2","updated":"2024-12-02T18:37:01Z","published":"2024-07-31T21:43:55Z","title":"CREW: Facilitating Human-AI Teaming Research","summary":" With the increasing deployment of artificial intelligence (AI) technologies,\nthe potential of humans working with AI agents has been growing at a great\nspeed. Human-AI teaming is an important paradigm for studying various aspects\nwhen humans and AI agents work together. The unique aspect of Human-AI teaming\nresearch is the need to jointly study humans and AI agents, demanding\nmultidisciplinary research efforts from machine learning to human-computer\ninteraction, robotics, cognitive science, neuroscience, psychology, social\nscience, and complex systems. However, existing platforms for Human-AI teaming\nresearch are limited, often supporting oversimplified scenarios and a single\ntask, or specifically focusing on either human-teaming research or multi-agent\nAI algorithms. We introduce CREW, a platform to facilitate Human-AI teaming\nresearch in real-time decision-making scenarios and engage collaborations from\nmultiple scientific disciplines, with a strong emphasis on human involvement.\nIt includes pre-built tasks for cognitive studies and Human-AI teaming with\nexpandable potentials from our modular design. Following conventional cognitive\nneuroscience research, CREW also supports multimodal human physiological signal\nrecording for behavior analysis. Moreover, CREW benchmarks real-time\nhuman-guided reinforcement learning agents using state-of-the-art algorithms\nand well-tuned baselines. 
With CREW, we were able to conduct 50 human subject\nstudies within a week to verify the effectiveness of our benchmark.\n","authors":["Lingyu Zhang","Zhengran Ji","Boyuan Chen"],"pdf_url":"https://arxiv.org/pdf/2408.00170v2.pdf","comment":"Our project website is at: http://generalroboticslab.com/CREW"},{"id":"http://arxiv.org/abs/2406.16738v2","updated":"2024-12-02T18:27:02Z","published":"2024-06-24T15:45:20Z","title":"Inducing Group Fairness in Prompt-Based Language Model Decisions","summary":" Classifiers are used throughout industry to enforce policies, ranging from\nthe detection of toxic content to age-appropriate content filtering. While\nthese classifiers serve important functions, it is also essential that they are\nbuilt in ways that minimize unfair biases for users.\n One such fairness consideration is called group fairness, which desires that\ndifferent sub-population of users receive equal treatment. This is a\nwell-studied problem in the context of 'classical' classifiers. However, the\nemergence of prompt-based language model (LM) decision making has created new\nopportunities to solve text-based classification tasks, and the fairness\nproperties of these new classifiers are not yet well understood. Further, the\n`remediation toolkit' is incomplete for LM-based decision makers and little is\nunderstood about how to improve decision maker group fairness while maintaining\nclassifier performance.\n This work sets out to add more tools to that toolbox. We introduce\nadaptations of existing effective approaches from the classical classifier\nfairness to the prompt-based classifier space. We also devise simple methods\nthat take advantage of the new structure of prompt-based decision makers and\noperate at the prompt level. We compare these approaches empirically on real\ndata. 
Our results suggest that adaptations of approaches that are effective for\nclassical classifiers remain effective in the LM-based classifier environment.\nHowever, there is room for further exploration of prompt-based remediation\nmethods (and other remediation methods that take advantage of LM structure).\n","authors":["James Atwood","Nino Scherrer","Preethi Lahoti","Ananth Balashankar","Flavien Prost","Ahmad Beirami"],"pdf_url":"https://arxiv.org/pdf/2406.16738v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05677v2","updated":"2024-12-02T18:13:28Z","published":"2024-09-09T14:44:19Z","title":"RIRAG: Regulatory Information Retrieval and Answer Generation","summary":" Regulatory documents, issued by governmental regulatory bodies, establish\nrules, guidelines, and standards that organizations must adhere to for legal\ncompliance. These documents, characterized by their length, complexity and\nfrequent updates, are challenging to interpret, requiring significant\nallocation of time and expertise on the part of organizations to ensure ongoing\ncompliance. Regulatory Natural Language Processing (RegNLP) is a\nmultidisciplinary field aimed at simplifying access to and interpretation of\nregulatory rules and obligations. We introduce a task of generating\nquestion-passages pairs, where questions are automatically created and paired\nwith relevant regulatory passages, facilitating the development of regulatory\nquestion-answering systems. 
We create the ObliQA dataset, containing 27,869\nquestions derived from the collection of Abu Dhabi Global Markets (ADGM)\nfinancial regulation documents, design a baseline Regulatory Information\nRetrieval and Answer Generation (RIRAG) system and evaluate it with RePASs, a\nnovel evaluation metric that tests whether generated answers accurately capture\nall relevant obligations while avoiding contradictions.\n","authors":["Tuba Gokhan","Kexin Wang","Iryna Gurevych","Ted Briscoe"],"pdf_url":"https://arxiv.org/pdf/2409.05677v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.15098v3","updated":"2024-12-02T17:59:40Z","published":"2024-11-22T17:55:15Z","title":"OminiControl: Minimal and Universal Control for Diffusion Transformer","summary":" In this paper, we introduce OminiControl, a highly versatile and\nparameter-efficient framework that integrates image conditions into pre-trained\nDiffusion Transformer (DiT) models. At its core, OminiControl leverages a\nparameter reuse mechanism, enabling the DiT to encode image conditions using\nitself as a powerful backbone and process them with its flexible multi-modal\nattention processors. Unlike existing methods, which rely heavily on additional\nencoder modules with complex architectures, OminiControl (1) effectively and\nefficiently incorporates injected image conditions with only ~0.1% additional\nparameters, and (2) addresses a wide range of image conditioning tasks in a\nunified manner, including subject-driven generation and spatially-aligned\nconditions such as edges, depth, and more. Remarkably, these capabilities are\nachieved by training on images generated by the DiT itself, which is\nparticularly beneficial for subject-driven generation. 
Extensive evaluations\ndemonstrate that OminiControl outperforms existing UNet-based and DiT-adapted\nmodels in both subject-driven and spatially-aligned conditional generation.\nAdditionally, we release our training dataset, Subjects200K, a diverse\ncollection of over 200,000 identity-consistent images, along with an efficient\ndata synthesis pipeline to advance research in subject-consistent generation.\n","authors":["Zhenxiong Tan","Songhua Liu","Xingyi Yang","Qiaochu Xue","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2411.15098v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.17593v3","updated":"2024-12-02T17:43:20Z","published":"2024-11-26T17:01:27Z","title":"What Differentiates Educational Literature? A Multimodal Fusion Approach\n of Transformers and Computational Linguistics","summary":" The integration of new literature into the English curriculum remains a\nchallenge since educators often lack scalable tools to rapidly evaluate\nreadability and adapt texts for diverse classroom needs. This study proposes to\naddress this gap through a multimodal approach that combines transformer-based\ntext classification with linguistic feature analysis to align texts with UK Key\nStages. Eight state-of-the-art Transformers were fine-tuned on segmented text\ndata, with BERT achieving the highest unimodal F1 score of 0.75. In parallel,\n500 deep neural network topologies were searched for the classification of\nlinguistic characteristics, achieving an F1 score of 0.392. The fusion of these\nmodalities shows a significant improvement, with every multimodal approach\noutperforming all unimodal models. In particular, the ELECTRA Transformer fused\nwith the neural network achieved an F1 score of 0.996. Unimodal and multimodal\napproaches are shown to have statistically significant differences in all\nvalidation metrics (accuracy, precision, recall, F1 score) except for inference\ntime. 
The proposed approach is finally encapsulated in a stakeholder-facing web\napplication, providing non-technical stakeholder access to real-time insights\non text complexity, reading difficulty, curriculum alignment, and\nrecommendations for learning age range. The application empowers data-driven\ndecision making and reduces manual workload by integrating AI-based\nrecommendations into lesson planning for English literature.\n","authors":["Jordan J. Bird"],"pdf_url":"https://arxiv.org/pdf/2411.17593v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12850v2","updated":"2024-12-02T17:38:21Z","published":"2024-01-23T15:35:44Z","title":"End-to-End Supervised Hierarchical Graph Clustering for Speaker\n Diarization","summary":" Speaker diarization, the task of segmenting an audio recording based on\nspeaker identity, constitutes an important speech pre-processing step for\nseveral downstream applications.The conventional approach to diarization\ninvolves multiple steps of embedding extraction and clustering, which are often\noptimized in an isolated fashion. While end-to-end diarization systems attempt\nto learn a single model for the task, they are often cumbersome to train and\nrequire large supervised datasets. In this paper, we propose an end-to-end\nsupervised hierarchical clustering algorithm based on graph neural networks\n(GNN), called End-to-end Supervised HierARchical Clustering (E-SHARC). The\nembedding extractor is initialized using a pre-trained x-vector model while the\nGNN model is trained initially using the x-vector embeddings from the\npre-trained model. Finally, the E-SHARC model uses the front-end mel-filterbank\nfeatures as input and jointly optimizes the embedding extractor and the GNN\nclustering module, performing representation learning, metric learning, and\nclustering with end-to-end optimization. 
Further, with additional inputs from\nan external overlap detector, the E-SHARC approach is capable of predicting the\nspeakers in the overlapping speech regions. The experimental evaluation on\nbenchmark datasets like AMI, Voxconverse and DISPLACE, illustrates that the\nproposed E-SHARC framework provides competitive diarization results using graph\nbased clustering methods.\n","authors":["Prachi Singh","Sriram Ganapathy"],"pdf_url":"https://arxiv.org/pdf/2401.12850v2.pdf","comment":"11 pages. Under review IEEE TASLP. \\c{opyright} 2024 IEEE"},{"id":"http://arxiv.org/abs/2404.17644v4","updated":"2024-12-02T17:12:54Z","published":"2024-04-26T18:08:15Z","title":"A Conditional Independence Test in the Presence of Discretization","summary":" Testing conditional independence has many applications, such as in Bayesian\nnetwork learning and causal discovery. Different test methods have been\nproposed. However, existing methods generally can not work when only\ndiscretized observations are available. Specifically, consider $X_1$,\n$\\tilde{X}_2$ and $X_3$ are observed variables, where $\\tilde{X}_2$ is a\ndiscretization of latent variables $X_2$. Applying existing test methods to the\nobservations of $X_1$, $\\tilde{X}_2$ and $X_3$ can lead to a false conclusion\nabout the underlying conditional independence of variables $X_1$, $X_2$ and\n$X_3$. Motivated by this, we propose a conditional independence test\nspecifically designed to accommodate the presence of such discretization. To\nachieve this, we design the bridge equations to recover the parameter\nreflecting the statistical information of the underlying latent continuous\nvariables. An appropriate test statistic and its asymptotic distribution under\nthe null hypothesis of conditional independence have also been derived. 
Both\ntheoretical results and empirical validation have been provided, demonstrating\nthe effectiveness of our test methods.\n","authors":["Boyang Sun","Yu Yao","Huangyuan Hao","Yumou Qiu","Kun Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.17644v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.19839v3","updated":"2024-12-02T16:27:16Z","published":"2024-09-30T00:41:51Z","title":"ForecastBench: A Dynamic Benchmark of AI Forecasting Capabilities","summary":" Forecasts of future events are essential inputs into informed\ndecision-making. Machine learning (ML) systems have the potential to deliver\nforecasts at scale, but there is no framework for evaluating the accuracy of ML\nsystems on a standardized set of forecasting questions. To address this gap, we\nintroduce ForecastBench: a dynamic benchmark that evaluates the accuracy of ML\nsystems on an automatically generated and regularly updated set of 1,000\nforecasting questions. To avoid any possibility of data leakage, ForecastBench\nis comprised solely of questions about future events that have no known answer\nat the time of submission. We quantify the capabilities of current ML systems\nby collecting forecasts from expert (human) forecasters, the general public,\nand LLMs on a random subset of questions from the benchmark ($N=200$). While\nLLMs have achieved super-human performance on many benchmarks, they perform\nless well here: expert forecasters outperform the top-performing LLM (p-value\n$<0.01$). We display system and human scores in a public leaderboard at\nwww.forecastbench.org.\n","authors":["Ezra Karger","Houtan Bastani","Chen Yueh-Han","Zachary Jacobs","Danny Halawi","Fred Zhang","Philip E. 
Tetlock"],"pdf_url":"https://arxiv.org/pdf/2409.19839v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.06562v2","updated":"2024-12-02T15:32:41Z","published":"2023-12-11T17:46:44Z","title":"On Meta-Prompting","summary":" Modern generative language models are capable of interpreting input strings\nas instructions, or prompts, and carry out tasks based on them. Many approaches\nto prompting and pre-training these models involve the automated generation of\nthese prompts: meta-prompting, or prompting to obtain prompts. We propose a\ntheoretical framework based on category theory to generalize and describe them.\nThis framework is flexible enough to account for stochasticity, and allows us\nto obtain formal results around task agnosticity and equivalence of various\nmeta-prompting approaches. Experimentally, we test our framework in two active\nareas of model research: creativity and ideation. We find that user preference\nstrongly favors (p < 0.01) the prompts generated under meta-prompting, as well\nas their corresponding outputs, over a series of hardcoded baseline prompts\nthat include the original task definition. Using our framework, we argue that\nmeta-prompting is more effective than basic prompting at generating desirable\noutputs.\n","authors":["Adrian de Wynter","Xun Wang","Qilong Gu","Si-Qing Chen"],"pdf_url":"https://arxiv.org/pdf/2312.06562v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2409.13730v2","updated":"2024-12-02T15:11:23Z","published":"2024-09-10T01:20:26Z","title":"VisScience: An Extensive Benchmark for Evaluating K12 Educational\n Multi-modal Scientific Reasoning","summary":" Multi-modal large language models (MLLMs) have demonstrated promising\ncapabilities across various tasks by integrating textual and visual information\nto achieve visual understanding in complex scenarios. 
Despite the availability\nof several benchmarks aims to evaluating MLLMs in tasks from visual question\nanswering to complex problem-solving, most focus predominantly on mathematics\nor general visual understanding tasks. This reveals a critical gap in current\nbenchmarks, which often overlook the inclusion of other key scientific\ndisciplines such as physics and chemistry. To address this gap, we meticulously\nconstruct a comprehensive benchmark, named VisScience, which is utilized to\nassess the multi-modal scientific reasoning across the three disciplines of\nmathematics, physics, and chemistry. This benchmark comprises 3,000 questions\ndrawn from K12 education - spanning elementary school through high school -\nequally distributed across three disciplines, with 1,000 questions per\ndiscipline. The questions within VisScience span 21 distinct subjects and are\ncategorized into five difficulty levels, offering a broad spectrum of topics\nwithin each discipline. With VisScience, we present a detailed evaluation of\nthe performance of 25 representative MLLMs in scientific reasoning.\nExperimental results demonstrate that closed-source MLLMs generally outperform\nopen-source models. The best performance observed include a 53.4\\% accuracy in\nmathematics by Claude3.5-Sonnet, 38.2\\% in physics by GPT-4o, and 47.0\\% in\nchemistry by Gemini-1.5-Pro. 
These results underscore the strengths and\nlimitations of MLLMs, suggesting areas for future improvement and highlighting\nthe importance of developing models that can effectively handle the diverse\ndemands of multi-modal scientific reasoning.\n","authors":["Zhihuan Jiang","Zhen Yang","Jinhao Chen","Zhengxiao Du","Weihan Wang","Bin Xu","Jie Tang"],"pdf_url":"https://arxiv.org/pdf/2409.13730v2.pdf","comment":"89 pages, 70 figures"},{"id":"http://arxiv.org/abs/2409.13729v2","updated":"2024-12-02T14:59:08Z","published":"2024-09-10T01:20:22Z","title":"MathGLM-Vision: Solving Mathematical Problems with Multi-Modal Large\n Language Model","summary":" Large language models (LLMs) have demonstrated significant capabilities in\nmathematical reasoning, particularly with text-based mathematical problems.\nHowever, current multi-modal large language models (MLLMs), especially those\nspecialized in mathematics, tend to focus predominantly on solving geometric\nproblems but ignore the diversity of visual information available in other\nareas of mathematics. Moreover, the geometric information for these specialized\nmathematical MLLMs is derived from several public datasets, which are typically\nlimited in diversity and complexity. To address these limitations, we aim to\nconstruct a fine-tuning dataset named MathVL, and develop a series of\nspecialized mathematical MLLMs termed MathGLM-Vision by conducting Supervised\nFine-Tuning (SFT) on MathVL with various parameter-scale backbones. To\nextensively evaluate the effectiveness of MathGLM-Vision, we conduct\nexperiments on several public benchmarks and our curated MathVL-test consisting\nof 2,000 problems. Experimental results demonstrate that MathGLM-Vision\nachieves significant improvements compared with some existing models, including\nbackbone models and open-source mathematical MLLMs. 
These findings indicate the\nimportance of diversity dataset in enhancing the mathematical reasoning\nabilities of MLLMs.\n","authors":["Zhen Yang","Jinhao Chen","Zhengxiao Du","Wenmeng Yu","Weihan Wang","Wenyi Hong","Zhihuan Jiang","Bin Xu","Jie Tang"],"pdf_url":"https://arxiv.org/pdf/2409.13729v2.pdf","comment":"30 pages,19 figures"},{"id":"http://arxiv.org/abs/2405.04101v2","updated":"2024-12-02T14:54:31Z","published":"2024-05-07T08:15:48Z","title":"Continual Learning in the Presence of Repetition","summary":" Continual learning (CL) provides a framework for training models in\never-evolving environments. Although re-occurrence of previously seen objects\nor tasks is common in real-world problems, the concept of repetition in the\ndata stream is not often considered in standard benchmarks for CL. Unlike with\nthe rehearsal mechanism in buffer-based strategies, where sample repetition is\ncontrolled by the strategy, repetition in the data stream naturally stems from\nthe environment. This report provides a summary of the CLVision challenge at\nCVPR 2023, which focused on the topic of repetition in class-incremental\nlearning. The report initially outlines the challenge objective and then\ndescribes three solutions proposed by finalist teams that aim to effectively\nexploit the repetition in the stream to learn continually. The experimental\nresults from the challenge highlight the effectiveness of ensemble-based\nsolutions that employ multiple versions of similar modules, each trained on\ndifferent but overlapping subsets of classes. This report underscores the\ntransformative potential of taking a different perspective in CL by employing\nrepetition in the data stream to foster innovative strategy design.\n","authors":["Hamed Hemati","Lorenzo Pellegrini","Xiaotian Duan","Zixuan Zhao","Fangfang Xia","Marc Masana","Benedikt Tscheschner","Eduardo Veas","Yuxiang Zheng","Shiji Zhao","Shao-Yuan Li","Sheng-Jun Huang","Vincenzo Lomonaco","Gido M. 
van de Ven"],"pdf_url":"https://arxiv.org/pdf/2405.04101v2.pdf","comment":"Accepted version, to appear in Neural Networks; Challenge Report of\n the 4th Workshop on Continual Learning in Computer Vision at CVPR"},{"id":"http://arxiv.org/abs/2411.01819v2","updated":"2024-12-02T14:42:09Z","published":"2024-11-04T05:39:01Z","title":"Free-Mask: A Novel Paradigm of Integration Between the Segmentation\n Diffusion Model and Image Editing to Improve Segmentation Ability","summary":" Current semantic segmentation models typically require a substantial amount\nof manually annotated data, a process that is both time-consuming and\nresource-intensive. Alternatively, leveraging advanced text-to-image models\nsuch as Midjourney and Stable Diffusion has emerged as an efficient strategy,\nenabling the automatic generation of synthetic data in place of manual\nannotations. However, previous methods have been limited to generating\nsingle-instance images, as the generation of multiple instances with Stable\nDiffusion has proven unstable. To address this limitation and expand the scope\nand diversity of synthetic datasets, we propose a framework \\textbf{Free-Mask}\nthat combines a Diffusion Model for segmentation with advanced image editing\ncapabilities, allowing for the integration of multiple objects into images via\ntext-to-image models. Our method facilitates the creation of highly realistic\ndatasets that closely emulate open-world environments while generating accurate\nsegmentation masks. It reduces the labor associated with manual annotation and\nalso ensures precise mask generation. 
Experimental results demonstrate that\nsynthetic data generated by \\textbf{Free-Mask} enables segmentation models to\noutperform those trained on real data, especially in zero-shot settings.\nNotably, \\textbf{Free-Mask} achieves new state-of-the-art results on previously\nunseen classes in the VOC 2012 benchmark.\n","authors":["Bo Gao","Fangxu Xing","Daniel Tang"],"pdf_url":"https://arxiv.org/pdf/2411.01819v2.pdf","comment":"16 pages,5 figures,5 tables"},{"id":"http://arxiv.org/abs/2409.08065v2","updated":"2024-12-02T14:29:14Z","published":"2024-09-12T14:16:56Z","title":"InvDesFlow: An AI search engine to explore possible high-temperature\n superconductors","summary":" The discovery of new superconducting materials, particularly those exhibiting\nhigh critical temperature ($T_c$), has been a vibrant area of study within the\nfield of condensed matter physics. Conventional approaches primarily rely on\nphysical intuition to search for potential superconductors within the existing\ndatabases. However, the known materials only scratch the surface of the\nextensive array of possibilities within the realm of materials. Here, we\ndevelop InvDesFlow, an AI search engine that integrates deep model pre-training\nand fine-tuning techniques, diffusion models, and physics-based approaches\n(e.g., first-principles electronic structure calculation) for the discovery of\nhigh-$T_c$ superconductors. Utilizing InvDesFlow, we have obtained 74\ndynamically stable materials with critical temperatures predicted by the AI\nmodel to be $T_c \\geq$ 15 K based on a very small set of samples. Notably,\nthese materials are not contained in any existing dataset. Furthermore, we\nanalyze trends in our dataset and individual materials including B$_4$CN$_3$\n(at 5 GPa) and B$_5$CN$_2$ (at ambient pressure) whose $T_c$s are 24.08 K and\n15.93 K, respectively. 
We demonstrate that AI technique can discover a set of\nnew high-$T_c$ superconductors, outline its potential for accelerating\ndiscovery of the materials with targeted properties.\n","authors":["Xiao-Qi Han","Zhenfeng Ouyang","Peng-Jie Guo","Hao Sun","Ze-Feng Gao","Zhong-Yi Lu"],"pdf_url":"https://arxiv.org/pdf/2409.08065v2.pdf","comment":"22 pages, 17 figures, 6 tables"},{"id":"http://arxiv.org/abs/2410.01639v2","updated":"2024-12-02T14:25:30Z","published":"2024-10-02T15:09:36Z","title":"Moral Alignment for LLM Agents","summary":" Decision-making agents based on pre-trained Large Language Models (LLMs) are\nincreasingly being deployed across various domains of human activity. While\ntheir applications are currently rather specialized, several research efforts\nare under way to develop more generalist agents. As LLM-based systems become\nmore agentic, their influence on human activity will grow and the transparency\nof this will decrease. Consequently, developing effective methods for aligning\nthem to human values is vital.\n The prevailing practice in alignment often relies on human preference data\n(e.g., in RLHF or DPO), in which values are implicit and are essentially\ndeduced from relative preferences over different model outputs. In this work,\ninstead of relying on human feedback, we introduce the design of reward\nfunctions that explicitly encode core human values for Reinforcement\nLearning-based fine-tuning of foundation agent models. Specifically, we use\nintrinsic rewards for the moral alignment of LLM agents.\n We evaluate our approach using the traditional philosophical frameworks of\nDeontological Ethics and Utilitarianism, quantifying moral rewards for agents\nin terms of actions and consequences on the Iterated Prisoner's Dilemma (IPD)\nenvironment. We also show how moral fine-tuning can be deployed to enable an\nagent to unlearn a previously developed selfish strategy. 
Finally, we find that\ncertain moral strategies learned on the IPD game generalize to several other\nmatrix game environments. In summary, we demonstrate that fine-tuning with\nintrinsic rewards is a promising general solution for aligning LLM agents to\nhuman values, and it might represent a more transparent and cost-effective\nalternative to currently predominant alignment techniques.\n","authors":["Elizaveta Tennant","Stephen Hailes","Mirco Musolesi"],"pdf_url":"https://arxiv.org/pdf/2410.01639v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.03235v2","updated":"2024-12-02T13:21:36Z","published":"2024-10-04T09:00:06Z","title":"Enriching Ontologies with Disjointness Axioms using Large Language\n Models","summary":" Ontologies often lack explicit disjointness declarations between classes,\ndespite their usefulness for sophisticated reasoning and consistency checking\nin Knowledge Graphs. In this study, we explore the potential of Large Language\nModels (LLMs) to enrich ontologies by identifying and asserting class\ndisjointness axioms. Our approach aims at leveraging the implicit knowledge\nembedded in LLMs, using prompt engineering to elicit this knowledge for\nclassifying ontological disjointness. We validate our methodology on the\nDBpedia ontology, focusing on open-source LLMs. Our findings suggest that LLMs,\nwhen guided by effective prompt strategies, can reliably identify disjoint\nclass relationships, thus streamlining the process of ontology completion\nwithout extensive manual input. For comprehensive disjointness enrichment, we\npropose a process that takes logical relationships between disjointness and\nsubclass statements into account in order to maintain satisfiability and reduce\nthe number of calls to the LLM. This work provides a foundation for future\napplications of LLMs in automated ontology enhancement and offers insights into\noptimizing LLM performance through strategic prompt design. 
Our code is\npublicly available on GitHub at https://github.com/n28div/llm-disjointness.\n","authors":["Elias Crum","Antonio De Santis","Manon Ovide","Jiaxin Pan","Alessia Pisu","Nicolas Lazzari","Sebastian Rudolph"],"pdf_url":"https://arxiv.org/pdf/2410.03235v2.pdf","comment":"Accepted at KBC-LM'24 workshop at ISWC 2024,\n https://ceur-ws.org/Vol-3853/paper1.pdf"},{"id":"http://arxiv.org/abs/2311.18328v3","updated":"2024-12-02T13:04:47Z","published":"2023-11-30T07:58:54Z","title":"Advances in 3D Neural Stylization: A Survey","summary":" Modern artificial intelligence offers a novel and transformative approach to\ncreating digital art across diverse styles and modalities like images, videos\nand 3D data, unleashing the power of creativity and revolutionizing the way\nthat we perceive and interact with visual content. This paper reports on recent\nadvances in stylized 3D asset creation and manipulation with the expressive\npower of neural networks. We establish a taxonomy for neural stylization,\nconsidering crucial design choices such as scene representation, guidance data,\noptimization strategies, and output styles. Building on such taxonomy, our\nsurvey first revisits the background of neural stylization on 2D images, and\nthen presents in-depth discussions on recent neural stylization methods for 3D\ndata, accompanied by a benchmark evaluating selected mesh and neural field\nstylization methods. 
Based on the insights gained from the survey, we highlight\nthe practical significance, open challenges, future research, and potential\nimpacts of neural stylization, which facilitates researchers and practitioners\nto navigate the rapidly evolving landscape of 3D content creation using modern\nartificial intelligence.\n","authors":["Yingshu Chen","Guocheng Shao","Ka Chun Shum","Binh-Son Hua","Sai-Kit Yeung"],"pdf_url":"https://arxiv.org/pdf/2311.18328v3.pdf","comment":"curated list of papers:\n https://github.com/chenyingshu/advances_3d_neural_stylization"},{"id":"http://arxiv.org/abs/2312.02522v2","updated":"2024-12-02T12:49:50Z","published":"2023-12-05T06:05:04Z","title":"MASP: Scalable GNN-based Planning for Multi-Agent Navigation","summary":" We investigate multi-agent navigation tasks, where multiple agents need to\nreach initially unassigned goals in a limited time. Classical planning-based\nmethods suffer from expensive computation overhead at each step and offer\nlimited expressiveness for complex cooperation strategies. In contrast,\nreinforcement learning (RL) has recently become a popular approach for\naddressing this issue. However, RL struggles with low data efficiency and\ncooperation when directly exploring (nearly) optimal policies in a large\nexploration space, especially with an increased number of agents(e.g., 10+\nagents) or in complex environments (e.g., 3-D simulators). In this paper, we\npropose the Multi-Agent Scalable Graph-based Planner (MASP), a goal-conditioned\nhierarchical planner for navigation tasks with a substantial number of agents\nin the decentralized setting. MASP employs a hierarchical framework to reduce\nspace complexity by decomposing a large exploration space into multiple\ngoal-conditioned subspaces, where a high-level policy assigns agents goals, and\na low-level policy navigates agents toward designated goals. 
For agent\ncooperation and the adaptation to varying team sizes, we model agents and goals\nas graphs to better capture their relationship. The high-level policy, the Goal\nMatcher, leverages a graph-based Self-Encoder and Cross-Encoder to optimize\ngoal assignment by updating the agent and the goal graphs. The low-level\npolicy, the Coordinated Action Executor, introduces the Group Information\nFusion to facilitate group division and extract agent relationships across\ngroups, enhancing training efficiency for agent cooperation. The results\ndemonstrate that MASP outperforms RL and planning-based baselines in task\nefficiency.\n","authors":["Xinyi Yang","Xinting Yang","Chao Yu","Jiayu Chen","Wenbo Ding","Huazhong Yang","Yu Wang"],"pdf_url":"https://arxiv.org/pdf/2312.02522v2.pdf","comment":"Submitted to IEEE RA-L"},{"id":"http://arxiv.org/abs/2410.07836v4","updated":"2024-12-02T12:44:48Z","published":"2024-10-10T11:52:07Z","title":"Masked Generative Priors Improve World Models Sequence Modelling\n Capabilities","summary":" Deep Reinforcement Learning (RL) has become the leading approach for creating\nartificial agents in complex environments. Model-based approaches, which are RL\nmethods with world models that predict environment dynamics, are among the most\npromising directions for improving data efficiency, forming a critical step\ntoward bridging the gap between research and real-world deployment. In\nparticular, world models enhance sample efficiency by learning in imagination,\nwhich involves training a generative sequence model of the environment in a\nself-supervised manner. Recently, Masked Generative Modelling has emerged as a\nmore efficient and superior inductive bias for modelling and generating token\nsequences. Building on the Efficient Stochastic Transformer-based World Models\n(STORM) architecture, we replace the traditional MLP prior with a Masked\nGenerative Prior (e.g., MaskGIT Prior) and introduce GIT-STORM. 
We evaluate our\nmodel on two downstream tasks: reinforcement learning and video prediction.\nGIT-STORM demonstrates substantial performance gains in RL tasks on the Atari\n100k benchmark. Moreover, we apply Transformer-based World Models to continuous\naction environments for the first time, addressing a significant gap in prior\nresearch. To achieve this, we employ a state mixer function that integrates\nlatent state representations with actions, enabling our model to handle\ncontinuous control tasks. We validate this approach through qualitative and\nquantitative analyses on the DeepMind Control Suite, showcasing the\neffectiveness of Transformer-based World Models in this new domain. Our results\nhighlight the versatility and efficacy of the MaskGIT dynamics prior, paving\nthe way for more accurate world models and effective RL policies.\n","authors":["Cristian Meo","Mircea Lica","Zarif Ikram","Akihiro Nakano","Vedant Shah","Aniket Rajiv Didolkar","Dianbo Liu","Anirudh Goyal","Justin Dauwels"],"pdf_url":"https://arxiv.org/pdf/2410.07836v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.03976v4","updated":"2024-12-02T12:44:25Z","published":"2023-11-07T13:24:01Z","title":"Topology Only Pre-Training: Towards Generalised Multi-Domain Graph\n Models","summary":" The principal benefit of unsupervised representation learning is that a\npre-trained model can be fine-tuned where data or labels are scarce. Existing\napproaches for graph representation learning are domain specific, maintaining\nconsistent node and edge features across the pre-training and target datasets.\nThis has precluded transfer to multiple domains. We present Topology Only\nPre-Training (ToP), a graph pre-training method based on node and edge feature\nexclusion. We show positive transfer on evaluation datasets from multiple\ndomains, including domains not present in pre-training data, running directly\ncontrary to assumptions made in contemporary works. 
On 75% of experiments, ToP\nmodels perform significantly $p \\leq 0.01$ better than a supervised baseline.\nPerformance is significantly positive on 85.7% of tasks when node and edge\nfeatures are used in fine-tuning. We further show that out-of-domain topologies\ncan produce more useful pre-training than in-domain. Under ToP we show better\ntransfer from non-molecule pre-training, compared to molecule pre-training, on\n79% of molecular benchmarks. Against the limited set of other generalist graph\nmodels ToP performs strongly, including against models with many orders of\nmagnitude larger. These findings show that ToP opens broad areas of research in\nboth transfer learning on scarcely populated graph domains and in graph\nfoundation models.\n","authors":["Alex O. Davies","Riku W. Green","Nirav S. Ajmeri","Telmo M. Silva Filho"],"pdf_url":"https://arxiv.org/pdf/2311.03976v4.pdf","comment":"28 pages, 5 figures, 5 tables. For in-development code see\n https://github.com/neutralpronoun/general-gcl"},{"id":"http://arxiv.org/abs/2411.02272v4","updated":"2024-12-02T12:36:30Z","published":"2024-11-04T17:03:55Z","title":"Combining Induction and Transduction for Abstract Reasoning","summary":" When learning an input-output mapping from very few examples, is it better to\nfirst infer a latent function that explains the examples, or is it better to\ndirectly predict new test outputs, e.g. using a neural network? We study this\nquestion on ARC by training neural models for induction (inferring latent\nfunctions) and transduction (directly predicting the test output for a given\ntest input). We train on synthetically generated variations of Python programs\nthat solve ARC training tasks. 
We find inductive and transductive models solve\ndifferent kinds of test problems, despite having the same training problems and\nsharing the same neural architecture: Inductive program synthesis excels at\nprecise computations, and at composing multiple concepts, while transduction\nsucceeds on fuzzier perceptual concepts. Ensembling them approaches human-level\nperformance on ARC.\n","authors":["Wen-Ding Li","Keya Hu","Carter Larsen","Yuqing Wu","Simon Alford","Caleb Woo","Spencer M. Dunn","Hao Tang","Michelangelo Naim","Dat Nguyen","Wei-Long Zheng","Zenna Tavares","Yewen Pu","Kevin Ellis"],"pdf_url":"https://arxiv.org/pdf/2411.02272v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.07818v5","updated":"2024-12-02T12:29:47Z","published":"2024-02-12T17:24:15Z","title":"Differentially Private Zeroth-Order Methods for Scalable Large Language\n Model Finetuning","summary":" Fine-tuning on task-specific datasets is a widely-embraced paradigm of\nharnessing the powerful capability of pretrained LLMs for various downstream\ntasks. Due to the popularity of LLMs fine-tuning and its accompanying privacy\nconcerns, differentially private (DP) fine-tuning of pretrained LLMs has been\nwidely used to safeguarding the privacy of task-specific datasets. Lying at the\ndesign core of DP LLM fine-tuning methods is the satisfactory tradeoff among\nprivacy, utility, and scalability. Most existing methods build upon the seminal\nwork of DP-SGD. 
Despite pushing the scalability of DP-SGD to its limit,\nDP-SGD-based fine-tuning methods are unfortunately limited by the inherent\ninefficiency of SGD.\n In this paper, we investigate the potential of DP zeroth-order methods for\nLLM pretraining, which avoids the scalability bottleneck of SGD by\napproximating the gradient with the more efficient zeroth-order gradient.\nRather than treating the zeroth-order method as a drop-in replacement for SGD,\nthis paper presents a comprehensive study both theoretically and empirically.\nFirst, we propose the stagewise DP zeroth-order method (DP-ZOSO) that\ndynamically schedules key hyperparameters. This design is grounded on the\nsynergy between DP random perturbation and the gradient approximation error of\nthe zeroth-order method, and its effect on fine-tuning trajectory.\n We provide theoretical analysis for both proposed methods. We conduct\nextensive empirical analysis on both encoder-only masked language model and\ndecoder-only autoregressive language model, achieving impressive results in\nterms of scalability and utility regardless of the class of tasks (compared\nwith DPZero, DP-ZOPO improves $4.5\\%$ on SST-5, $5.5\\%$ on MNLI with\nRoBERTa-Large and 9.2\\% on CB, 3.9\\% on BoolQ with OPT-2.7b when $\\epsilon=4$,\ndemonstrates more significant enhancement in performance on more complicated\ntasks).\n","authors":["Z Liu","J Lou","W Bao","Y Hu","B Li","Z Qin","K Ren"],"pdf_url":"https://arxiv.org/pdf/2402.07818v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.08559v3","updated":"2024-12-02T12:16:19Z","published":"2024-10-11T06:30:48Z","title":"Learning General Representation of 12-Lead Electrocardiogram with a\n Joint-Embedding Predictive Architecture","summary":" Electrocardiogram (ECG) captures the heart's electrical signals, offering\nvaluable information for diagnosing cardiac conditions. However, the scarcity\nof labeled data makes it challenging to fully leverage supervised learning in\nmedical domain. 
Self-supervised learning (SSL) offers a promising solution,\nenabling models to learn from unlabeled data and uncover meaningful patterns.\nIn this paper, we show that masked modeling in the latent space can be a\npowerful alternative to existing self-supervised methods in the ECG domain. We\nintroduce ECG-JEPA, a SSL model for 12-lead ECG analysis that learns semantic\nrepresentations of ECG data by predicting in the hidden latent space, bypassing\nthe need to reconstruct raw signals. This approach offers several advantages in\nthe ECG domain: (1) it avoids producing unnecessary details, such as noise,\nwhich is common in ECG; and (2) it addresses the limitations of na\\\"ive L2 loss\nbetween raw signals. Another key contribution is the introduction of\nCross-Pattern Attention (CroPA), a specialized masked attention mechanism\ntailored for 12-lead ECG data. ECG-JEPA is trained on the union of several open\nECG datasets, totaling approximately 180,000 samples, and achieves\nstate-of-the-art performance in various downstream tasks including ECG\nclassification and feature prediction. Our code is openly available at\nhttps://github.com/sehunfromdaegu/ECG_JEPA.\n","authors":["Sehun Kim"],"pdf_url":"https://arxiv.org/pdf/2410.08559v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03795v4","updated":"2024-12-02T12:09:39Z","published":"2024-11-06T09:39:52Z","title":"VQA$^2$: Visual Question Answering for Video Quality Assessment","summary":" The advent and proliferation of large multi-modal models (LMMs) have\nintroduced new paradigms to computer vision, transforming various tasks into a\nunified visual question answering framework. Video Quality Assessment (VQA), a\nclassic field in low-level visual perception, focused initially on quantitative\nvideo quality scoring. However, driven by advances in LMMs, it is now\nprogressing toward more holistic visual quality understanding tasks. 
Recent\nstudies in the image domain have demonstrated that Visual Question Answering\n(VQA) can markedly enhance low-level visual quality evaluation. Nevertheless,\nrelated work has not been explored in the video domain, leaving substantial\nroom for improvement. To address this gap, we introduce the VQA2 Instruction\nDataset - the first visual question answering instruction dataset that focuses\non video quality assessment. This dataset consists of 3 subsets and covers\nvarious video types, containing 157,755 instruction question-answer pairs.\nThen, leveraging this foundation, we present the VQA2 series models. The VQA2\nseries models interleave visual and motion tokens to enhance the perception of\nspatial-temporal quality details in videos. We conduct extensive experiments on\nvideo quality scoring and understanding tasks, and results demonstrate that the\nVQA2series models achieve excellent performance in both tasks. Notably, our\nfinal model, the VQA2-Assistant, exceeds the renowned GPT-4o in visual quality\nunderstanding tasks while maintaining strong competitiveness in quality scoring\ntasks. Our work provides a foundation and feasible approach for integrating\nlow-level video quality assessment and understanding with LMMs.\n","authors":["Ziheng Jia","Zicheng Zhang","Jiaying Qian","Haoning Wu","Wei Sun","Chunyi Li","Xiaohong Liu","Weisi Lin","Guangtao Zhai","Xiongkuo Min"],"pdf_url":"https://arxiv.org/pdf/2411.03795v4.pdf","comment":"23 pages 12 figures"},{"id":"http://arxiv.org/abs/2410.23132v2","updated":"2024-12-02T12:05:29Z","published":"2024-10-30T15:42:59Z","title":"Revisiting MAE pre-training for 3D medical image segmentation","summary":" Self-Supervised Learning (SSL) presents an exciting opportunity to unlock the\npotential of vast, untapped clinical datasets, for various downstream\napplications that suffer from the scarcity of labeled data. 
While SSL has\nrevolutionized fields like natural language processing and computer vision, its\nadoption in 3D medical image computing has been limited by three key pitfalls:\nSmall pre-training dataset sizes, architectures inadequate for 3D medical image\nanalysis, and insufficient evaluation practices. In this paper, we address\nthese issues by i) leveraging a large-scale dataset of 39k 3D brain MRI volumes\nand ii) using a Residual Encoder U-Net architecture within the state-of-the-art\nnnU-Net framework. iii) A robust development framework, incorporating 5\ndevelopment and 8 testing brain MRI segmentation datasets, allowed\nperformance-driven design decisions to optimize the simple concept of Masked\nAuto Encoders (MAEs) for 3D CNNs. The resulting model not only surpasses\nprevious SSL methods but also outperforms the strong nnU-Net baseline by an\naverage of approximately 3 Dice points setting a new state-of-the-art. Our code\nand models are made available here.\n","authors":["Tassilo Wald","Constantin Ulrich","Stanislav Lukyanenko","Andrei Goncharov","Alberto Paderno","Leander Maerkisch","Paul F. Jäger","Klaus Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2410.23132v2.pdf","comment":"Arxiv Preprint. Revised and under review"},{"id":"http://arxiv.org/abs/2303.16668v3","updated":"2024-12-02T12:01:58Z","published":"2023-03-29T13:22:20Z","title":"Protecting Federated Learning from Extreme Model Poisoning Attacks via\n Multidimensional Time Series Anomaly Detection","summary":" Current defense mechanisms against model poisoning attacks in federated\nlearning (FL) systems have proven effective up to a certain threshold of\nmalicious clients. In this work, we introduce FLANDERS, a novel pre-aggregation\nfilter for FL resilient to large-scale model poisoning attacks, i.e., when\nmalicious clients far exceed legitimate participants. FLANDERS treats the\nsequence of local models sent by clients in each FL round as a matrix-valued\ntime series. 
Then, it identifies malicious client updates as outliers in this\ntime series by comparing actual observations with estimates generated by a\nmatrix autoregressive forecasting model maintained by the server. Experiments\nconducted in several non-iid FL setups show that FLANDERS significantly\nimproves robustness across a wide spectrum of attacks when paired with standard\nand robust existing aggregation methods.\n","authors":["Edoardo Gabrielli","Dimitri Belli","Zoe Matrullo","Vittorio Miori","Gabriele Tolomei"],"pdf_url":"https://arxiv.org/pdf/2303.16668v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.06098v3","updated":"2024-12-02T11:49:05Z","published":"2024-11-09T07:19:56Z","title":"An Architectural Approach to Enhance Deep Long-Tailed Learning","summary":" Deep long-tailed recognition has been widely studied to address the issue of\nimbalanced data distributions in real-world scenarios. However, there has been\ninsufficient focus on the design of neural architectures, despite empirical\nevidence suggesting that architecture can significantly impact performance. In\nthis paper, we attempt to mitigate long-tailed issues through architectural\nimprovements. To simplify the design process, we utilize Differential\nArchitecture Search (DARTS) to achieve this goal. Unfortunately, existing DARTS\nmethods struggle to perform well in long-tailed scenarios. To tackle this\nchallenge, we introduce Long-Tailed Differential Architecture Search (LTDAS).\nSpecifically, we conduct extensive experiments to explore architectural\ncomponents that demonstrate better performance on long-tailed data and propose\na new search space based on our observations. This ensures that the\narchitecture obtained through our search process incorporates superior\ncomponents. 
Additionally, we propose replacing the learnable linear classifier\nwith an Equiangular Tight Frame (ETF) classifier to further enhance our method.\nThis classifier effectively alleviates the biased search process and prevents\nperformance collapse. Extensive experimental evaluations demonstrate that our\napproach consistently improves upon existing methods from an orthogonal\nperspective and achieves state-of-the-art results with simple enhancements.\n","authors":["Yuhan Pan","Yanan Sun","Wei Gong"],"pdf_url":"https://arxiv.org/pdf/2411.06098v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.14708v2","updated":"2024-12-02T10:52:21Z","published":"2024-11-22T03:33:51Z","title":"Understanding LLM Embeddings for Regression","summary":" With the rise of large language models (LLMs) for flexibly processing\ninformation as strings, a natural application is regression, specifically by\npreprocessing string representations into LLM embeddings as downstream features\nfor metric prediction. In this paper, we provide one of the first comprehensive\ninvestigations into embedding-based regression and demonstrate that LLM\nembeddings as features can be better for high-dimensional regression tasks than\nusing traditional feature engineering. This regression performance can be\nexplained in part due to LLM embeddings over numeric data inherently preserving\nLipschitz continuity over the feature space. 
Furthermore, we quantify the\ncontribution of different model effects, most notably model size and language\nunderstanding, which we find surprisingly do not always improve regression\nperformance.\n","authors":["Eric Tang","Bangding Yang","Xingyou Song"],"pdf_url":"https://arxiv.org/pdf/2411.14708v2.pdf","comment":"16 pages, 13 figures"},{"id":"http://arxiv.org/abs/2403.19211v2","updated":"2024-12-02T10:44:08Z","published":"2024-03-28T08:19:33Z","title":"Dual-Personalizing Adapter for Federated Foundation Models","summary":" Recently, foundation models, particularly large language models (LLMs), have\ndemonstrated an impressive ability to adapt to various tasks by fine-tuning\ndiverse instruction data. Notably, federated foundation models (FedFM) emerge\nas a privacy preservation method to fine-tune models collaboratively under\nfederated learning (FL) settings by leveraging many distributed datasets with\nnon-IID data. To alleviate communication and computation overhead,\nparameter-efficient methods are introduced for efficiency, and some research\nadapted personalization methods to FedFM for better user preferences alignment.\nHowever, a critical gap in existing research is the neglect of test-time\ndistribution shifts in real-world applications, and conventional methods for\ntest-time distribution shifts in personalized FL are less effective for FedFM\ndue to their failure to adapt to complex distribution shift scenarios and the\nrequirement to train all parameters. To bridge this gap, we refine the setting\nin FedFM, termed test-time personalization, which aims to learn personalized\nfederated foundation models on clients while effectively handling test-time\ndistribution shifts simultaneously. To address challenges in this setting, we\nexplore a simple yet effective solution, a Federated Dual-Personalizing Adapter\n(FedDPA) architecture. 
By co-working with a foundation model, a global adapter\nand a local adapter jointly tackle the test-time distribution shifts and\nclient-specific personalization. Additionally, we introduce an instance-wise\ndynamic weighting mechanism that dynamically integrates the global and local\nadapters for each test instance during inference, facilitating effective\ntest-time personalization. The effectiveness of the proposed method has been\nevaluated on benchmark datasets across different NLP tasks.\n","authors":["Yiyuan Yang","Guodong Long","Tao Shen","Jing Jiang","Michael Blumenstein"],"pdf_url":"https://arxiv.org/pdf/2403.19211v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.00115v2","updated":"2024-12-02T10:25:47Z","published":"2024-08-28T04:07:40Z","title":"Self-Adaptive Quantum Kernel Principal Components Analysis for Compact\n Readout of Chemiresistive Sensor Arrays","summary":" The rapid growth of Internet of Things (IoT) devices necessitates efficient\ndata compression techniques to handle the vast amounts of data generated by\nthese devices. Chemiresistive sensor arrays (CSAs), a simple-to-fabricate but\ncrucial component in IoT systems, generate large volumes of data due to their\nsimultaneous multi-sensor operations. Classical principal component analysis\n(cPCA) methods, a common solution to the data compression challenge, face\nlimitations in preserving critical information during dimensionality reduction.\nIn this study, we present self-adaptive quantum kernel (SAQK) PCA as a superior\nalternative to enhance information retention. Our findings demonstrate that\nSAQK PCA outperforms cPCA in various back-end machine-learning tasks,\nespecially in low-dimensional scenarios where access to quantum bits is\nlimited. 
These results highlight the potential of noisy intermediate-scale\nquantum (NISQ) computers to revolutionize data processing in real-world IoT\napplications by improving the efficiency and reliability of CSA data\ncompression and readout, despite the current constraints on qubit availability.\n","authors":["Zeheng Wang","Timothy van der Laan","Muhammad Usman"],"pdf_url":"https://arxiv.org/pdf/2409.00115v2.pdf","comment":"Version 2"},{"id":"http://arxiv.org/abs/2409.06067v2","updated":"2024-12-02T10:18:38Z","published":"2024-09-09T21:04:16Z","title":"MLLM-LLaVA-FL: Multimodal Large Language Model Assisted Federated\n Learning","summary":" Previous studies on federated learning (FL) often encounter performance\ndegradation due to data heterogeneity among different clients. In light of the\nrecent advances in multimodal large language models (MLLMs), such as GPT-4v and\nLLaVA, which demonstrate their exceptional proficiency in multimodal tasks,\nsuch as image captioning and multimodal question answering. We introduce a\nnovel federated learning framework, named Multimodal Large Language Model\nAssisted Federated Learning (MLLM-LLaVA-FL), which employs powerful MLLMs at\nthe server end to address the heterogeneous and long-tailed challenges. Owing\nto the advanced cross-modality representation capabilities and the extensive\nopen-vocabulary prior knowledge of MLLMs, our framework is adept at harnessing\nthe extensive, yet previously underexploited, open-source data accessible from\nwebsites and powerful server-side computational resources. Hence, the\nMLLM-LLaVA-FL not only enhances the performance but also avoids increasing the\nrisk of privacy leakage and the computational burden on local devices,\ndistinguishing it from prior methodologies. Our framework has three key stages.\nInitially, we conduct global visual-text pretraining of the model. This\npretraining is facilitated by utilizing the extensive open-source data\navailable online, with the assistance of MLLMs. 
Subsequently, the pretrained\nmodel is distributed among various clients for local training. Finally, once\nthe locally trained models are transmitted back to the server, a global\nalignment is carried out under the supervision of MLLMs to further enhance the\nperformance. Experimental evaluations on established benchmarks, show that our\nframework delivers promising performance in the typical scenarios with data\nheterogeneity and long-tail distribution across different clients in FL.\n","authors":["Jianyi Zhang","Hao Frank Yang","Ang Li","Xin Guo","Pu Wang","Haiming Wang","Yiran Chen","Hai Li"],"pdf_url":"https://arxiv.org/pdf/2409.06067v2.pdf","comment":"Accepted to WACV 2025"},{"id":"http://arxiv.org/abs/2409.19437v3","updated":"2024-12-02T10:15:47Z","published":"2024-09-28T18:56:48Z","title":"Strongly-polynomial time and validation analysis of policy gradient\n methods","summary":" This paper proposes a novel termination criterion, termed the advantage gap\nfunction, for finite state and action Markov decision processes (MDP) and\nreinforcement learning (RL). By incorporating this advantage gap function into\nthe design of step size rules and deriving a new linear rate of convergence\nthat is independent of the stationary state distribution of the optimal policy,\nwe demonstrate that policy gradient methods can solve MDPs in\nstrongly-polynomial time. To the best of our knowledge, this is the first time\nthat such strong convergence properties have been established for policy\ngradient methods. 
Moreover, in the stochastic setting, where only stochastic\nestimates of policy gradients are available, we show that the advantage gap\nfunction provides close approximations of the optimality gap for each\nindividual state and exhibits a sublinear rate of convergence at every state.\nThe advantage gap function can be easily estimated in the stochastic case, and\nwhen coupled with easily computable upper bounds on policy values, they provide\na convenient way to validate the solutions generated by policy gradient\nmethods. Therefore, our developments offer a principled and computable measure\nof optimality for RL, whereas current practice tends to rely on\nalgorithm-to-algorithm or baselines comparisons with no certificate of\noptimality.\n","authors":["Caleb Ju","Guanghui Lan"],"pdf_url":"https://arxiv.org/pdf/2409.19437v3.pdf","comment":"Add numerical experiments"},{"id":"http://arxiv.org/abs/2407.06125v2","updated":"2024-12-02T10:02:18Z","published":"2024-07-08T17:00:51Z","title":"Depression Detection and Analysis using Large Language Models on Textual\n and Audio-Visual Modalities","summary":" Depression has proven to be a significant public health issue, profoundly\naffecting the psychological well-being of individuals. If it remains\nundiagnosed, depression can lead to severe health issues, which can manifest\nphysically and even lead to suicide. Generally, Diagnosing depression or any\nother mental disorder involves conducting semi-structured interviews alongside\nsupplementary questionnaires, including variants of the Patient Health\nQuestionnaire (PHQ) by Clinicians and mental health professionals. This\napproach places significant reliance on the experience and judgment of trained\nphysicians, making the diagnosis susceptible to personal biases. 
Given that the\nunderlying mechanisms causing depression are still being actively researched,\nphysicians often face challenges in diagnosing and treating the condition,\nparticularly in its early stages of clinical presentation. Recently,\nsignificant strides have been made in Artificial neural computing to solve\nproblems involving text, image, and speech in various domains. Our analysis has\naimed to leverage these state-of-the-art (SOTA) models in our experiments to\nachieve optimal outcomes leveraging multiple modalities. The experiments were\nperformed on the Extended Distress Analysis Interview Corpus Wizard of Oz\ndataset (E-DAIC) corpus presented in the Audio/Visual Emotion Challenge (AVEC)\n2019 Challenge. The proposed solutions demonstrate better results achieved by\nProprietary and Open-source Large Language Models (LLMs), which achieved a Root\nMean Square Error (RMSE) score of 3.98 on Textual Modality, beating the AVEC\n2019 challenge baseline results and current SOTA regression analysis\narchitectures. Additionally, the proposed solution achieved an accuracy of\n71.43% in the classification task. The paper also includes a novel audio-visual\nmulti-modal network that predicts PHQ-8 scores with an RMSE of 6.51.\n","authors":["Chayan Tank","Sarthak Pol","Vinayak Katoch","Shaina Mehta","Avinash Anand","Rajiv Ratn Shah"],"pdf_url":"https://arxiv.org/pdf/2407.06125v2.pdf","comment":"12 pages, 9 figures, 9 tables"},{"id":"http://arxiv.org/abs/2405.14377v2","updated":"2024-12-02T09:48:21Z","published":"2024-05-23T09:52:15Z","title":"CoMERA: Computing- and Memory-Efficient Training via Rank-Adaptive\n Tensor Optimization","summary":" Training large AI models such as LLMs and DLRMs costs massive GPUs and\ncomputing time. The high training cost has become only affordable to big tech\ncompanies, meanwhile also causing increasing concerns about the environmental\nimpact. 
This paper presents CoMERA, a Computing- and Memory-Efficient training\nmethod via Rank-Adaptive tensor optimization. CoMERA achieves rank-adaptive\ntensor-compressed (pre)-training via a multi-objective optimization formulation\nand improves the training to provide both a high compression ratio and\nexcellent accuracy in the training process. Our optimized numerical computation\n(e.g., optimized tensorized embedding and tensor-network contractions) and GPU\nimplementation eliminate part of the run-time overhead in the tensorized\ntraining on GPU. This leads to, for the first time, $2-3\\times$ speedup per\ntraining epoch compared with standard training. CoMERA also outperforms the\nrecent GaLore in terms of both memory and computing efficiency. Specifically,\nCoMERA is $2\\times$ faster per training epoch and $9\\times$ more\nmemory-efficient than GaLore on a tested six-encoder transformer with\nsingle-batch training. Our method also shows $\\sim 2\\times$ speedup than\nstandard pre-training on a BERT-like code-generation LLM while achieving\n$4.23\\times$ compression ratio in pre-training. With further HPC optimization,\nCoMERA may reduce the pre-training cost of many other LLMs. An implementation\nof CoMERA is available at https://github.com/ziyangjoy/CoMERA.\n","authors":["Zi Yang","Ziyue Liu","Samridhi Choudhary","Xinfeng Xie","Cao Gao","Siegfried Kunzmann","Zheng Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.14377v2.pdf","comment":"Accepted by Neurips 2024"},{"id":"http://arxiv.org/abs/2407.04125v2","updated":"2024-12-02T09:42:24Z","published":"2024-07-04T18:54:30Z","title":"Query-Guided Self-Supervised Summarization of Nursing Notes","summary":" Nursing notes, an important part of Electronic Health Records (EHRs), track a\npatient's health during a care episode. Summarizing key information in nursing\nnotes can help clinicians quickly understand patients' conditions. 
However,\nexisting summarization methods in the clinical setting, especially abstractive\nmethods, have overlooked nursing notes and require reference summaries for\ntraining. We introduce QGSumm, a novel query-guided self-supervised domain\nadaptation approach for abstractive nursing note summarization. The method uses\npatient-related clinical queries for guidance, and hence does not need\nreference summaries for training. Through automatic experiments and manual\nevaluation by an expert clinician, we study our approach and other\nstate-of-the-art Large Language Models (LLMs) for nursing note summarization.\nOur experiments show: 1) GPT-4 is competitive in maintaining information in the\noriginal nursing notes, 2) QGSumm can generate high-quality summaries with a\ngood balance between recall of the original content and hallucination rate\nlower than other top methods. Ultimately, our work offers a new perspective on\nconditional text summarization, tailored to clinical applications.\n","authors":["Ya Gao","Hans Moen","Saila Koivusalo","Miika Koskinen","Pekka Marttinen"],"pdf_url":"https://arxiv.org/pdf/2407.04125v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.15778v2","updated":"2024-12-02T09:06:32Z","published":"2024-11-24T10:58:48Z","title":"Enhancing the automatic segmentation and analysis of 3D liver\n vasculature models","summary":" Surgical assessment of liver cancer patients requires identification of the\nvessel trees from medical images. Specifically, the venous trees - the portal\n(perfusing) and the hepatic (draining) trees are important for understanding\nthe liver anatomy and disease state, and perform surgery planning. 
This\nresearch aims to improve the 3D segmentation, skeletonization, and subsequent\nanalysis of vessel trees, by creating an automatic pipeline based on deep\nlearning and image processing techniques.\n The first part of this work explores the impact of differentiable\nskeletonization methods such as ClDice and morphological skeletonization loss,\non the overall liver vessel segmentation performance. To this aim, it studies\nhow to improve vessel tree connectivity.\n The second part of this study converts a single class vessel segmentation\ninto multi-class ones, separating the two venous trees. It builds on the\nprevious two-class vessel segmentation model, which vessel tree outputs might\nbe entangled, and on connected components and skeleton analyses of the trees.\n After providing sub-labeling of the specific anatomical branches of each\nvenous tree, these algorithms also enable a morphometric analysis of the vessel\ntrees by extracting various geometrical markers.\n In conclusion, we propose a method that successfully improves current\nskeletonization methods, for extensive vascular trees that contain vessels of\ndifferent calibers. The separation algorithm creates a clean multi-class\nsegmentation of the vessels, validated by surgeons to provide low error. 
A new,\npublicly shared high-quality liver vessel dataset of 77 cases is thus created.\nFinally a method to annotate vessel trees according to anatomy is provided,\nenabling a unique liver vessel morphometry analysis.\n","authors":["Yassine Machta","Omar Ali","Kevin Hakkakian","Ana Vlasceanu","Amaury Facque","Nicolas Golse","Irene Vignon-Clementel"],"pdf_url":"https://arxiv.org/pdf/2411.15778v2.pdf","comment":"Internship at Simbiotx"},{"id":"http://arxiv.org/abs/2411.17772v2","updated":"2024-12-02T09:04:20Z","published":"2024-11-26T08:55:20Z","title":"MVBoost: Boost 3D Reconstruction with Multi-View Refinement","summary":" Recent advancements in 3D object reconstruction have been remarkable, yet\nmost current 3D models rely heavily on existing 3D datasets. The scarcity of\ndiverse 3D datasets results in limited generalization capabilities of 3D\nreconstruction models. In this paper, we propose a novel framework for boosting\n3D reconstruction with multi-view refinement (MVBoost) by generating pseudo-GT\ndata. The key of MVBoost is combining the advantages of the high accuracy of\nthe multi-view generation model and the consistency of the 3D reconstruction\nmodel to create a reliable data source. Specifically, given a single-view input\nimage, we employ a multi-view diffusion model to generate multiple views,\nfollowed by a large 3D reconstruction model to produce consistent 3D data.\nMVBoost then adaptively refines these multi-view images, rendered from the\nconsistent 3D data, to build a large-scale multi-view dataset for training a\nfeed-forward 3D reconstruction model. Additionally, the input view optimization\nis designed to optimize the corresponding viewpoints based on the user's input\nimage, ensuring that the most important viewpoint is accurately tailored to the\nuser's needs. 
Extensive evaluations demonstrate that our method achieves\nsuperior reconstruction results and robust generalization compared to prior\nworks.\n","authors":["Xiangyu Liu","Xiaomei Zhang","Zhiyuan Ma","Xiangyu Zhu","Zhen Lei"],"pdf_url":"https://arxiv.org/pdf/2411.17772v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.19876v2","updated":"2024-12-02T08:58:20Z","published":"2024-11-29T17:38:56Z","title":"LUMIA: Linear probing for Unimodal and MultiModal Membership Inference\n Attacks leveraging internal LLM states","summary":" Large Language Models (LLMs) are increasingly used in a variety of\napplications, but concerns around membership inference have grown in parallel.\nPrevious efforts focus on black-to-grey-box models, thus neglecting the\npotential benefit from internal LLM information. To address this, we propose\nthe use of Linear Probes (LPs) as a method to detect Membership Inference\nAttacks (MIAs) by examining internal activations of LLMs. Our approach, dubbed\nLUMIA, applies LPs layer-by-layer to get fine-grained data on the model inner\nworkings. We test this method across several model architectures, sizes and\ndatasets, including unimodal and multimodal tasks. In unimodal MIA, LUMIA\nachieves an average gain of 15.71 % in Area Under the Curve (AUC) over previous\ntechniques. Remarkably, LUMIA reaches AUC>60% in 65.33% of cases -- an\nincrement of 46.80% against the state of the art. 
Furthermore, our approach\nreveals key insights, such as the model layers where MIAs are most detectable.\nIn multimodal models, LPs indicate that visual inputs can significantly\ncontribute to detect MIAs -- AUC>60% is reached in 85.90% of experiments.\n","authors":["Luis Ibanez-Lissen","Lorena Gonzalez-Manzano","Jose Maria de Fuentes","Nicolas Anciaux","Joaquin Garcia-Alfaro"],"pdf_url":"https://arxiv.org/pdf/2411.19876v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03274v3","updated":"2024-12-02T08:53:40Z","published":"2024-09-05T06:31:37Z","title":"Recent Advances in Attack and Defense Approaches of Large Language\n Models","summary":" Large Language Models (LLMs) have revolutionized artificial intelligence and\nmachine learning through their advanced text processing and generating\ncapabilities. However, their widespread deployment has raised significant\nsafety and reliability concerns. Established vulnerabilities in deep neural\nnetworks, coupled with emerging threat models, may compromise security\nevaluations and create a false sense of security. Given the extensive research\nin the field of LLM security, we believe that summarizing the current state of\naffairs will help the research community better understand the present\nlandscape and inform future developments. This paper reviews current research\non LLM vulnerabilities and threats, and evaluates the effectiveness of\ncontemporary defense mechanisms. We analyze recent studies on attack vectors\nand model weaknesses, providing insights into attack mechanisms and the\nevolving threat landscape. We also examine current defense strategies,\nhighlighting their strengths and limitations. By contrasting advancements in\nattack and defense methodologies, we identify research gaps and propose future\ndirections to enhance LLM security. 
Our goal is to advance the understanding of\nLLM safety challenges and guide the development of more robust security\nmeasures.\n","authors":["Jing Cui","Yishi Xu","Zhewei Huang","Shuchang Zhou","Jianbin Jiao","Junge Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.03274v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.07278v2","updated":"2024-12-02T08:43:33Z","published":"2024-10-09T07:13:22Z","title":"PAR: Prompt-Aware Token Reduction Method for Efficient Large Multimodal\n Models","summary":" Multimodal large language models (MLLMs) demonstrate strong performance\nacross visual tasks, but their efficiency is hindered by significant\ncomputational and memory demands from processing long contexts in multimodal\ninputs. To address this, we introduce PAR (Prompt-Aware Token Reduction), a\nnovel and plug-and-play approach that reduces visual tokens efficiently without\ncompromising model performance. Unlike previous methods that rely heavily on\nattention mechanisms and overlooking cross-modal interactions , we uses a\nprompt-aware strategy to adpative identify and cluster essential visual tokens.\nPAR categorizes visual context redundancy into two types: external and\ninternal. External redundancy is minimized through semantic retrieval, while\ninternal redundancy is addressed using a token routing mechanism. This method\nsubstantially reduces computational load without requiring additional training\nor complex architectural modifications. 
\\textbf{Experimental results\ndemonstrate that across various visual question answering tasks, PAR reduces\nFLOPs by 83\\% with a compression ratio of 89\\%, while retaining 97\\% of\nbaseline accuracy.} The adaptive design of PAR achieves a 2x token reduction\nratio compared to prior approaches, enabling a better balance between\nperformance and efficiency.\n","authors":["Yingen Liu","Fan Wu","Ruihui Li","Zhuo Tang","Kenli Li"],"pdf_url":"https://arxiv.org/pdf/2410.07278v2.pdf","comment":"10 pages, 5 figures,3 tables"},{"id":"http://arxiv.org/abs/2410.01380v2","updated":"2024-12-02T08:43:16Z","published":"2024-10-02T09:49:45Z","title":"Knowledge Entropy Decay during Language Model Pretraining Hinders New\n Knowledge Acquisition","summary":" In this work, we investigate how a model's tendency to broadly integrate its\nparametric knowledge evolves throughout pretraining, and how this behavior\naffects overall performance, particularly in terms of knowledge acquisition and\nforgetting. We introduce the concept of knowledge entropy, which quantifies the\nrange of memory sources the model engages with; high knowledge entropy\nindicates that the model utilizes a wide range of memory sources, while low\nknowledge entropy suggests reliance on specific sources with greater certainty.\nOur analysis reveals a consistent decline in knowledge entropy as pretraining\nadvances. We also find that the decline is closely associated with a reduction\nin the model's ability to acquire and retain knowledge, leading us to conclude\nthat diminishing knowledge entropy (smaller number of active memory sources)\nimpairs the model's knowledge acquisition and retention capabilities. 
We find\nfurther support for this by demonstrating that increasing the activity of\ninactive memory sources enhances the model's capacity for knowledge acquisition\nand retention.\n","authors":["Jiyeon Kim","Hyunji Lee","Hyowon Cho","Joel Jang","Hyeonbin Hwang","Seungpil Won","Youbin Ahn","Dohaeng Lee","Minjoon Seo"],"pdf_url":"https://arxiv.org/pdf/2410.01380v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12787v2","updated":"2024-12-02T07:41:38Z","published":"2024-11-19T11:03:09Z","title":"Visual Cue Enhancement and Dual Low-Rank Adaptation for Efficient Visual\n Instruction Fine-Tuning","summary":" Parameter-efficient fine-tuning multimodal large language models (MLLMs)\npresents significant challenges, including reliance on high-level visual\nfeatures that limit fine-grained detail comprehension, and data conflicts that\narise from task complexity. To address these issues, we propose an efficient\nfine-tuning framework with two novel approaches: Vision Cue Enhancement (VCE)\nand Dual Low-Rank Adaptation (Dual-LoRA). VCE enhances the vision projector by\nintegrating multi-level visual cues, improving the model's ability to capture\nfine-grained visual features. Dual-LoRA introduces a dual low-rank structure\nfor instruction tuning, decoupling learning into skill and task spaces to\nenable precise control and efficient adaptation across diverse tasks. Our\nmethod simplifies implementation, enhances visual comprehension, and improves\nadaptability. 
Experiments on both downstream tasks and general benchmarks\ndemonstrate the effectiveness of our proposed approach.\n","authors":["Pengkun Jiao","Bin Zhu","Jingjing Chen","Chong-Wah Ngo","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2411.12787v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12027v3","updated":"2024-12-02T07:22:40Z","published":"2024-03-18T17:57:09Z","title":"From Pixels to Insights: A Survey on Automatic Chart Understanding in\n the Era of Large Foundation Models","summary":" Data visualization in the form of charts plays a pivotal role in data\nanalysis, offering critical insights and aiding in informed decision-making.\nAutomatic chart understanding has witnessed significant advancements with the\nrise of large foundation models in recent years. Foundation models, such as\nlarge language models, have revolutionized various natural language processing\ntasks and are increasingly being applied to chart understanding tasks. This\nsurvey paper provides a comprehensive overview of the recent developments,\nchallenges, and future directions in chart understanding within the context of\nthese foundation models. We review fundamental building blocks crucial for\nstudying chart understanding tasks. Additionally, we explore various tasks and\ntheir evaluation metrics and sources of both charts and textual inputs. Various\nmodeling strategies are then examined, encompassing both classification-based\nand generation-based approaches, along with tool augmentation techniques that\nenhance chart understanding performance. Furthermore, we discuss the\nstate-of-the-art performance of each task and discuss how we can improve the\nperformance. Challenges and future directions are addressed, highlighting the\nimportance of several topics, such as domain-specific charts, lack of efforts\nin developing evaluation metrics, and agent-oriented settings. 
This survey\npaper serves as a comprehensive resource for researchers and practitioners in\nthe fields of natural language processing, computer vision, and data analysis,\nproviding valuable insights and directions for future research in chart\nunderstanding leveraging large foundation models. The studies mentioned in this\npaper, along with emerging new research, will be continually updated at:\nhttps://github.com/khuangaf/Awesome-Chart-Understanding.\n","authors":["Kung-Hsiang Huang","Hou Pong Chan","Yi R. Fung","Haoyi Qiu","Mingyang Zhou","Shafiq Joty","Shih-Fu Chang","Heng Ji"],"pdf_url":"https://arxiv.org/pdf/2403.12027v3.pdf","comment":"IEEE Transactions on Knowledge and Data Engineering (TKDE)"},{"id":"http://arxiv.org/abs/2406.06594v2","updated":"2024-12-02T07:04:17Z","published":"2024-06-06T03:13:34Z","title":"Stock Movement Prediction with Multimodal Stable Fusion via Gated\n Cross-Attention Mechanism","summary":" The accurate prediction of stock movements is crucial for investment\nstrategies. Stock prices are subject to the influence of various forms of\ninformation, including financial indicators, sentiment analysis, news\ndocuments, and relational structures. Predominant analytical approaches,\nhowever, tend to address only unimodal or bimodal sources, neglecting the\ncomplexity of multimodal data. Further complicating the landscape are the\nissues of data sparsity and semantic conflicts between these modalities, which\nare frequently overlooked by current models, leading to unstable performance\nand limiting practical applicability. To address these shortcomings, this study\nintroduces a novel architecture, named Multimodal Stable Fusion with Gated\nCross-Attention (MSGCA), designed to robustly integrate multimodal input for\nstock movement prediction. 
The MSGCA framework consists of three integral\ncomponents: (1) a trimodal encoding module, responsible for processing\nindicator sequences, dynamic documents, and a relational graph, and\nstandardizing their feature representations; (2) a cross-feature fusion module,\nwhere primary and consistent features guide the multimodal fusion of the three\nmodalities via a pair of gated cross-attention networks; and (3) a prediction\nmodule, which refines the fused features through temporal and dimensional\nreduction to execute precise movement forecasting. Empirical evaluations\ndemonstrate that the MSGCA framework exceeds current leading methods, achieving\nperformance gains of 8.1%, 6.1%, 21.7% and 31.6% on four multimodal datasets,\nrespectively, attributed to its enhanced multimodal fusion stability.\n","authors":["Chang Zong","Hang Zhou"],"pdf_url":"https://arxiv.org/pdf/2406.06594v2.pdf","comment":"14 pages, 10 figures"},{"id":"http://arxiv.org/abs/2409.10825v3","updated":"2024-12-02T07:00:57Z","published":"2024-09-17T01:37:57Z","title":"Unveiling and Mitigating Bias in Large Language Model Recommendations: A\n Path to Fairness","summary":" excel in delivering comprehensive suggestions by deeply analyzing content and\nuser behavior. However, they often inherit biases from skewed training data,\nfavoring mainstream content while underrepresenting diverse or non-traditional\noptions. This study explores the interplay between bias and LLM-based\nrecommendation systems, focusing on music, song, and book recommendations\nacross diverse demographic and cultural groups. This paper analyzes bias in\nLLM-based recommendation systems across multiple models (GPT, LLaMA, and\nGemini), revealing its deep and pervasive impact on outcomes. Intersecting\nidentities and contextual factors, like socioeconomic status, further amplify\nbiases, complicating fair recommendations across diverse groups. 
Our findings\nreveal that bias in these systems is deeply ingrained, yet even simple\ninterventions like prompt engineering can significantly reduce it. We further\npropose a retrieval-augmented generation strategy to mitigate bias more\neffectively. Numerical experiments validate these strategies, demonstrating\nboth the pervasive nature of bias and the impact of the proposed solutions.\n","authors":["Anindya Bijoy Das","Shahnewaz Karim Sakib"],"pdf_url":"https://arxiv.org/pdf/2409.10825v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.19713v2","updated":"2024-12-02T06:53:59Z","published":"2024-11-29T14:01:34Z","title":"CantorNet: A Sandbox for Testing Geometrical and Topological Complexity\n Measures","summary":" Many natural phenomena are characterized by self-similarity, for example the\nsymmetry of human faces, or a repetitive motif of a song. Studying of such\nsymmetries will allow us to gain deeper insights into the underlying mechanisms\nof complex systems. Recognizing the importance of understanding these patterns,\nwe propose a geometrically inspired framework to study such phenomena in\nartificial neural networks. To this end, we introduce \\emph{CantorNet},\ninspired by the triadic construction of the Cantor set, which was introduced by\nGeorg Cantor in the $19^\\text{th}$ century. In mathematics, the Cantor set is a\nset of points lying on a single line that is self-similar and has a counter\nintuitive property of being an uncountably infinite null set. Similarly, we\nintroduce CantorNet as a sandbox for studying self-similarity by means of novel\ntopological and geometrical complexity measures. CantorNet constitutes a family\nof ReLU neural networks that spans the whole spectrum of possible Kolmogorov\ncomplexities, including the two opposite descriptions (linear and exponential\nas measured by the description length). CantorNet's decision boundaries can be\narbitrarily ragged, yet are analytically known. 
Besides serving as a testing\nground for complexity measures, our work may serve to illustrate potential\npitfalls in geometry-ignorant data augmentation techniques and adversarial\nattacks.\n","authors":["Michal Lewandowski","Hamid Eghbalzadeh","Bernhard A. Moser"],"pdf_url":"https://arxiv.org/pdf/2411.19713v2.pdf","comment":"Accepted at the NeurIPS Workshop on Symmetry and Geometry in Neural\n Representations, 2024"},{"id":"http://arxiv.org/abs/2411.19943v2","updated":"2024-12-02T06:26:38Z","published":"2024-11-29T18:58:22Z","title":"Critical Tokens Matter: Token-Level Contrastive Estimation Enhances\n LLM's Reasoning Capability","summary":" Large Language Models (LLMs) have exhibited remarkable performance on\nreasoning tasks. They utilize autoregressive token generation to construct\nreasoning trajectories, enabling the development of a coherent chain of\nthought. In this work, we explore the impact of individual tokens on the final\noutcomes of reasoning tasks. We identify the existence of ``critical tokens''\nthat lead to incorrect reasoning trajectories in LLMs. Specifically, we find\nthat LLMs tend to produce positive outcomes when forced to decode other tokens\ninstead of critical tokens. Motivated by this observation, we propose a novel\napproach - cDPO - designed to automatically recognize and conduct token-level\nrewards for the critical tokens during the alignment process. Specifically, we\ndevelop a contrastive estimation approach to automatically identify critical\ntokens. It is achieved by comparing the generation likelihood of positive and\nnegative models. To achieve this, we separately fine-tune the positive and\nnegative models on various reasoning trajectories, consequently, they are\ncapable of identifying identify critical tokens within incorrect trajectories\nthat contribute to erroneous outcomes. 
Moreover, to further align the model\nwith the critical token information during the alignment process, we extend the\nconventional DPO algorithms to token-level DPO and utilize the differential\nlikelihood from the aforementioned positive and negative model as important\nweight for token-level DPO learning.Experimental results on GSM8K and MATH500\nbenchmarks with two-widely used models Llama-3 (8B and 70B) and deepseek-math\n(7B) demonstrate the effectiveness of the propsoed approach cDPO.\n","authors":["Zicheng Lin","Tian Liang","Jiahao Xu","Xing Wang","Ruilin Luo","Chufan Shi","Siheng Li","Yujiu Yang","Zhaopeng Tu"],"pdf_url":"https://arxiv.org/pdf/2411.19943v2.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2309.10625v4","updated":"2024-12-02T04:10:31Z","published":"2023-09-19T14:04:04Z","title":"NoisyNN: Exploring the Impact of Information Entropy Change in Learning\n Systems","summary":" We investigate the impact of entropy change in deep learning systems by noise\ninjection at different levels, including the embedding space and the image. The\nseries of models that employ our methodology are collectively known as Noisy\nNeural Networks (NoisyNN), with examples such as NoisyViT and NoisyCNN. Noise\nis conventionally viewed as a harmful perturbation in various deep learning\narchitectures, such as convolutional neural networks (CNNs) and vision\ntransformers (ViTs), as well as different learning tasks like image\nclassification and transfer learning. However, this work shows noise can be an\neffective way to change the entropy of the learning system. We demonstrate that\nspecific noise can boost the performance of various deep models under certain\nconditions. We theoretically prove the enhancement gained from positive noise\nby reducing the task complexity defined by information entropy and\nexperimentally show the significant performance gain in large image datasets,\nsuch as the ImageNet. 
Herein, we use the information entropy to define the\ncomplexity of the task. We categorize the noise into two types, positive noise\n(PN) and harmful noise (HN), based on whether the noise can help reduce the\ntask complexity. Extensive experiments of CNNs and ViTs have shown performance\nimprovements by proactively injecting positive noise, where we achieved an\nunprecedented top 1 accuracy of 95$\\%$ on ImageNet. Both theoretical analysis\nand empirical evidence have confirmed that the presence of positive noise, can\nbenefit the learning process, while the traditionally perceived harmful noise\nindeed impairs deep learning models. The different roles of noise offer new\nexplanations for deep models on specific tasks and provide a new paradigm for\nimproving model performance. Moreover, it reminds us that we can influence the\nperformance of learning systems via information entropy change.\n","authors":["Xiaowei Yu","Zhe Huang","Minheng Chen","Yao Xue","Tianming Liu","Dajiang Zhu"],"pdf_url":"https://arxiv.org/pdf/2309.10625v4.pdf","comment":"Task Entropy, NoisyViT, NoisyCNN"},{"id":"http://arxiv.org/abs/2406.12336v2","updated":"2024-12-02T04:08:49Z","published":"2024-06-18T07:03:34Z","title":"Towards Understanding Domain Adapted Sentence Embeddings for Document\n Retrieval","summary":" A plethora of sentence embedding models makes it challenging to choose one,\nespecially for technical domains rich with specialized vocabulary. In this\nwork, we domain adapt embeddings using telecom, health and science datasets for\nquestion answering. We evaluate embeddings obtained from publicly available\nmodels and their domain-adapted variants, on both point retrieval accuracies,\nas well as their (95\\%) confidence intervals. We establish a systematic method\nto obtain thresholds for similarity scores for different embeddings. As\nexpected, we observe that fine-tuning improves mean bootstrapped accuracies. 
We\nalso observe that it results in tighter confidence intervals, which further\nimprove when pre-training is preceded by fine-tuning. We introduce metrics\nwhich measure the distributional overlaps of top-$K$, correct and random\ndocument similarities with the question. Further, we show that these metrics\nare correlated with retrieval accuracy and similarity thresholds. Recent\nliterature shows conflicting effects of isotropy on retrieval accuracies. Our\nexperiments establish that the isotropy of embeddings (as measured by two\nindependent state-of-the-art isotropy metric definitions) is poorly correlated\nwith retrieval performance. We show that embeddings for domain-specific\nsentences have little overlap with those for domain-agnostic ones, and\nfine-tuning moves them further apart. Based on our results, we provide\nrecommendations for use of our methodology and metrics by researchers and\npractitioners.\n","authors":["Sujoy Roychowdhury","Sumit Soman","H. G. Ranjani","Vansh Chhabra","Neeraj Gunda","Shashank Gautam","Subhadip Bandyopadhyay","Sai Krishna Bala"],"pdf_url":"https://arxiv.org/pdf/2406.12336v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.00769v2","updated":"2024-12-02T03:54:23Z","published":"2024-11-01T17:59:17Z","title":"GameGen-X: Interactive Open-world Game Video Generation","summary":" We introduce GameGen-X, the first diffusion transformer model specifically\ndesigned for both generating and interactively controlling open-world game\nvideos. This model facilitates high-quality, open-domain generation by\nsimulating an extensive array of game engine features, such as innovative\ncharacters, dynamic environments, complex actions, and diverse events.\nAdditionally, it provides interactive controllability, predicting and altering\nfuture content based on the current clip, thus allowing for gameplay\nsimulation. To realize this vision, we first collected and built an Open-World\nVideo Game Dataset from scratch. 
It is the first and largest dataset for\nopen-world game video generation and control, which comprises over a million\ndiverse gameplay video clips sampling from over 150 games with informative\ncaptions from GPT-4o. GameGen-X undergoes a two-stage training process,\nconsisting of foundation model pre-training and instruction tuning. Firstly,\nthe model was pre-trained via text-to-video generation and video continuation,\nendowing it with the capability for long-sequence, high-quality open-domain\ngame video generation. Further, to achieve interactive controllability, we\ndesigned InstructNet to incorporate game-related multi-modal control signal\nexperts. This allows the model to adjust latent representations based on user\ninputs, unifying character interaction and scene content control for the first\ntime in video generation. During instruction tuning, only the InstructNet is\nupdated while the pre-trained foundation model is frozen, enabling the\nintegration of interactive controllability without loss of diversity and\nquality of generated video content.\n","authors":["Haoxuan Che","Xuanhua He","Quande Liu","Cheng Jin","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2411.00769v2.pdf","comment":"Homepage: https://gamegen-x.github.io/ Github:\n https://github.com/GameGen-X/GameGen-X"},{"id":"http://arxiv.org/abs/2408.15950v2","updated":"2024-12-02T03:48:43Z","published":"2024-08-28T17:08:56Z","title":"Atari-GPT: Benchmarking Multimodal Large Language Models as Low-Level\n Policies in Atari Games","summary":" Recent advancements in large language models (LLMs) have expanded their\ncapabilities beyond traditional text-based tasks to multimodal domains,\nintegrating visual, auditory, and textual data. While multimodal LLMs have been\nextensively explored for high-level planning in domains like robotics and\ngames, their potential as low-level controllers remains largely untapped. 
In\nthis paper, we introduce a novel benchmark aimed at testing the emergent\ncapabilities of multimodal LLMs as low-level policies in Atari games. Unlike\ntraditional reinforcement learning (RL) methods that require training for each\nnew environment and reward function specification, these LLMs utilize\npre-existing multimodal knowledge to directly engage with game environments.\nOur study assesses the performances of multiple multimodal LLMs against\ntraditional RL agents, human players, and random agents, focusing on their\nability to understand and interact with complex visual scenes and formulate\nstrategic responses. Our results show that these multimodal LLMs are not yet\ncapable of being zero-shot low-level policies. Furthermore, we see that this\nis, in part, due to their visual and spatial reasoning. Additional results and\nvideos are available on our project webpage:\nhttps://dev1nw.github.io/atari-gpt/.\n","authors":["Nicholas R. Waytowich","Devin White","MD Sunbeam","Vinicius G. Goecks"],"pdf_url":"https://arxiv.org/pdf/2408.15950v2.pdf","comment":"Currently under review"},{"id":"http://arxiv.org/abs/2411.19527v2","updated":"2024-12-02T03:34:45Z","published":"2024-11-29T07:54:56Z","title":"DisCoRD: Discrete Tokens to Continuous Motion via Rectified Flow\n Decoding","summary":" Human motion, inherently continuous and dynamic, presents significant\nchallenges for generative models. Despite their dominance, discrete\nquantization methods, such as VQ-VAEs, suffer from inherent limitations,\nincluding restricted expressiveness and frame-wise noise artifacts. Continuous\napproaches, while producing smoother and more natural motions, often falter due\nto high-dimensional complexity and limited training data. 
To resolve this\n\"discord\" between discrete and continuous representations, we introduce\nDisCoRD: Discrete Tokens to Continuous Motion via Rectified Flow Decoding, a\nnovel method that decodes discrete motion tokens into continuous motion through\nrectified flow. By employing an iterative refinement process in the continuous\nspace, DisCoRD captures fine-grained dynamics and ensures smoother and more\nnatural motions. Compatible with any discrete-based framework, our method\nenhances naturalness without compromising faithfulness to the conditioning\nsignals. Extensive evaluations demonstrate that DisCoRD achieves\nstate-of-the-art performance, with FID of 0.032 on HumanML3D and 0.169 on\nKIT-ML. These results solidify DisCoRD as a robust solution for bridging the\ndivide between discrete efficiency and continuous realism. Our project page is\navailable at: https://whwjdqls.github.io/discord.github.io/.\n","authors":["Jungbin Cho","Junwan Kim","Jisoo Kim","Minseo Kim","Mingu Kang","Sungeun Hong","Tae-Hyun Oh","Youngjae Yu"],"pdf_url":"https://arxiv.org/pdf/2411.19527v2.pdf","comment":"20 pages 18 figures"},{"id":"http://arxiv.org/abs/2303.12307v7","updated":"2024-12-02T03:18:41Z","published":"2023-03-22T04:49:23Z","title":"Predicting and Enhancing the Fairness of DNNs with the Curvature of\n Perceptual Manifolds","summary":" To address the challenges of long-tailed classification, researchers have\nproposed several approaches to reduce model bias, most of which assume that\nclasses with few samples are weak classes. However, recent studies have shown\nthat tail classes are not always hard to learn, and model bias has been\nobserved on sample-balanced datasets, suggesting the existence of other factors\nthat affect model bias. In this work, we first establish a geometric\nperspective for analyzing model fairness and then systematically propose a\nseries of geometric measurements for perceptual manifolds in deep neural\nnetworks. 
Subsequently, we comprehensively explore the effect of the geometric\ncharacteristics of perceptual manifolds on classification difficulty and how\nlearning shapes the geometric characteristics of perceptual manifolds. An\nunanticipated finding is that the correlation between the class accuracy and\nthe separation degree of perceptual manifolds gradually decreases during\ntraining, while the negative correlation with the curvature gradually\nincreases, implying that curvature imbalance leads to model bias.Building upon\nthese observations, we propose curvature regularization to facilitate the model\nto learn curvature-balanced and flatter perceptual manifolds. Evaluations on\nmultiple long-tailed and non-long-tailed datasets show the excellent\nperformance and exciting generality of our approach, especially in achieving\nsignificant performance improvements based on current state-of-the-art\ntechniques. Our work opens up a geometric analysis perspective on model bias\nand reminds researchers to pay attention to model bias on non-long-tailed and\neven sample-balanced datasets.\n","authors":["Yanbiao Ma","Licheng Jiao","Fang Liu","Maoji Wen","Lingling Li","Wenping Ma","Shuyuan Yang","Xu Liu","Puhua Chen"],"pdf_url":"https://arxiv.org/pdf/2303.12307v7.pdf","comment":"17pages, Accepted by CVPR 2023, Submitted to TPAMI"},{"id":"http://arxiv.org/abs/2409.01345v3","updated":"2024-12-02T03:10:37Z","published":"2024-09-02T15:58:27Z","title":"Language Models Benefit from Preparation with Elicited Knowledge","summary":" The zero-shot chain of thought (CoT) approach is often used in question\nanswering (QA) by language models (LMs) for tasks that require multiple\nreasoning steps. However, some QA tasks hinge more on accessing relevant\nknowledge than on chaining reasoning steps. 
We introduce a simple prompting\ntechnique, called PREP, that involves using two instances of LMs: the first\n(LM1) generates relevant information, and the second (LM2) receives the\ninformation from the user and answers the question. This design is intended to\nmake better use of the LM's instruction-following capability. PREP is\napplicable across various QA tasks without domain-specific prompt engineering.\nPREP is developed on a dataset of 100 QA questions, derived from an extensive\nschematic dataset specifying artifact parts and material composition. These\nquestions ask which of two artifacts is less likely to share materials with\nanother artifact. Such questions probe the LM's knowledge of shared materials\nin the part structure of different artifacts. We test our method on our\nparts-and-materials dataset and three published commonsense reasoning datasets.\nThe average accuracy of our method is consistently higher than that of all the\nother tested methods across all the tested datasets.\n","authors":["Jiacan Yu","Hannah An","Lenhart K. Schubert"],"pdf_url":"https://arxiv.org/pdf/2409.01345v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.17912v2","updated":"2024-12-02T02:42:05Z","published":"2024-11-26T22:06:39Z","title":"Can LLMs plan paths in the real world?","summary":" As large language models (LLMs) increasingly integrate into vehicle\nnavigation systems, understanding their path-planning capability is crucial. We\ntested three LLMs through six real-world path-planning scenarios in various\nsettings and with various difficulties. Our experiments showed that all LLMs\nmade numerous errors in all scenarios, revealing that they are unreliable path\nplanners. We suggest that future work focus on implementing mechanisms for\nreality checks, enhancing model transparency, and developing smaller models.\n","authors":["Wanyi Chen","Meng-Wen Su","Nafisa Mehjabin","Mary L. 
Cummings"],"pdf_url":"https://arxiv.org/pdf/2411.17912v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02326v2","updated":"2024-12-02T01:59:30Z","published":"2024-04-23T18:55:49Z","title":"Evaluating LLMs for Hardware Design and Test","summary":" Large Language Models (LLMs) have demonstrated capabilities for producing\ncode in Hardware Description Languages (HDLs). However, most of the focus\nremains on their abilities to write functional code, not test code. The\nhardware design process consists of both design and test, and so eschewing\nvalidation and verification leaves considerable potential benefit unexplored,\ngiven that a design and test framework may allow for progress towards full\nautomation of the digital design pipeline. In this work, we perform one of the\nfirst studies exploring how a LLM can both design and test hardware modules\nfrom provided specifications. Using a suite of 8 representative benchmarks, we\nexamined the capabilities and limitations of the state-of-the-art\nconversational LLMs when producing Verilog for functional and verification\npurposes. We taped out the benchmarks on a Skywater 130nm shuttle and received\nthe functional chip.\n","authors":["Jason Blocklove","Siddharth Garg","Ramesh Karri","Hammond Pearce"],"pdf_url":"https://arxiv.org/pdf/2405.02326v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04374v2","updated":"2024-12-02T01:38:05Z","published":"2023-12-07T15:44:56Z","title":"Deep Dynamics: Vehicle Dynamics Modeling with a Physics-Constrained\n Neural Network for Autonomous Racing","summary":" Autonomous racing is a critical research area for autonomous driving,\npresenting significant challenges in vehicle dynamics modeling, such as\nbalancing model precision and computational efficiency at high speeds\n(>280km/h), where minor errors in modeling have severe consequences. 
Existing\nphysics-based models for vehicle dynamics require elaborate testing setups and\ntuning, which are hard to implement, time-intensive, and cost-prohibitive.\nConversely, purely data-driven approaches do not generalize well and cannot\nadequately ensure physical constraints on predictions. This paper introduces\nDeep Dynamics, a physics-constrained neural network (PCNN) for vehicle dynamics\nmodeling of an autonomous racecar. It combines physics coefficient estimation\nand dynamical equations to accurately predict vehicle states at high speeds and\nincludes a unique Physics Guard layer to ensure internal coefficient estimates\nremain within their nominal physical ranges. Open-loop and closed-loop\nperformance assessments, using a physics-based simulator and full-scale\nautonomous Indy racecar data, highlight Deep Dynamics as a promising approach\nfor modeling racecar vehicle dynamics.\n","authors":["John Chrosniak","Jingyun Ning","Madhur Behl"],"pdf_url":"https://arxiv.org/pdf/2312.04374v2.pdf","comment":"Published in the IEEE Robotics and Automation Letters and presented\n at the IEEE International Conference on Intelligent Robots and Systems"},{"id":"http://arxiv.org/abs/2402.01401v4","updated":"2024-12-02T00:03:53Z","published":"2024-02-02T13:33:30Z","title":"An Information Theoretic Approach to Machine Unlearning","summary":" To comply with AI and data regulations, the need to forget private or\ncopyrighted information from trained machine learning models is increasingly\nimportant. The key challenge in unlearning is forgetting the necessary data in\na timely manner, while preserving model performance. In this work, we address\nthe zero-shot unlearning scenario, whereby an unlearning algorithm must be able\nto remove data given only a trained model and the data to be forgotten. 
We\nexplore unlearning from an information theoretic perspective, connecting the\ninfluence of a sample to the information gain a model receives by observing it.\nFrom this, we derive a simple but principled zero-shot unlearning method based\non the geometry of the model. Our approach takes the form of minimising the\ngradient of a learned function with respect to a small neighbourhood around a\ntarget forget point. This induces a smoothing effect, causing forgetting by\nmoving the boundary of the classifier. We explore the intuition behind why this\napproach can jointly unlearn forget samples while preserving general model\nperformance through a series of low-dimensional experiments. We perform\nextensive empirical evaluation of our method over a range of contemporary\nbenchmarks, verifying that our method is competitive with state-of-the-art\nperformance under the strict constraints of zero-shot unlearning. Code for the\nproject can be found at\nhttps://github.com/jwf40/Information-Theoretic-Unlearning\n","authors":["Jack Foster","Kyle Fogarty","Stefan Schoepf","Zack Dugue","Cengiz Öztireli","Alexandra Brintrup"],"pdf_url":"https://arxiv.org/pdf/2402.01401v4.pdf","comment":"Updated, new low-dimensional experiments and updated perspective on\n unlearning from an information theoretic view"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2410.16208v3","updated":"2024-12-02T18:59:28Z","published":"2024-10-21T17:11:21Z","title":"Compute-Constrained Data Selection","summary":" Data selection can reduce the amount of training data needed to finetune\nLLMs; however, the efficacy of data selection scales directly with its compute.\nMotivated by the practical challenge of compute-constrained finetuning, we\nconsider the setting in which both the cost of selecting data and training are\nbudgeted for. 
We first formalize the problem of data selection with a\ncost-aware utility function, and model the data selection problem as trading\noff initial-selection cost for training gain. We run a comprehensive sweep of\nexperiments across multiple tasks, varying compute budget by scaling finetuning\ntokens, model sizes, and data selection compute. Interestingly we find that\nmany powerful data selection methods are almost never compute-optimal, and that\ncheaper data selection alternatives dominate both from a theoretical and\nempirical perspective. For compute-optimal training, we find that perplexity\nand gradient data selection require training-to-selection model size ratios of\n5x and 10x, respectively.\n","authors":["Junjie Oscar Yin","Alexander M. Rush"],"pdf_url":"https://arxiv.org/pdf/2410.16208v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07978v3","updated":"2024-12-02T18:58:18Z","published":"2024-11-12T17:58:34Z","title":"A Note on Doubly Robust Estimator in Regression Continuity Designs","summary":" This note introduces a doubly robust (DR) estimator for regression\ndiscontinuity (RD) designs. RD designs provide a quasi-experimental framework\nfor estimating treatment effects, where treatment assignment depends on whether\na running variable surpasses a predefined cutoff. A common approach in RD\nestimation is the use of nonparametric regression methods, such as local linear\nregression. However, the validity of these methods still relies on the\nconsistency of the nonparametric estimators. In this study, we propose the\nDR-RD estimator, which combines two distinct estimators for the conditional\nexpected outcomes. The primary advantage of the DR-RD estimator lies in its\nability to ensure the consistency of the treatment effect estimation as long as\nat least one of the two estimators is consistent. 
Consequently, our DR-RD\nestimator enhances robustness of treatment effect estimators in RD designs.\n","authors":["Masahiro Kato"],"pdf_url":"https://arxiv.org/pdf/2411.07978v3.pdf","comment":"There is a critical error in the previous submission. We have revised\n the original claim and present a weakened result"},{"id":"http://arxiv.org/abs/2411.17501v2","updated":"2024-12-02T18:54:28Z","published":"2024-11-26T15:13:06Z","title":"Inference Scaling fLaws: The Limits of LLM Resampling with Imperfect\n Verifiers","summary":" Recent research has generated hope that inference scaling could allow weaker\nlanguage models to match or exceed the accuracy of stronger models, such as by\nrepeatedly sampling solutions to a coding problem until it passes unit tests.\nThe central thesis of this paper is that there is no free lunch for inference\nscaling: indefinite accuracy improvement through resampling can only be\nrealized if the \"verifier\" (in this case, a set of unit tests) is perfect. When\nthe verifier is imperfect, as it almost always is in domains such as reasoning\nor coding (for example, unit tests have imperfect coverage), there is a nonzero\nprobability of false positives: incorrect solutions that pass the verifier.\nResampling cannot decrease this probability, so it imposes an upper bound to\nthe accuracy of resampling-based inference scaling even with an infinite\ncompute budget. We find that there is a very strong correlation between the\nmodel's single-sample accuracy (i.e. accuracy without unit tests) and its false\npositive rate on coding benchmarks HumanEval and MBPP, whose unit tests have\nlimited coverage. Therefore, no amount of inference scaling of weaker models\ncan enable them to match the single-sample accuracy of a sufficiently strong\nmodel (Fig. 1a). When we consider that false positives have a negative utility\ncompared to abstaining from producing a solution, it bends the inference\nscaling curve further downward. 
Empirically, we find that the optimal number of\nsamples can be less than 10 under realistic assumptions (Fig. 1b). Finally, we\nshow that beyond accuracy, false positives may have other undesirable\nqualities, such as poor adherence to coding style conventions.\n","authors":["Benedikt Stroebl","Sayash Kapoor","Arvind Narayanan"],"pdf_url":"https://arxiv.org/pdf/2411.17501v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.05248v3","updated":"2024-12-02T18:54:09Z","published":"2023-12-08T18:55:40Z","title":"Topology-Based Reconstruction Prevention for Decentralised Learning","summary":" Decentralised learning has recently gained traction as an alternative to\nfederated learning in which both data and coordination are distributed. To\npreserve the confidentiality of users' data, decentralised learning relies on\ndifferential privacy, multi-party computation, or both. However, running\nmultiple privacy-preserving summations in sequence may allow adversaries to\nperform reconstruction attacks. Current reconstruction countermeasures either\ncannot trivially be adapted to the distributed setting, or add excessive\namounts of noise.\n In this work, we first show that passive honest-but-curious adversaries can\ninfer other users' private data after several privacy-preserving summations.\nFor example, in subgraphs with 18 users, we show that only three passive\nhonest-but-curious adversaries succeed at reconstructing private data 11.0% of\nthe time, requiring an average of 8.8 summations per adversary. The success\nrate depends only on the adversaries' direct neighbourhood, and is independent\nof the size of the full network. 
We consider weak adversaries that do not\ncontrol the graph topology, cannot exploit the summation's inner workings, and\ndo not have auxiliary knowledge; and show that these adversaries can still\ninfer private data.\n We analyse how reconstruction relates to topology and propose the first\ntopology-based decentralised defence against reconstruction attacks. We show\nthat reconstruction requires a number of adversaries linear in the length of\nthe network's shortest cycle. Consequently, exact attacks over\nprivacy-preserving summations are impossible in acyclic networks.\n Our work is a stepping stone for a formal theory of topology-based\ndecentralised reconstruction defences. Such a theory would generalise our\ncountermeasure beyond summation, define confidentiality in terms of entropy,\nand describe the interactions with (topology-aware) differential privacy.\n","authors":["Florine W. Dekker","Zekeriya Erkin","Mauro Conti"],"pdf_url":"https://arxiv.org/pdf/2312.05248v3.pdf","comment":"14 pages, 19 figures, for associated experiment source code see\n doi:10.4121/21572601.v2"},{"id":"http://arxiv.org/abs/2410.09943v2","updated":"2024-12-02T18:39:06Z","published":"2024-10-13T17:55:58Z","title":"Dynamic Estimation of Learning Rates Using a Non-Linear Autoregressive\n Model","summary":" We introduce a new class of adaptive non-linear autoregressive (Nlar) models\nincorporating the concept of momentum, which dynamically estimate both the\nlearning rates and momentum as the number of iterations increases. In our\nmethod, the growth of the gradients is controlled using a scaling (clipping)\nfunction, leading to stable convergence. Within this framework, we propose\nthree distinct estimators for learning rates and provide theoretical proof of\ntheir convergence. We further demonstrate how these estimators underpin the\ndevelopment of effective Nlar optimizers. 
The performance of the proposed\nestimators and optimizers is rigorously evaluated through extensive experiments\nacross several datasets and a reinforcement learning environment. The results\nhighlight two key features of the Nlar optimizers: robust convergence despite\nvariations in underlying parameters, including large initial learning rates,\nand strong adaptability with rapid convergence during the initial epochs.\n","authors":["Ramin Okhrati"],"pdf_url":"https://arxiv.org/pdf/2410.09943v2.pdf","comment":"Typos corrected"},{"id":"http://arxiv.org/abs/2408.00170v2","updated":"2024-12-02T18:37:01Z","published":"2024-07-31T21:43:55Z","title":"CREW: Facilitating Human-AI Teaming Research","summary":" With the increasing deployment of artificial intelligence (AI) technologies,\nthe potential of humans working with AI agents has been growing at a great\nspeed. Human-AI teaming is an important paradigm for studying various aspects\nwhen humans and AI agents work together. The unique aspect of Human-AI teaming\nresearch is the need to jointly study humans and AI agents, demanding\nmultidisciplinary research efforts from machine learning to human-computer\ninteraction, robotics, cognitive science, neuroscience, psychology, social\nscience, and complex systems. However, existing platforms for Human-AI teaming\nresearch are limited, often supporting oversimplified scenarios and a single\ntask, or specifically focusing on either human-teaming research or multi-agent\nAI algorithms. We introduce CREW, a platform to facilitate Human-AI teaming\nresearch in real-time decision-making scenarios and engage collaborations from\nmultiple scientific disciplines, with a strong emphasis on human involvement.\nIt includes pre-built tasks for cognitive studies and Human-AI teaming with\nexpandable potentials from our modular design. Following conventional cognitive\nneuroscience research, CREW also supports multimodal human physiological signal\nrecording for behavior analysis. 
Moreover, CREW benchmarks real-time\nhuman-guided reinforcement learning agents using state-of-the-art algorithms\nand well-tuned baselines. With CREW, we were able to conduct 50 human subject\nstudies within a week to verify the effectiveness of our benchmark.\n","authors":["Lingyu Zhang","Zhengran Ji","Boyuan Chen"],"pdf_url":"https://arxiv.org/pdf/2408.00170v2.pdf","comment":"Our project website is at: http://generalroboticslab.com/CREW"},{"id":"http://arxiv.org/abs/2402.08573v3","updated":"2024-12-02T18:33:58Z","published":"2024-02-13T16:21:18Z","title":"Two Tales of Single-Phase Contrastive Hebbian Learning","summary":" The search for ``biologically plausible'' learning algorithms has converged\non the idea of representing gradients as activity differences. However, most\napproaches require a high degree of synchronization (distinct phases during\nlearning) and introduce substantial computational overhead, which raises doubts\nregarding their biological plausibility as well as their potential utility for\nneuromorphic computing. Furthermore, they commonly rely on applying\ninfinitesimal perturbations (nudges) to output units, which is impractical in\nnoisy environments. Recently it has been shown that by modelling artificial\nneurons as dyads with two oppositely nudged compartments, it is possible for a\nfully local learning algorithm named ``dual propagation'' to bridge the\nperformance gap to backpropagation, without requiring separate learning phases\nor infinitesimal nudging. However, the algorithm has the drawback that its\nnumerical stability relies on symmetric nudging, which may be restrictive in\nbiological and analog implementations. In this work we first provide a solid\nfoundation for the objective underlying the dual propagation method, which also\nreveals a surprising connection with adversarial robustness. 
Second, we\ndemonstrate how dual propagation is related to a particular adjoint state\nmethod, which is stable regardless of asymmetric nudging.\n","authors":["Rasmus Kjær Høier","Christopher Zach"],"pdf_url":"https://arxiv.org/pdf/2402.08573v3.pdf","comment":"ICML 2024; 21 pages"},{"id":"http://arxiv.org/abs/2406.16738v2","updated":"2024-12-02T18:27:02Z","published":"2024-06-24T15:45:20Z","title":"Inducing Group Fairness in Prompt-Based Language Model Decisions","summary":" Classifiers are used throughout industry to enforce policies, ranging from\nthe detection of toxic content to age-appropriate content filtering. While\nthese classifiers serve important functions, it is also essential that they are\nbuilt in ways that minimize unfair biases for users.\n One such fairness consideration is called group fairness, which desires that\ndifferent sub-population of users receive equal treatment. This is a\nwell-studied problem in the context of 'classical' classifiers. However, the\nemergence of prompt-based language model (LM) decision making has created new\nopportunities to solve text-based classification tasks, and the fairness\nproperties of these new classifiers are not yet well understood. Further, the\n`remediation toolkit' is incomplete for LM-based decision makers and little is\nunderstood about how to improve decision maker group fairness while maintaining\nclassifier performance.\n This work sets out to add more tools to that toolbox. We introduce\nadaptations of existing effective approaches from the classical classifier\nfairness to the prompt-based classifier space. We also devise simple methods\nthat take advantage of the new structure of prompt-based decision makers and\noperate at the prompt level. We compare these approaches empirically on real\ndata. 
Our results suggest that adaptations of approaches that are effective for\nclassical classifiers remain effective in the LM-based classifier environment.\nHowever, there is room for further exploration of prompt-based remediation\nmethods (and other remediation methods that take advantage of LM structure).\n","authors":["James Atwood","Nino Scherrer","Preethi Lahoti","Ananth Balashankar","Flavien Prost","Ahmad Beirami"],"pdf_url":"https://arxiv.org/pdf/2406.16738v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.13846v2","updated":"2024-12-02T18:03:51Z","published":"2024-05-22T17:14:03Z","title":"Regression Trees Know Calculus","summary":" Regression trees have emerged as a preeminent tool for solving real-world\nregression problems due to their ability to deal with nonlinearities,\ninteraction effects and sharp discontinuities. In this article, we rather study\nregression trees applied to well-behaved, differentiable functions, and\ndetermine the relationship between node parameters and the local gradient of\nthe function being approximated. We find a simple estimate of the gradient\nwhich can be efficiently computed using quantities exposed by popular tree\nlearning libraries. This allows the tools developed in the context of\ndifferentiable algorithms, like neural nets and Gaussian processes, to be\ndeployed to tree-based models. 
To demonstrate this, we study measures of model\nsensitivity defined in terms of integrals of gradients and demonstrate how to\ncompute them for regression trees using the proposed gradient estimates.\nQuantitative and qualitative numerical experiments reveal the capability of\ngradients estimated by regression trees to improve predictive analysis, solve\ntasks in uncertainty quantification, and provide interpretation of model\nbehavior.\n","authors":["Nathan Wycoff"],"pdf_url":"https://arxiv.org/pdf/2405.13846v2.pdf","comment":"Comments very welcome!"},{"id":"http://arxiv.org/abs/2311.04604v3","updated":"2024-12-02T18:02:53Z","published":"2023-11-08T11:12:27Z","title":"Asynchronous Message-Passing and Zeroth-Order Optimization Based\n Distributed Learning with a Use-Case in Resource Allocation in Communication\n Networks","summary":" Distributed learning and adaptation have received significant interest and\nfound wide-ranging applications in machine learning and signal processing.\nWhile various approaches, such as shared-memory optimization, multi-task\nlearning, and consensus-based learning (e.g., federated learning and learning\nover graphs), focus on optimizing either local costs or a global cost, there\nremains a need for further exploration of their interconnections. This paper\nspecifically focuses on a scenario where agents collaborate towards a common\ntask (i.e., optimizing a global cost equal to aggregated local costs) while\neffectively having distinct individual tasks (i.e., optimizing individual local\nparameters in a local cost). Each agent's actions can potentially impact other\nagents' performance through interactions. Notably, each agent has access to\nonly its local zeroth-order oracle (i.e., cost function value) and shares\nscalar values, rather than gradient vectors, with other agents, leading to\ncommunication bandwidth efficiency and agent privacy. 
Agents employ\nzeroth-order optimization to update their parameters, and the asynchronous\nmessage-passing between them is subject to bounded but possibly random\ncommunication delays. This paper presents theoretical convergence analyses and\nestablishes a convergence rate for nonconvex problems. Furthermore, it\naddresses the relevant use-case of deep learning-based resource allocation in\ncommunication networks and conducts numerical experiments in which agents,\nacting as transmitters, collaboratively train their individual policies to\nmaximize a global reward, e.g., a sum of data rates.\n","authors":["Pourya Behmandpoor","Marc Moonen","Panagiotis Patrinos"],"pdf_url":"https://arxiv.org/pdf/2311.04604v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.24060v5","updated":"2024-12-02T18:00:18Z","published":"2024-10-31T15:57:04Z","title":"Understanding Generalizability of Diffusion Models Requires Rethinking\n the Hidden Gaussian Structure","summary":" In this work, we study the generalizability of diffusion models by looking\ninto the hidden properties of the learned score functions, which are\nessentially a series of deep denoisers trained on various noise levels. We\nobserve that as diffusion models transition from memorization to\ngeneralization, their corresponding nonlinear diffusion denoisers exhibit\nincreasing linearity. This discovery leads us to investigate the linear\ncounterparts of the nonlinear diffusion models, which are a series of linear\nmodels trained to match the function mappings of the nonlinear diffusion\ndenoisers. Surprisingly, these linear denoisers are approximately the optimal\ndenoisers for a multivariate Gaussian distribution characterized by the\nempirical mean and covariance of the training dataset. This finding implies\nthat diffusion models have the inductive bias towards capturing and utilizing\nthe Gaussian structure (covariance information) of the training dataset for\ndata generation. 
We empirically demonstrate that this inductive bias is a\nunique property of diffusion models in the generalization regime, which becomes\nincreasingly evident when the model's capacity is relatively small compared to\nthe training dataset size. In the case that the model is highly\noverparameterized, this inductive bias emerges during the initial training\nphases before the model fully memorizes its training data. Our study provides\ncrucial insights into understanding the notable strong generalization\nphenomenon recently observed in real-world diffusion models.\n","authors":["Xiang Li","Yixiang Dai","Qing Qu"],"pdf_url":"https://arxiv.org/pdf/2410.24060v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.15098v3","updated":"2024-12-02T17:59:40Z","published":"2024-11-22T17:55:15Z","title":"OminiControl: Minimal and Universal Control for Diffusion Transformer","summary":" In this paper, we introduce OminiControl, a highly versatile and\nparameter-efficient framework that integrates image conditions into pre-trained\nDiffusion Transformer (DiT) models. At its core, OminiControl leverages a\nparameter reuse mechanism, enabling the DiT to encode image conditions using\nitself as a powerful backbone and process them with its flexible multi-modal\nattention processors. Unlike existing methods, which rely heavily on additional\nencoder modules with complex architectures, OminiControl (1) effectively and\nefficiently incorporates injected image conditions with only ~0.1% additional\nparameters, and (2) addresses a wide range of image conditioning tasks in a\nunified manner, including subject-driven generation and spatially-aligned\nconditions such as edges, depth, and more. Remarkably, these capabilities are\nachieved by training on images generated by the DiT itself, which is\nparticularly beneficial for subject-driven generation. 
Extensive evaluations\ndemonstrate that OminiControl outperforms existing UNet-based and DiT-adapted\nmodels in both subject-driven and spatially-aligned conditional generation.\nAdditionally, we release our training dataset, Subjects200K, a diverse\ncollection of over 200,000 identity-consistent images, along with an efficient\ndata synthesis pipeline to advance research in subject-consistent generation.\n","authors":["Zhenxiong Tan","Songhua Liu","Xingyi Yang","Qiaochu Xue","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2411.15098v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.17593v3","updated":"2024-12-02T17:43:20Z","published":"2024-11-26T17:01:27Z","title":"What Differentiates Educational Literature? A Multimodal Fusion Approach\n of Transformers and Computational Linguistics","summary":" The integration of new literature into the English curriculum remains a\nchallenge since educators often lack scalable tools to rapidly evaluate\nreadability and adapt texts for diverse classroom needs. This study proposes to\naddress this gap through a multimodal approach that combines transformer-based\ntext classification with linguistic feature analysis to align texts with UK Key\nStages. Eight state-of-the-art Transformers were fine-tuned on segmented text\ndata, with BERT achieving the highest unimodal F1 score of 0.75. In parallel,\n500 deep neural network topologies were searched for the classification of\nlinguistic characteristics, achieving an F1 score of 0.392. The fusion of these\nmodalities shows a significant improvement, with every multimodal approach\noutperforming all unimodal models. In particular, the ELECTRA Transformer fused\nwith the neural network achieved an F1 score of 0.996. Unimodal and multimodal\napproaches are shown to have statistically significant differences in all\nvalidation metrics (accuracy, precision, recall, F1 score) except for inference\ntime. 
The proposed approach is finally encapsulated in a stakeholder-facing web\napplication, providing non-technical stakeholder access to real-time insights\non text complexity, reading difficulty, curriculum alignment, and\nrecommendations for learning age range. The application empowers data-driven\ndecision making and reduces manual workload by integrating AI-based\nrecommendations into lesson planning for English literature.\n","authors":["Jordan J. Bird"],"pdf_url":"https://arxiv.org/pdf/2411.17593v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.14973v2","updated":"2024-12-02T17:35:07Z","published":"2024-01-26T16:06:01Z","title":"Discovering group dynamics in coordinated time series via hierarchical\n recurrent switching-state models","summary":" We seek a computationally efficient model for a collection of time series\narising from multiple interacting entities (a.k.a. \"agents\"). Recent models of\nspatiotemporal patterns across individuals fail to incorporate explicit\nsystem-level collective behavior that can influence the trajectories of\nindividual entities. To address this gap in the literature, we present a new\nhierarchical switching-state model that can be trained in an unsupervised\nfashion to simultaneously learn both system-level and individual-level\ndynamics. We employ a latent system-level discrete state Markov chain that\nprovides top-down influence on latent entity-level chains which in turn govern\nthe emission of each observed time series. Recurrent feedback from the\nobservations to the latent chains at both entity and system levels allows\nrecent situational context to inform how dynamics unfold at all levels in\nbottom-up fashion. We hypothesize that including both top-down and bottom-up\ninfluences on group dynamics will improve interpretability of the learned\ndynamics and reduce error when forecasting. 
Our hierarchical switching\nrecurrent dynamical model can be learned via closed-form variational coordinate\nascent updates to all latent chains that scale linearly in the number of\nentities. This is asymptotically no more costly than fitting a separate model\nfor each entity. Analysis of both synthetic data and real basketball team\nmovements suggests our lean parametric model can achieve competitive forecasts\ncompared to larger neural network models that require far more computational\nresources. Further experiments on soldier data as well as a synthetic task with\n64 cooperating entities show how our approach can yield interpretable insights\nabout team dynamics over time.\n","authors":["Michael T. Wojnowicz","Kaitlin Gili","Preetish Rath","Eric Miller","Jeffrey Miller","Clifford Hancock","Meghan O'Donovan","Seth Elkin-Frankston","Tad T. Brunyé","Michael C. Hughes"],"pdf_url":"https://arxiv.org/pdf/2401.14973v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.17644v4","updated":"2024-12-02T17:12:54Z","published":"2024-04-26T18:08:15Z","title":"A Conditional Independence Test in the Presence of Discretization","summary":" Testing conditional independence has many applications, such as in Bayesian\nnetwork learning and causal discovery. Different test methods have been\nproposed. However, existing methods generally can not work when only\ndiscretized observations are available. Specifically, consider $X_1$,\n$\\tilde{X}_2$ and $X_3$ are observed variables, where $\\tilde{X}_2$ is a\ndiscretization of latent variables $X_2$. Applying existing test methods to the\nobservations of $X_1$, $\\tilde{X}_2$ and $X_3$ can lead to a false conclusion\nabout the underlying conditional independence of variables $X_1$, $X_2$ and\n$X_3$. Motivated by this, we propose a conditional independence test\nspecifically designed to accommodate the presence of such discretization. 
To\nachieve this, we design the bridge equations to recover the parameter\nreflecting the statistical information of the underlying latent continuous\nvariables. An appropriate test statistic and its asymptotic distribution under\nthe null hypothesis of conditional independence have also been derived. Both\ntheoretical results and empirical validation have been provided, demonstrating\nthe effectiveness of our test methods.\n","authors":["Boyang Sun","Yu Yao","Huangyuan Hao","Yumou Qiu","Kun Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.17644v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07118v3","updated":"2024-12-02T17:11:07Z","published":"2024-11-11T16:45:18Z","title":"ConvMixFormer- A Resource-efficient Convolution Mixer for\n Transformer-based Dynamic Hand Gesture Recognition","summary":" Transformer models have demonstrated remarkable success in many domains such\nas natural language processing (NLP) and computer vision. With the growing\ninterest in transformer-based architectures, they are now utilized for gesture\nrecognition. So, we also explore and devise a novel ConvMixFormer architecture\nfor dynamic hand gestures. The transformers use quadratic scaling of the\nattention features with the sequential data, due to which these models are\ncomputationally complex and heavy. We have considered this drawback of the\ntransformer and designed a resource-efficient model that replaces the\nself-attention in the transformer with the simple convolutional layer-based\ntoken mixer. The computational cost and the parameters used for the\nconvolution-based mixer are comparatively less than the quadratic\nself-attention. Convolution-mixer helps the model capture the local spatial\nfeatures that self-attention struggles to capture due to their sequential\nprocessing nature. 
Further, an efficient gate mechanism is employed instead of\na conventional feed-forward network in the transformer to help the model\ncontrol the flow of features within different stages of the proposed model.\nThis design uses fewer learnable parameters which is nearly half the vanilla\ntransformer that helps in fast and efficient training. The proposed method is\nevaluated on NVidia Dynamic Hand Gesture and Briareo datasets and our model has\nachieved state-of-the-art results on single and multimodal inputs. We have also\nshown the parameter efficiency of the proposed ConvMixFormer model compared to\nother methods. The source code is available at\nhttps://github.com/mallikagarg/ConvMixFormer.\n","authors":["Mallika Garg","Debashis Ghosh","Pyari Mohan Pradhan"],"pdf_url":"https://arxiv.org/pdf/2411.07118v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.17251v2","updated":"2024-12-02T16:37:41Z","published":"2024-11-26T09:29:27Z","title":"DGNN-YOLO: Dynamic Graph Neural Networks with YOLO11 for Small Object\n Detection and Tracking in Traffic Surveillance","summary":" Accurate detection and tracking of small objects such as pedestrians,\ncyclists, and motorbikes are critical for traffic surveillance systems, which\nare crucial in improving road safety and decision-making in intelligent\ntransportation systems. However, traditional methods struggle with challenges\nsuch as occlusion, low resolution, and dynamic traffic conditions,\nnecessitating innovative approaches to address these limitations. This paper\nintroduces DGNN-YOLO, a novel framework integrating dynamic graph neural\nnetworks (DGNN) with YOLO11 to enhance small object detection and tracking in\ntraffic surveillance systems. The framework leverages YOLO11's advanced spatial\nfeature extraction capabilities for precise object detection and incorporates\nDGNN to model spatial-temporal relationships for robust real-time tracking\ndynamically. 
By constructing and updating graph structures, DGNN-YOLO\neffectively represents objects as nodes and their interactions as edges,\nensuring adaptive and accurate tracking in complex and dynamic environments.\nExtensive experiments demonstrate that DGNN-YOLO consistently outperforms\nstate-of-the-art methods in detecting and tracking small objects under diverse\ntraffic conditions, achieving the highest precision (0.8382), recall (0.6875),\nand mAP@0.5:0.95 (0.6476), showcasing its robustness and scalability,\nparticularly in challenging scenarios involving small and occluded objects.\nThis work provides a scalable, real-time traffic surveillance and analysis\nsolution, significantly contributing to intelligent transportation systems.\n","authors":["Shahriar Soudeep","M. F. Mridha","Md Abrar Jahin","Nilanjan Dey"],"pdf_url":"https://arxiv.org/pdf/2411.17251v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.17311v3","updated":"2024-12-02T16:29:47Z","published":"2024-05-27T16:11:49Z","title":"Probabilistic Graph Rewiring via Virtual Nodes","summary":" Message-passing graph neural networks (MPNNs) have emerged as a powerful\nparadigm for graph-based machine learning. Despite their effectiveness, MPNNs\nface challenges such as under-reaching and over-squashing, where limited\nreceptive fields and structural bottlenecks hinder information flow in the\ngraph. While graph transformers hold promise in addressing these issues, their\nscalability is limited due to quadratic complexity regarding the number of\nnodes, rendering them impractical for larger graphs. Here, we propose\nimplicitly rewired message-passing neural networks (IPR-MPNNs), a novel\napproach that integrates implicit probabilistic graph rewiring into MPNNs. 
By\nintroducing a small number of virtual nodes, i.e., adding additional nodes to a\ngiven graph and connecting them to existing nodes, in a differentiable,\nend-to-end manner, IPR-MPNNs enable long-distance message propagation,\ncircumventing quadratic complexity. Theoretically, we demonstrate that\nIPR-MPNNs surpass the expressiveness of traditional MPNNs. Empirically, we\nvalidate our approach by showcasing its ability to mitigate under-reaching and\nover-squashing effects, achieving state-of-the-art performance across multiple\ngraph datasets. Notably, IPR-MPNNs outperform graph transformers while\nmaintaining significantly faster computational efficiency.\n","authors":["Chendi Qian","Andrei Manolache","Christopher Morris","Mathias Niepert"],"pdf_url":"https://arxiv.org/pdf/2405.17311v3.pdf","comment":"Accepted at 38th Conference on Neural Information Processing Systems\n (NeurIPS 2024), Vancouver, Canada"},{"id":"http://arxiv.org/abs/2409.19839v3","updated":"2024-12-02T16:27:16Z","published":"2024-09-30T00:41:51Z","title":"ForecastBench: A Dynamic Benchmark of AI Forecasting Capabilities","summary":" Forecasts of future events are essential inputs into informed\ndecision-making. Machine learning (ML) systems have the potential to deliver\nforecasts at scale, but there is no framework for evaluating the accuracy of ML\nsystems on a standardized set of forecasting questions. To address this gap, we\nintroduce ForecastBench: a dynamic benchmark that evaluates the accuracy of ML\nsystems on an automatically generated and regularly updated set of 1,000\nforecasting questions. To avoid any possibility of data leakage, ForecastBench\nis comprised solely of questions about future events that have no known answer\nat the time of submission. We quantify the capabilities of current ML systems\nby collecting forecasts from expert (human) forecasters, the general public,\nand LLMs on a random subset of questions from the benchmark ($N=200$). 
While\nLLMs have achieved super-human performance on many benchmarks, they perform\nless well here: expert forecasters outperform the top-performing LLM (p-value\n$<0.01$). We display system and human scores in a public leaderboard at\nwww.forecastbench.org.\n","authors":["Ezra Karger","Houtan Bastani","Chen Yueh-Han","Zachary Jacobs","Danny Halawi","Fred Zhang","Philip E. Tetlock"],"pdf_url":"https://arxiv.org/pdf/2409.19839v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.17339v2","updated":"2024-12-02T16:08:41Z","published":"2024-05-27T16:42:51Z","title":"Physics-Informed Real NVP for Satellite Power System Fault Detection","summary":" The unique challenges posed by the space environment, characterized by\nextreme conditions and limited accessibility, raise the need for robust and\nreliable techniques to identify and prevent satellite faults. Fault detection\nmethods in the space sector are required to ensure mission success and to\nprotect valuable assets. In this context, this paper proposes an Artificial\nIntelligence (AI) based fault detection methodology and evaluates its\nperformance on ADAPT (Advanced Diagnostics and Prognostics Testbed), an\nElectrical Power System (EPS) dataset, crafted in laboratory by NASA. Our study\nfocuses on the application of a physics-informed (PI) real-valued non-volume\npreserving (Real NVP) model for fault detection in space systems. The efficacy\nof this method is systematically compared against other AI approaches such as\nGated Recurrent Unit (GRU) and Autoencoder-based techniques. Results show that\nour physics-informed approach outperforms existing methods of fault detection,\ndemonstrating its suitability for addressing the unique challenges of satellite\nEPS sub-system faults. 
Furthermore, we unveil the competitive advantage of\nphysics-informed loss in AI models to address specific space needs, namely\nrobustness, reliability, and power constraints, crucial for space exploration\nand satellite missions.\n","authors":["Carlo Cena","Umberto Albertin","Mauro Martini","Silvia Bucci","Marcello Chiaberge"],"pdf_url":"https://arxiv.org/pdf/2405.17339v2.pdf","comment":"C. Cena, U. Albertin, M. Martini, S. Bucci and M. Chiaberge,\n \"Physics-Informed Real NVP for Satellite Power System Fault Detection,\" 2024\n IEEE International Conference on Advanced Intelligent Mechatronics (AIM),\n Boston, MA, USA, 2024, pp. 679-684, doi: 10.1109/AIM55361.2024.10636990"},{"id":"http://arxiv.org/abs/2407.02861v2","updated":"2024-12-02T16:04:40Z","published":"2024-07-03T07:19:41Z","title":"A Self-Supervised Task for Fault Detection in Satellite Multivariate\n Time Series","summary":" In the space sector, due to environmental conditions and restricted\naccessibility, robust fault detection methods are imperative for ensuring\nmission success and safeguarding valuable assets. This work proposes a novel\napproach leveraging Physics-Informed Real NVP neural networks, renowned for\ntheir ability to model complex and high-dimensional distributions, augmented\nwith a self-supervised task based on sensors' data permutation. It focuses on\nenhancing fault detection within the satellite multivariate time series. The\nexperiments involve various configurations, including pre-training with\nself-supervision, multi-task learning, and standalone self-supervised training.\nResults indicate significant performance improvements across all settings. In\nparticular, employing only the self-supervised loss yields the best overall\nresults, suggesting its efficacy in guiding the network to extract relevant\nfeatures for fault detection. 
This study presents a promising direction for\nimproving fault detection in space systems and warrants further exploration in\nother datasets and applications.\n","authors":["Carlo Cena","Silvia Bucci","Alessandro Balossino","Marcello Chiaberge"],"pdf_url":"https://arxiv.org/pdf/2407.02861v2.pdf","comment":"SPAICE: AI in and for Space, 2024"},{"id":"http://arxiv.org/abs/2411.04630v2","updated":"2024-12-02T15:47:17Z","published":"2024-11-07T11:29:55Z","title":"Brain Tumour Removing and Missing Modality Generation using 3D WDM","summary":" This paper presents the second-placed solution for task 8 and the\nparticipation solution for task 7 of BraTS 2024. The adoption of automated\nbrain analysis algorithms to support clinical practice is increasing. However,\nmany of these algorithms struggle with the presence of brain lesions or the\nabsence of certain MRI modalities. The alterations in the brain's morphology\nleads to high variability and thus poor performance of predictive models that\nwere trained only on healthy brains. The lack of information that is usually\nprovided by some of the missing MRI modalities also reduces the reliability of\nthe prediction models trained with all modalities. In order to improve the\nperformance of these models, we propose the use of conditional 3D wavelet\ndiffusion models. The wavelet transform enabled full-resolution image training\nand prediction on a GPU with 48 GB VRAM, without patching or downsampling,\npreserving all information for prediction. 
The code for these tasks is\navailable at https://github.com/ShadowTwin41/BraTS_2023_2024_solutions.\n","authors":["André Ferreira","Gijs Luijten","Behrus Puladi","Jens Kleesiek","Victor Alves","Jan Egger"],"pdf_url":"https://arxiv.org/pdf/2411.04630v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.12850v2","updated":"2024-12-02T15:46:35Z","published":"2024-07-08T09:50:49Z","title":"Limits to Predicting Online Speech Using Large Language Models","summary":" We study the predictability of online speech on social media, and whether\npredictability improves with information outside a user's own posts. Recent\ntheoretical results suggest that posts from a user's social circle are as\npredictive of the user's future posts as that of the user's past posts.\nMotivated by the success of large language models, we empirically test this\nhypothesis. We define predictability as a measure of the model's uncertainty,\ni.e., its negative log-likelihood on future tokens given context. As the basis\nof our study, we collect 10M tweets for ``tweet-tuning'' base models and a\nfurther 6.25M posts from more than five thousand X (previously Twitter) users\nand their peers. Across four large language models ranging in size from 1.5\nbillion to 70 billion parameters, we find that predicting a user's posts from\ntheir peers' posts performs poorly. Moreover, the value of the user's own posts\nfor prediction is consistently higher than that of their peers'. We extend our\ninvestigation with a detailed analysis on what's learned in-context and the\nrobustness of our findings. From context, base models learn to correctly\npredict @-mentions and hashtags. Moreover, our results replicate if instead of\nprompting the model with additional context, we finetune on it. Across the\nboard, we find that predicting the posts of individual users remains hard.\n","authors":["Mina Remeli","Moritz Hardt","Robert C. 
Williamson"],"pdf_url":"https://arxiv.org/pdf/2407.12850v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.16867v2","updated":"2024-12-02T15:42:53Z","published":"2024-07-23T22:23:47Z","title":"From Text to Insight: Large Language Models for Materials Science Data\n Extraction","summary":" The vast majority of materials science knowledge exists in unstructured\nnatural language, yet structured data is crucial for innovative and systematic\nmaterials design. Traditionally, the field has relied on manual curation and\npartial automation for data extraction for specific use cases. The advent of\nlarge language models (LLMs) represents a significant shift, potentially\nenabling efficient extraction of structured, actionable data from unstructured\ntext by non-experts. While applying LLMs to materials science data extraction\npresents unique challenges, domain knowledge offers opportunities to guide and\nvalidate LLM outputs. This review provides a comprehensive overview of\nLLM-based structured data extraction in materials science, synthesizing current\nknowledge and outlining future directions. We address the lack of standardized\nguidelines and present frameworks for leveraging the synergy between LLMs and\nmaterials science expertise. This work serves as a foundational resource for\nresearchers aiming to harness LLMs for data-driven materials research. The\ninsights presented here could significantly enhance how researchers across\ndisciplines access and utilize scientific information, potentially accelerating\nthe development of novel materials for critical societal needs.\n","authors":["Mara Schilling-Wilhelmi","Martiño Ríos-García","Sherjeel Shabih","María Victoria Gil","Santiago Miret","Christoph T. Koch","José A. 
Márquez","Kevin Maik Jablonka"],"pdf_url":"https://arxiv.org/pdf/2407.16867v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.06562v2","updated":"2024-12-02T15:32:41Z","published":"2023-12-11T17:46:44Z","title":"On Meta-Prompting","summary":" Modern generative language models are capable of interpreting input strings\nas instructions, or prompts, and carry out tasks based on them. Many approaches\nto prompting and pre-training these models involve the automated generation of\nthese prompts: meta-prompting, or prompting to obtain prompts. We propose a\ntheoretical framework based on category theory to generalize and describe them.\nThis framework is flexible enough to account for stochasticity, and allows us\nto obtain formal results around task agnosticity and equivalence of various\nmeta-prompting approaches. Experimentally, we test our framework in two active\nareas of model research: creativity and ideation. We find that user preference\nstrongly favors (p < 0.01) the prompts generated under meta-prompting, as well\nas their corresponding outputs, over a series of hardcoded baseline prompts\nthat include the original task definition. Using our framework, we argue that\nmeta-prompting is more effective than basic prompting at generating desirable\noutputs.\n","authors":["Adrian de Wynter","Xun Wang","Qilong Gu","Si-Qing Chen"],"pdf_url":"https://arxiv.org/pdf/2312.06562v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2407.01036v2","updated":"2024-12-02T15:31:12Z","published":"2024-07-01T07:40:08Z","title":"Ranking by Lifts: A Cost-Benefit Approach to Large-Scale A/B Tests","summary":" A/B testers that conduct large-scale tests often prioritize lifts as the main\noutcome metric and want to be able to control costs resulting from false\nrejections of the null. This work develops a decision-theoretic framework for\nmaximizing profits subject to false discovery rate (FDR) control. 
We build an\nempirical Bayes solution for the problem via a greedy knapsack approach. We\nderive an oracle rule based on ranking the ratio of expected lifts and the cost\nof wrong rejections using the local false discovery rate (lfdr) statistic. Our\noracle decision rule is valid and optimal for large-scale tests. Further, we\nestablish asymptotic validity for the data-driven procedure and demonstrate\nfinite-sample validity in experimental studies. We also demonstrate the merit\nof the proposed method over other FDR control methods. Finally, we discuss an\napplication to data collected by experiments on the Optimizely platform.\n","authors":["Pallavi Basu","Ron Berman"],"pdf_url":"https://arxiv.org/pdf/2407.01036v2.pdf","comment":"Updated"},{"id":"http://arxiv.org/abs/2411.08085v2","updated":"2024-12-02T15:20:08Z","published":"2024-11-12T16:52:51Z","title":"Deep Learning 2.0: Artificial Neurons That Matter -- Reject Correlation,\n Embrace Orthogonality","summary":" We introduce a yat-product-powered neural network, the Neural Matter Network\n(NMN), a breakthrough in deep learning that achieves non-linear pattern\nrecognition without activation functions. Our key innovation relies on the\nyat-product and yat-product, which naturally induces non-linearity by\nprojecting inputs into a pseudo-metric space, eliminating the need for\ntraditional activation functions while maintaining only a softmax layer for\nfinal class probability distribution. This approach simplifies network\narchitecture and provides unprecedented transparency into the network's\ndecision-making process. Our comprehensive empirical evaluation across\ndifferent datasets demonstrates that NMN consistently outperforms traditional\nMLPs. The results challenge the assumption that separate activation functions\nare necessary for effective deep-learning models. 
The implications of this work\nextend beyond immediate architectural benefits, by eliminating intermediate\nactivation functions while preserving non-linear capabilities, yat-MLP\nestablishes a new paradigm for neural network design that combines simplicity\nwith effectiveness. Most importantly, our approach provides unprecedented\ninsights into the traditionally opaque \"black-box\" nature of neural networks,\noffering a clearer understanding of how these models process and classify\ninformation.\n","authors":["Taha Bouhsine"],"pdf_url":"https://arxiv.org/pdf/2411.08085v2.pdf","comment":"fixed proof, added softermax"},{"id":"http://arxiv.org/abs/2407.08751v2","updated":"2024-12-02T15:16:03Z","published":"2024-06-27T13:47:06Z","title":"Latent Diffusion for Neural Spiking Data","summary":" Modern datasets in neuroscience enable unprecedented inquiries into the\nrelationship between complex behaviors and the activity of many simultaneously\nrecorded neurons. While latent variable models can successfully extract\nlow-dimensional embeddings from such recordings, using them to generate\nrealistic spiking data, especially in a behavior-dependent manner, still poses\na challenge. Here, we present Latent Diffusion for Neural Spiking data (LDNS),\na diffusion-based generative model with a low-dimensional latent space: LDNS\nemploys an autoencoder with structured state-space (S4) layers to project\ndiscrete high-dimensional spiking data into continuous time-aligned latents. On\nthese inferred latents, we train expressive (conditional) diffusion models,\nenabling us to sample neural activity with realistic single-neuron and\npopulation spiking statistics. We validate LDNS on synthetic data, accurately\nrecovering latent structure, firing rates, and spiking statistics. Next, we\ndemonstrate its flexibility by generating variable-length data that mimics\nhuman cortical activity during attempted speech. 
We show how to equip LDNS with\nan expressive observation model that accounts for single-neuron dynamics not\nmediated by the latent state, further increasing the realism of generated\nsamples. Finally, conditional LDNS trained on motor cortical activity during\ndiverse reaching behaviors can generate realistic spiking data given reach\ndirection or unseen reach trajectories. In summary, LDNS simultaneously enables\ninference of low-dimensional latents and realistic conditional generation of\nneural spiking datasets, opening up further possibilities for simulating\nexperimentally testable hypotheses.\n","authors":["Jaivardhan Kapoor","Auguste Schulz","Julius Vetter","Felix Pei","Richard Gao","Jakob H. Macke"],"pdf_url":"https://arxiv.org/pdf/2407.08751v2.pdf","comment":"38th Conference on Neural Information Processing Systems (NeurIPS\n 2024)"},{"id":"http://arxiv.org/abs/2405.04101v2","updated":"2024-12-02T14:54:31Z","published":"2024-05-07T08:15:48Z","title":"Continual Learning in the Presence of Repetition","summary":" Continual learning (CL) provides a framework for training models in\never-evolving environments. Although re-occurrence of previously seen objects\nor tasks is common in real-world problems, the concept of repetition in the\ndata stream is not often considered in standard benchmarks for CL. Unlike with\nthe rehearsal mechanism in buffer-based strategies, where sample repetition is\ncontrolled by the strategy, repetition in the data stream naturally stems from\nthe environment. This report provides a summary of the CLVision challenge at\nCVPR 2023, which focused on the topic of repetition in class-incremental\nlearning. The report initially outlines the challenge objective and then\ndescribes three solutions proposed by finalist teams that aim to effectively\nexploit the repetition in the stream to learn continually. 
The experimental\nresults from the challenge highlight the effectiveness of ensemble-based\nsolutions that employ multiple versions of similar modules, each trained on\ndifferent but overlapping subsets of classes. This report underscores the\ntransformative potential of taking a different perspective in CL by employing\nrepetition in the data stream to foster innovative strategy design.\n","authors":["Hamed Hemati","Lorenzo Pellegrini","Xiaotian Duan","Zixuan Zhao","Fangfang Xia","Marc Masana","Benedikt Tscheschner","Eduardo Veas","Yuxiang Zheng","Shiji Zhao","Shao-Yuan Li","Sheng-Jun Huang","Vincenzo Lomonaco","Gido M. van de Ven"],"pdf_url":"https://arxiv.org/pdf/2405.04101v2.pdf","comment":"Accepted version, to appear in Neural Networks; Challenge Report of\n the 4th Workshop on Continual Learning in Computer Vision at CVPR"},{"id":"http://arxiv.org/abs/2405.09273v7","updated":"2024-12-02T14:49:26Z","published":"2024-05-15T11:42:41Z","title":"Fair Generalized Linear Mixed Models","summary":" When using machine learning for automated prediction, it is important to\naccount for fairness in the prediction. Fairness in machine learning aims to\nensure that biases in the data and model inaccuracies do not lead to\ndiscriminatory decisions. E.g., predictions from fair machine learning models\nshould not discriminate against sensitive variables such as sexual orientation\nand ethnicity. The training data often in obtained from social surveys. In\nsocial surveys, oftentimes the data collection process is a strata sampling,\ne.g. due to cost restrictions. In strata samples, the assumption of\nindependence between the observation is not fulfilled. Hence, if the machine\nlearning models do not account for the strata correlations, the results may be\nbiased. Especially high is the bias in cases where the strata assignment is\ncorrelated to the variable of interest. 
We present in this paper an algorithm\nthat can handle both problems simultaneously, and we demonstrate the impact of\nstratified sampling on the quality of fair machine learning predictions in a\nreproducible simulation study.\n","authors":["Jan Pablo Burgard","João Vitor Pamplona"],"pdf_url":"https://arxiv.org/pdf/2405.09273v7.pdf","comment":"25 pages, 12 figures. arXiv admin note: text overlap with\n arXiv:2405.06433"},{"id":"http://arxiv.org/abs/2405.06433v6","updated":"2024-12-02T14:47:51Z","published":"2024-05-10T12:25:06Z","title":"Fair Mixed Effects Support Vector Machine","summary":" To ensure unbiased and ethical automated predictions, fairness must be a core\nprinciple in machine learning applications. Fairness in machine learning aims\nto mitigate biases present in the training data and model imperfections that\ncould lead to discriminatory outcomes. This is achieved by preventing the model\nfrom making decisions based on sensitive characteristics like ethnicity or\nsexual orientation. A fundamental assumption in machine learning is the\nindependence of observations. However, this assumption often does not hold true\nfor data describing social phenomena, where data points are often clustered\nbased. Hence, if the machine learning models do not account for the cluster\ncorrelations, the results may be biased. Especially high is the bias in cases\nwhere the cluster assignment is correlated to the variable of interest. We\npresent a fair mixed effects support vector machine algorithm that can handle\nboth problems simultaneously. 
With a reproducible simulation study we\ndemonstrate the impact of clustered data on the quality of fair machine\nlearning predictions.\n","authors":["Jan Pablo Burgard","João Vitor Pamplona"],"pdf_url":"https://arxiv.org/pdf/2405.06433v6.pdf","comment":"17 pages, 8 figures"},{"id":"http://arxiv.org/abs/2405.15158v2","updated":"2024-12-02T14:42:34Z","published":"2024-05-24T02:26:45Z","title":"ProtFAD: Introducing function-aware domains as implicit modality towards\n protein function prediction","summary":" Protein function prediction is currently achieved by encoding its sequence or\nstructure, where the sequence-to-function transcendence and high-quality\nstructural data scarcity lead to obvious performance bottlenecks. Protein\ndomains are \"building blocks\" of proteins that are functionally independent,\nand their combinations determine the diverse biological functions. However,\nmost existing studies have yet to thoroughly explore the intricate functional\ninformation contained in the protein domains. To fill this gap, we propose a\nsynergistic integration approach for a function-aware domain representation,\nand a domain-joint contrastive learning strategy to distinguish different\nprotein functions while aligning the modalities. Specifically, we align the\ndomain semantics with GO terms and text description to pre-train domain\nembeddings. Furthermore, we partition proteins into multiple sub-views based on\ncontinuous joint domains for contrastive training under the supervision of a\nnovel triplet InfoNCE loss. Our approach significantly and comprehensively\noutperforms the state-of-the-art methods on various benchmarks, and clearly\ndifferentiates proteins carrying distinct functions compared to the competitor.\nOur implementation is available at\nhttps://github.com/AI-HPC-Research-Team/ProtFAD.\n","authors":["Mingqing Wang","Zhiwei Nie","Yonghong He","Athanasios V. 
Vasilakos","Zhixiang Ren"],"pdf_url":"https://arxiv.org/pdf/2405.15158v2.pdf","comment":"17 pages, 7 figures, 5 tables"},{"id":"http://arxiv.org/abs/2108.11986v2","updated":"2024-12-02T14:25:58Z","published":"2021-08-25T11:45:40Z","title":"Anomaly Detection in Medical Imaging -- A Mini Review","summary":" The increasing digitization of medical imaging enables machine learning based\nimprovements in detecting, visualizing and segmenting lesions, easing the\nworkload for medical experts. However, supervised machine learning requires\nreliable labelled data, which is is often difficult or impossible to collect or\nat least time consuming and thereby costly. Therefore methods requiring only\npartly labeled data (semi-supervised) or no labeling at all (unsupervised\nmethods) have been applied more regularly. Anomaly detection is one possible\nmethodology that is able to leverage semi-supervised and unsupervised methods\nto handle medical imaging tasks like classification and segmentation. This\npaper uses a semi-exhaustive literature review of relevant anomaly detection\npapers in medical imaging to cluster into applications, highlight important\nresults, establish lessons learned and give further advice on how to approach\nanomaly detection in medical imaging. The qualitative analysis is based on\ngoogle scholar and 4 different search terms, resulting in 120 different\nanalysed papers. The main results showed that the current research is mostly\nmotivated by reducing the need for labelled data. Also, the successful and\nsubstantial amount of research in the brain MRI domain shows the potential for\napplications in further domains like OCT and chest X-ray.\n","authors":["Maximilian E. 
Tschuchnig","Michael Gadermayr"],"pdf_url":"https://arxiv.org/pdf/2108.11986v2.pdf","comment":"Accepted and presented at iDSC2021 edit: During work on this\n publication Maximilian Ernst Tschuchnig was affiliated with Salzburg\n University of Applied Sciences and University of Salzburg"},{"id":"http://arxiv.org/abs/2410.01639v2","updated":"2024-12-02T14:25:30Z","published":"2024-10-02T15:09:36Z","title":"Moral Alignment for LLM Agents","summary":" Decision-making agents based on pre-trained Large Language Models (LLMs) are\nincreasingly being deployed across various domains of human activity. While\ntheir applications are currently rather specialized, several research efforts\nare under way to develop more generalist agents. As LLM-based systems become\nmore agentic, their influence on human activity will grow and the transparency\nof this will decrease. Consequently, developing effective methods for aligning\nthem to human values is vital.\n The prevailing practice in alignment often relies on human preference data\n(e.g., in RLHF or DPO), in which values are implicit and are essentially\ndeduced from relative preferences over different model outputs. In this work,\ninstead of relying on human feedback, we introduce the design of reward\nfunctions that explicitly encode core human values for Reinforcement\nLearning-based fine-tuning of foundation agent models. Specifically, we use\nintrinsic rewards for the moral alignment of LLM agents.\n We evaluate our approach using the traditional philosophical frameworks of\nDeontological Ethics and Utilitarianism, quantifying moral rewards for agents\nin terms of actions and consequences on the Iterated Prisoner's Dilemma (IPD)\nenvironment. We also show how moral fine-tuning can be deployed to enable an\nagent to unlearn a previously developed selfish strategy. Finally, we find that\ncertain moral strategies learned on the IPD game generalize to several other\nmatrix game environments. 
In summary, we demonstrate that fine-tuning with\nintrinsic rewards is a promising general solution for aligning LLM agents to\nhuman values, and it might represent a more transparent and cost-effective\nalternative to currently predominant alignment techniques.\n","authors":["Elizaveta Tennant","Stephen Hailes","Mirco Musolesi"],"pdf_url":"https://arxiv.org/pdf/2410.01639v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.15166v2","updated":"2024-12-02T14:20:12Z","published":"2023-06-27T02:47:59Z","title":"Constraining Generative Models for Engineering Design with Negative Data","summary":" Generative models have recently achieved remarkable success and widespread\nadoption in society, yet they often struggle to generate realistic and accurate\noutputs. This challenge extends beyond language and vision into fields like\nengineering design, where safety-critical engineering standards and\nnon-negotiable physical laws tightly constrain what outputs are considered\nacceptable. In this work, we introduce a novel training method to guide a\ngenerative model toward constraint-satisfying outputs using `negative data' --\nexamples of what to avoid. Our negative-data generative model (NDGM)\nformulation easily outperforms classic models, generating 1/6 as many\nconstraint-violating samples using 1/8 as much data in certain problems. It\nalso consistently outperforms other baselines, achieving a balance between\nconstraint satisfaction and distributional similarity that is unsurpassed by\nany other model in 12 of the 14 problems tested. This widespread superiority is\nrigorously demonstrated across numerous synthetic tests and real engineering\nproblems, such as ship hull synthesis with hydrodynamic constraints and vehicle\ndesign with impact safety constraints. Our benchmarks showcase both the\nbest-in-class performance of our new NDGM formulation and the overall dominance\nof NDGMs versus classic generative models. 
We publicly release the code and\nbenchmarks at https://github.com/Lyleregenwetter/NDGMs.\n","authors":["Lyle Regenwetter","Giorgio Giannone","Akash Srivastava","Dan Gutfreund","Faez Ahmed"],"pdf_url":"https://arxiv.org/pdf/2306.15166v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.13306v6","updated":"2024-12-02T14:13:35Z","published":"2023-01-30T21:59:30Z","title":"Autobidders with Budget and ROI Constraints: Efficiency, Regret, and\n Pacing Dynamics","summary":" We study a game between autobidding algorithms that compete in an online\nadvertising platform. Each autobidder is tasked with maximizing its\nadvertiser's total value over multiple rounds of a repeated auction, subject to\nbudget and return-on-investment constraints. We propose a gradient-based\nlearning algorithm that is guaranteed to satisfy all constraints and achieves\nvanishing individual regret. Our algorithm uses only bandit feedback and can be\nused with the first- or second-price auction, as well as with any\n\"intermediate\" auction format. Our main result is that when these autobidders\nplay against each other, the resulting expected liquid welfare over all rounds\nis at least half of the expected optimal liquid welfare achieved by any\nallocation. This holds whether or not the bidding dynamics converges to an\nequilibrium.\n","authors":["Brendan Lucier","Sarath Pattathil","Aleksandrs Slivkins","Mengxiao Zhang"],"pdf_url":"https://arxiv.org/pdf/2301.13306v6.pdf","comment":"Appeared at COLT 2024. Numerical experiments added since Jun'24\n version"},{"id":"http://arxiv.org/abs/2204.10942v2","updated":"2024-12-02T14:12:18Z","published":"2022-04-22T21:48:56Z","title":"Evaluation of Multi-Scale Multiple Instance Learning to Improve Thyroid\n Cancer Classification","summary":" Thyroid cancer is currently the fifth most common malignancy diagnosed in\nwomen. 
Since differentiation of cancer sub-types is important for treatment and\ncurrent, manual methods are time consuming and subjective, automatic\ncomputer-aided differentiation of cancer types is crucial. Manual\ndifferentiation of thyroid cancer is based on tissue sections, analysed by\npathologists using histological features. Due to the enormous size of gigapixel\nwhole slide images, holistic classification using deep learning methods is not\nfeasible. Patch based multiple instance learning approaches, combined with\naggregations such as bag-of-words, is a common approach. This work's\ncontribution is to extend a patch based state-of-the-art method by generating\nand combining feature vectors of three different patch resolutions and\nanalysing three distinct ways of combining them. The results showed\nimprovements in one of the three multi-scale approaches, while the others led\nto decreased scores. This provides motivation for analysis and discussion of\nthe individual approaches.\n","authors":["Maximilian E. Tschuchnig","Philipp Grubmüller","Lea M. Stangassinger","Christina Kreutzer","Sébastien Couillard-Després","Gertie J. Oostingh","Anton Hittmair","Michael Gadermayr"],"pdf_url":"https://arxiv.org/pdf/2204.10942v2.pdf","comment":"Accepted and presented at IPTA 2022 (Best Paper) edit: During work on\n this publication Maximilian Ernst Tschuchnig was affiliated with Salzburg\n University of Applied Sciences and University of Salzburg"},{"id":"http://arxiv.org/abs/2409.11141v2","updated":"2024-12-02T14:03:32Z","published":"2024-09-17T12:52:16Z","title":"Sample Complexity Bounds for Linear System Identification from a Finite\n Set","summary":" This paper considers a finite sample perspective on the problem of\nidentifying an LTI system from a finite set of possible systems using\ntrajectory data. 
To this end, we use the maximum likelihood estimator to\nidentify the true system and provide an upper bound for its sample complexity.\nCrucially, the derived bound does not rely on a potentially restrictive\nstability assumption. Additionally, we leverage tools from information theory\nto provide a lower bound to the sample complexity that holds independently of\nthe used estimator. The derived sample complexity bounds are analyzed\nanalytically and numerically.\n","authors":["Nicolas Chatzikiriakos","Andrea Iannelli"],"pdf_url":"https://arxiv.org/pdf/2409.11141v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07123v2","updated":"2024-12-02T13:04:18Z","published":"2024-09-11T09:21:20Z","title":"Cross-Refine: Improving Natural Language Explanation Generation by\n Learning in Tandem","summary":" Natural language explanations (NLEs) are vital for elucidating the reasoning\nbehind large language model (LLM) decisions. Many techniques have been\ndeveloped to generate NLEs using LLMs. However, like humans, LLMs might not\nalways produce optimal NLEs on first attempt. Inspired by human learning\nprocesses, we introduce Cross-Refine, which employs role modeling by deploying\ntwo LLMs as generator and critic, respectively. The generator outputs a first\nNLE and then refines this initial explanation using feedback and suggestions\nprovided by the critic. Cross-Refine does not require any supervised training\ndata or additional training. We validate Cross-Refine across three NLP tasks\nusing three state-of-the-art open-source LLMs through automatic and human\nevaluation. We select Self-Refine (Madaan et al., 2023) as the baseline, which\nonly utilizes self-feedback to refine the explanations. Our findings from\nautomatic evaluation and a user study indicate that Cross-Refine outperforms\nSelf-Refine. 
Meanwhile, Cross-Refine can perform effectively with less powerful\nLLMs, whereas Self-Refine only yields strong results with ChatGPT.\nAdditionally, we conduct an ablation study to assess the importance of feedback\nand suggestions. Both of them play an important role in refining explanations.\nWe further evaluate Cross-Refine on a bilingual dataset in English and German.\n","authors":["Qianli Wang","Tatiana Anikina","Nils Feldhus","Simon Ostermann","Sebastian Möller","Vera Schmitt"],"pdf_url":"https://arxiv.org/pdf/2409.07123v2.pdf","comment":"Accepted at COLING 2025; long paper"},{"id":"http://arxiv.org/abs/2305.15798v4","updated":"2024-12-02T12:58:23Z","published":"2023-05-25T07:28:28Z","title":"BK-SDM: A Lightweight, Fast, and Cheap Version of Stable Diffusion","summary":" Text-to-image (T2I) generation with Stable Diffusion models (SDMs) involves\nhigh computing demands due to billion-scale parameters. To enhance efficiency,\nrecent studies have reduced sampling steps and applied network quantization\nwhile retaining the original architectures. The lack of architectural reduction\nattempts may stem from worries over expensive retraining for such massive\nmodels. In this work, we uncover the surprising potential of block pruning and\nfeature distillation for low-cost general-purpose T2I. By removing several\nresidual and attention blocks from the U-Net of SDMs, we achieve 30%~50%\nreduction in model size, MACs, and latency. We show that distillation\nretraining is effective even under limited resources: using only 13 A100 days\nand a tiny dataset, our compact models can imitate the original SDMs (v1.4 and\nv2.1-base with over 6,000 A100 days). Benefiting from the transferred\nknowledge, our BK-SDMs deliver competitive results on zero-shot MS-COCO against\nlarger multi-billion parameter models. We further demonstrate the applicability\nof our lightweight backbones in personalized generation and image-to-image\ntranslation. 
Deployment of our models on edge devices attains 4-second\ninference. Code and models can be found at:\nhttps://github.com/Nota-NetsPresso/BK-SDM\n","authors":["Bo-Kyeong Kim","Hyoung-Kyu Song","Thibault Castells","Shinkook Choi"],"pdf_url":"https://arxiv.org/pdf/2305.15798v4.pdf","comment":"ECCV 2024 Camera-Ready Version"},{"id":"http://arxiv.org/abs/2312.02522v2","updated":"2024-12-02T12:49:50Z","published":"2023-12-05T06:05:04Z","title":"MASP: Scalable GNN-based Planning for Multi-Agent Navigation","summary":" We investigate multi-agent navigation tasks, where multiple agents need to\nreach initially unassigned goals in a limited time. Classical planning-based\nmethods suffer from expensive computation overhead at each step and offer\nlimited expressiveness for complex cooperation strategies. In contrast,\nreinforcement learning (RL) has recently become a popular approach for\naddressing this issue. However, RL struggles with low data efficiency and\ncooperation when directly exploring (nearly) optimal policies in a large\nexploration space, especially with an increased number of agents(e.g., 10+\nagents) or in complex environments (e.g., 3-D simulators). In this paper, we\npropose the Multi-Agent Scalable Graph-based Planner (MASP), a goal-conditioned\nhierarchical planner for navigation tasks with a substantial number of agents\nin the decentralized setting. MASP employs a hierarchical framework to reduce\nspace complexity by decomposing a large exploration space into multiple\ngoal-conditioned subspaces, where a high-level policy assigns agents goals, and\na low-level policy navigates agents toward designated goals. For agent\ncooperation and the adaptation to varying team sizes, we model agents and goals\nas graphs to better capture their relationship. The high-level policy, the Goal\nMatcher, leverages a graph-based Self-Encoder and Cross-Encoder to optimize\ngoal assignment by updating the agent and the goal graphs. 
The low-level\npolicy, the Coordinated Action Executor, introduces the Group Information\nFusion to facilitate group division and extract agent relationships across\ngroups, enhancing training efficiency for agent cooperation. The results\ndemonstrate that MASP outperforms RL and planning-based baselines in task\nefficiency.\n","authors":["Xinyi Yang","Xinting Yang","Chao Yu","Jiayu Chen","Wenbo Ding","Huazhong Yang","Yu Wang"],"pdf_url":"https://arxiv.org/pdf/2312.02522v2.pdf","comment":"Submitted to IEEE RA-L"},{"id":"http://arxiv.org/abs/2410.07836v4","updated":"2024-12-02T12:44:48Z","published":"2024-10-10T11:52:07Z","title":"Masked Generative Priors Improve World Models Sequence Modelling\n Capabilities","summary":" Deep Reinforcement Learning (RL) has become the leading approach for creating\nartificial agents in complex environments. Model-based approaches, which are RL\nmethods with world models that predict environment dynamics, are among the most\npromising directions for improving data efficiency, forming a critical step\ntoward bridging the gap between research and real-world deployment. In\nparticular, world models enhance sample efficiency by learning in imagination,\nwhich involves training a generative sequence model of the environment in a\nself-supervised manner. Recently, Masked Generative Modelling has emerged as a\nmore efficient and superior inductive bias for modelling and generating token\nsequences. Building on the Efficient Stochastic Transformer-based World Models\n(STORM) architecture, we replace the traditional MLP prior with a Masked\nGenerative Prior (e.g., MaskGIT Prior) and introduce GIT-STORM. We evaluate our\nmodel on two downstream tasks: reinforcement learning and video prediction.\nGIT-STORM demonstrates substantial performance gains in RL tasks on the Atari\n100k benchmark. Moreover, we apply Transformer-based World Models to continuous\naction environments for the first time, addressing a significant gap in prior\nresearch. 
To achieve this, we employ a state mixer function that integrates\nlatent state representations with actions, enabling our model to handle\ncontinuous control tasks. We validate this approach through qualitative and\nquantitative analyses on the DeepMind Control Suite, showcasing the\neffectiveness of Transformer-based World Models in this new domain. Our results\nhighlight the versatility and efficacy of the MaskGIT dynamics prior, paving\nthe way for more accurate world models and effective RL policies.\n","authors":["Cristian Meo","Mircea Lica","Zarif Ikram","Akihiro Nakano","Vedant Shah","Aniket Rajiv Didolkar","Dianbo Liu","Anirudh Goyal","Justin Dauwels"],"pdf_url":"https://arxiv.org/pdf/2410.07836v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.03976v4","updated":"2024-12-02T12:44:25Z","published":"2023-11-07T13:24:01Z","title":"Topology Only Pre-Training: Towards Generalised Multi-Domain Graph\n Models","summary":" The principal benefit of unsupervised representation learning is that a\npre-trained model can be fine-tuned where data or labels are scarce. Existing\napproaches for graph representation learning are domain specific, maintaining\nconsistent node and edge features across the pre-training and target datasets.\nThis has precluded transfer to multiple domains. We present Topology Only\nPre-Training (ToP), a graph pre-training method based on node and edge feature\nexclusion. We show positive transfer on evaluation datasets from multiple\ndomains, including domains not present in pre-training data, running directly\ncontrary to assumptions made in contemporary works. On 75% of experiments, ToP\nmodels perform significantly $p \\leq 0.01$ better than a supervised baseline.\nPerformance is significantly positive on 85.7% of tasks when node and edge\nfeatures are used in fine-tuning. We further show that out-of-domain topologies\ncan produce more useful pre-training than in-domain. 
Under ToP we show better\ntransfer from non-molecule pre-training, compared to molecule pre-training, on\n79% of molecular benchmarks. Against the limited set of other generalist graph\nmodels ToP performs strongly, including against models with many orders of\nmagnitude larger. These findings show that ToP opens broad areas of research in\nboth transfer learning on scarcely populated graph domains and in graph\nfoundation models.\n","authors":["Alex O. Davies","Riku W. Green","Nirav S. Ajmeri","Telmo M. Silva Filho"],"pdf_url":"https://arxiv.org/pdf/2311.03976v4.pdf","comment":"28 pages, 5 figures, 5 tables. For in-development code see\n https://github.com/neutralpronoun/general-gcl"},{"id":"http://arxiv.org/abs/2405.14655v2","updated":"2024-12-02T12:37:46Z","published":"2024-05-23T14:53:54Z","title":"Multi-turn Reinforcement Learning from Preference Human Feedback","summary":" Reinforcement Learning from Human Feedback (RLHF) has become the standard\napproach for aligning Large Language Models (LLMs) with human preferences,\nallowing LLMs to demonstrate remarkable abilities in various tasks. Existing\nmethods work by emulating the preferences at the single decision (turn) level,\nlimiting their capabilities in settings that require planning or multi-turn\ninteractions to achieve a long-term goal. In this paper, we address this issue\nby developing novel methods for Reinforcement Learning (RL) from preference\nfeedback between two full multi-turn conversations. In the tabular setting, we\npresent a novel mirror-descent-based policy optimization algorithm for the\ngeneral multi-turn preference-based RL problem, and prove its convergence to\nNash equilibrium. To evaluate performance, we create a new environment,\nEducation Dialogue, where a teacher agent guides a student in learning a random\ntopic, and show that a deep RL variant of our algorithm outperforms RLHF\nbaselines. 
Finally, we show that in an environment with explicit rewards, our\nalgorithm recovers the same performance as a reward-based RL baseline, despite\nrelying solely on a weaker preference signal.\n","authors":["Lior Shani","Aviv Rosenberg","Asaf Cassel","Oran Lang","Daniele Calandriello","Avital Zipori","Hila Noga","Orgad Keller","Bilal Piot","Idan Szpektor","Avinatan Hassidim","Yossi Matias","Rémi Munos"],"pdf_url":"https://arxiv.org/pdf/2405.14655v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.12380v3","updated":"2024-12-02T12:36:45Z","published":"2023-09-21T12:44:31Z","title":"Methods for generating and evaluating synthetic longitudinal patient\n data: a systematic review","summary":" The rapid growth in data availability has facilitated research and\ndevelopment, yet not all industries have benefited equally due to legal and\nprivacy constraints. The healthcare sector faces significant challenges in\nutilizing patient data because of concerns about data security and\nconfidentiality. To address this, various privacy-preserving methods, including\nsynthetic data generation, have been proposed. Synthetic data replicate\nexisting data as closely as possible, acting as a proxy for sensitive\ninformation. While patient data are often longitudinal, this aspect remains\nunderrepresented in existing reviews of synthetic data generation in\nhealthcare. This paper maps and describes methods for generating and evaluating\nsynthetic longitudinal patient data in real-life settings through a systematic\nliterature review, conducted following the PRISMA guidelines and incorporating\ndata from five databases up to May 2024. Thirty-nine methods were identified,\nwith four addressing all challenges of longitudinal data generation, though\nnone included privacy-preserving mechanisms. Resemblance was evaluated in most\nstudies, utility in the majority, and privacy in just over half. Only a small\nfraction of studies assessed all three aspects. 
Our findings highlight the need\nfor further research in this area.\n","authors":["Katariina Perkonoja","Kari Auranen","Joni Virta"],"pdf_url":"https://arxiv.org/pdf/2309.12380v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02272v4","updated":"2024-12-02T12:36:30Z","published":"2024-11-04T17:03:55Z","title":"Combining Induction and Transduction for Abstract Reasoning","summary":" When learning an input-output mapping from very few examples, is it better to\nfirst infer a latent function that explains the examples, or is it better to\ndirectly predict new test outputs, e.g. using a neural network? We study this\nquestion on ARC by training neural models for induction (inferring latent\nfunctions) and transduction (directly predicting the test output for a given\ntest input). We train on synthetically generated variations of Python programs\nthat solve ARC training tasks. We find inductive and transductive models solve\ndifferent kinds of test problems, despite having the same training problems and\nsharing the same neural architecture: Inductive program synthesis excels at\nprecise computations, and at composing multiple concepts, while transduction\nsucceeds on fuzzier perceptual concepts. Ensembling them approaches human-level\nperformance on ARC.\n","authors":["Wen-Ding Li","Keya Hu","Carter Larsen","Yuqing Wu","Simon Alford","Caleb Woo","Spencer M. Dunn","Hao Tang","Michelangelo Naim","Dat Nguyen","Wei-Long Zheng","Zenna Tavares","Yewen Pu","Kevin Ellis"],"pdf_url":"https://arxiv.org/pdf/2411.02272v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.07818v5","updated":"2024-12-02T12:29:47Z","published":"2024-02-12T17:24:15Z","title":"Differentially Private Zeroth-Order Methods for Scalable Large Language\n Model Finetuning","summary":" Fine-tuning on task-specific datasets is a widely-embraced paradigm of\nharnessing the powerful capability of pretrained LLMs for various downstream\ntasks. 
Due to the popularity of LLMs fine-tuning and its accompanying privacy\nconcerns, differentially private (DP) fine-tuning of pretrained LLMs has been\nwidely used to safeguarding the privacy of task-specific datasets. Lying at the\ndesign core of DP LLM fine-tuning methods is the satisfactory tradeoff among\nprivacy, utility, and scalability. Most existing methods build upon the seminal\nwork of DP-SGD. Despite pushing the scalability of DP-SGD to its limit,\nDP-SGD-based fine-tuning methods are unfortunately limited by the inherent\ninefficiency of SGD.\n In this paper, we investigate the potential of DP zeroth-order methods for\nLLM pretraining, which avoids the scalability bottleneck of SGD by\napproximating the gradient with the more efficient zeroth-order gradient.\nRather than treating the zeroth-order method as a drop-in replacement for SGD,\nthis paper presents a comprehensive study both theoretically and empirically.\nFirst, we propose the stagewise DP zeroth-order method (DP-ZOSO) that\ndynamically schedules key hyperparameters. This design is grounded on the\nsynergy between DP random perturbation and the gradient approximation error of\nthe zeroth-order method, and its effect on fine-tuning trajectory.\n We provide theoretical analysis for both proposed methods. 
We conduct\nextensive empirical analysis on both encoder-only masked language model and\ndecoder-only autoregressive language model, achieving impressive results in\nterms of scalability and utility regardless of the class of tasks (compared\nwith DPZero, DP-ZOPO improves $4.5\\%$ on SST-5, $5.5\\%$ on MNLI with\nRoBERTa-Large and 9.2\\% on CB, 3.9\\% on BoolQ with OPT-2.7b when $\\epsilon=4$,\ndemonstrates more significant enhancement in performance on more complicated\ntasks).\n","authors":["Z Liu","J Lou","W Bao","Y Hu","B Li","Z Qin","K Ren"],"pdf_url":"https://arxiv.org/pdf/2402.07818v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.08559v3","updated":"2024-12-02T12:16:19Z","published":"2024-10-11T06:30:48Z","title":"Learning General Representation of 12-Lead Electrocardiogram with a\n Joint-Embedding Predictive Architecture","summary":" Electrocardiogram (ECG) captures the heart's electrical signals, offering\nvaluable information for diagnosing cardiac conditions. However, the scarcity\nof labeled data makes it challenging to fully leverage supervised learning in\nmedical domain. Self-supervised learning (SSL) offers a promising solution,\nenabling models to learn from unlabeled data and uncover meaningful patterns.\nIn this paper, we show that masked modeling in the latent space can be a\npowerful alternative to existing self-supervised methods in the ECG domain. We\nintroduce ECG-JEPA, a SSL model for 12-lead ECG analysis that learns semantic\nrepresentations of ECG data by predicting in the hidden latent space, bypassing\nthe need to reconstruct raw signals. This approach offers several advantages in\nthe ECG domain: (1) it avoids producing unnecessary details, such as noise,\nwhich is common in ECG; and (2) it addresses the limitations of na\\\"ive L2 loss\nbetween raw signals. Another key contribution is the introduction of\nCross-Pattern Attention (CroPA), a specialized masked attention mechanism\ntailored for 12-lead ECG data. 
ECG-JEPA is trained on the union of several open\nECG datasets, totaling approximately 180,000 samples, and achieves\nstate-of-the-art performance in various downstream tasks including ECG\nclassification and feature prediction. Our code is openly available at\nhttps://github.com/sehunfromdaegu/ECG_JEPA.\n","authors":["Sehun Kim"],"pdf_url":"https://arxiv.org/pdf/2410.08559v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.11883v3","updated":"2024-12-02T12:09:05Z","published":"2024-10-11T10:09:46Z","title":"Simulation-based inference with scattering representations: scattering\n is all you need","summary":" We demonstrate the successful use of scattering representations without\nfurther compression for simulation-based inference (SBI) with images (i.e.\nfield-level), illustrated with a cosmological case study. Scattering\nrepresentations provide a highly effective representational space for\nsubsequent learning tasks, although the higher dimensional compressed space\nintroduces challenges. We overcome these through spatial averaging, coupled\nwith more expressive density estimators. Compared to alternative methods, such\nan approach does not require additional simulations for either training or\ncomputing derivatives, is interpretable, and resilient to covariate shift. As\nexpected, we show that a scattering only approach extracts more information\nthan traditional second order summary statistics.\n","authors":["Kiyam Lin","Benjamin Joachimi","Jason D. 
McEwen"],"pdf_url":"https://arxiv.org/pdf/2410.11883v3.pdf","comment":"9 pages, 2 figures, accepted by NeurIPS workshop on Machine Learning\n and the Physical Sciences"},{"id":"http://arxiv.org/abs/2410.23132v2","updated":"2024-12-02T12:05:29Z","published":"2024-10-30T15:42:59Z","title":"Revisiting MAE pre-training for 3D medical image segmentation","summary":" Self-Supervised Learning (SSL) presents an exciting opportunity to unlock the\npotential of vast, untapped clinical datasets, for various downstream\napplications that suffer from the scarcity of labeled data. While SSL has\nrevolutionized fields like natural language processing and computer vision, its\nadoption in 3D medical image computing has been limited by three key pitfalls:\nSmall pre-training dataset sizes, architectures inadequate for 3D medical image\nanalysis, and insufficient evaluation practices. In this paper, we address\nthese issues by i) leveraging a large-scale dataset of 39k 3D brain MRI volumes\nand ii) using a Residual Encoder U-Net architecture within the state-of-the-art\nnnU-Net framework. iii) A robust development framework, incorporating 5\ndevelopment and 8 testing brain MRI segmentation datasets, allowed\nperformance-driven design decisions to optimize the simple concept of Masked\nAuto Encoders (MAEs) for 3D CNNs. The resulting model not only surpasses\nprevious SSL methods but also outperforms the strong nnU-Net baseline by an\naverage of approximately 3 Dice points setting a new state-of-the-art. Our code\nand models are made available here.\n","authors":["Tassilo Wald","Constantin Ulrich","Stanislav Lukyanenko","Andrei Goncharov","Alberto Paderno","Leander Maerkisch","Paul F. Jäger","Klaus Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2410.23132v2.pdf","comment":"Arxiv Preprint. 
Revised and under review"},{"id":"http://arxiv.org/abs/2303.16668v3","updated":"2024-12-02T12:01:58Z","published":"2023-03-29T13:22:20Z","title":"Protecting Federated Learning from Extreme Model Poisoning Attacks via\n Multidimensional Time Series Anomaly Detection","summary":" Current defense mechanisms against model poisoning attacks in federated\nlearning (FL) systems have proven effective up to a certain threshold of\nmalicious clients. In this work, we introduce FLANDERS, a novel pre-aggregation\nfilter for FL resilient to large-scale model poisoning attacks, i.e., when\nmalicious clients far exceed legitimate participants. FLANDERS treats the\nsequence of local models sent by clients in each FL round as a matrix-valued\ntime series. Then, it identifies malicious client updates as outliers in this\ntime series by comparing actual observations with estimates generated by a\nmatrix autoregressive forecasting model maintained by the server. Experiments\nconducted in several non-iid FL setups show that FLANDERS significantly\nimproves robustness across a wide spectrum of attacks when paired with standard\nand robust existing aggregation methods.\n","authors":["Edoardo Gabrielli","Dimitri Belli","Zoe Matrullo","Vittorio Miori","Gabriele Tolomei"],"pdf_url":"https://arxiv.org/pdf/2303.16668v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.15290v6","updated":"2024-12-02T11:46:33Z","published":"2023-10-23T18:56:01Z","title":"Reliable Generation of Privacy-preserving Synthetic Electronic Health\n Record Time Series via Diffusion Models","summary":" Electronic Health Records (EHRs) are rich sources of patient-level data,\noffering valuable resources for medical data analysis. However, privacy\nconcerns often restrict access to EHRs, hindering downstream analysis. Current\nEHR de-identification methods are flawed and can lead to potential privacy\nleakage. 
Additionally, existing publicly available EHR databases are limited,\npreventing the advancement of medical research using EHR. This study aims to\novercome these challenges by generating realistic and privacy-preserving\nsynthetic electronic health records (EHRs) time series efficiently. We\nintroduce a new method for generating diverse and realistic synthetic EHR time\nseries data using Denoising Diffusion Probabilistic Models (DDPM). We conducted\nexperiments on six databases: Medical Information Mart for Intensive Care III\nand IV (MIMIC-III/IV), the eICU Collaborative Research Database (eICU), and\nnon-EHR datasets on Stocks and Energy. We compared our proposed method with\neight existing methods. Our results demonstrate that our approach significantly\noutperforms all existing methods in terms of data fidelity while requiring less\ntraining effort. Additionally, data generated by our method yields a lower\ndiscriminative accuracy compared to other baseline methods, indicating the\nproposed method can generate data with less privacy risk. The proposed\ndiffusion-model-based method can reliably and efficiently generate synthetic\nEHR time series, which facilitates the downstream medical data analysis. Our\nnumerical results show the superiority of the proposed method over all other\nexisting methods.\n","authors":["Muhang Tian","Bernie Chen","Allan Guo","Shiyi Jiang","Anru R. 
Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.15290v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.11932v3","updated":"2024-12-02T10:57:58Z","published":"2024-05-20T10:16:26Z","title":"Nonequilbrium physics of generative diffusion models","summary":" Generative diffusion models apply the concept of Langevin dynamics in physics\nto machine leaning, attracting a lot of interests from engineering, statistics\nand physics, but a complete picture about inherent mechanisms is still lacking.\nIn this paper, we provide a transparent physics analysis of diffusion models,\nformulating the fluctuation theorem, entropy production, equilibrium measure,\nand Franz-Parisi potential to understand the dynamic process and intrinsic\nphase transitions. Our analysis is rooted in a path integral representation of\nboth forward and backward dynamics, and in treating the reverse diffusion\ngenerative process as a statistical inference, where the time-dependent state\nvariables serve as quenched disorder akin to that in spin glass theory. Our\nstudy thus links stochastic thermodynamics, statistical inference and geometry\nbased analysis together to yield a coherent picture about how the generative\ndiffusion models work.\n","authors":["Zhendong Yu","Haiping Huang"],"pdf_url":"https://arxiv.org/pdf/2405.11932v3.pdf","comment":"26 pages, 11 figures, 31 refs"},{"id":"http://arxiv.org/abs/2411.14708v2","updated":"2024-12-02T10:52:21Z","published":"2024-11-22T03:33:51Z","title":"Understanding LLM Embeddings for Regression","summary":" With the rise of large language models (LLMs) for flexibly processing\ninformation as strings, a natural application is regression, specifically by\npreprocessing string representations into LLM embeddings as downstream features\nfor metric prediction. 
In this paper, we provide one of the first comprehensive\ninvestigations into embedding-based regression and demonstrate that LLM\nembeddings as features can be better for high-dimensional regression tasks than\nusing traditional feature engineering. This regression performance can be\nexplained in part due to LLM embeddings over numeric data inherently preserving\nLipschitz continuity over the feature space. Furthermore, we quantify the\ncontribution of different model effects, most notably model size and language\nunderstanding, which we find surprisingly do not always improve regression\nperformance.\n","authors":["Eric Tang","Bangding Yang","Xingyou Song"],"pdf_url":"https://arxiv.org/pdf/2411.14708v2.pdf","comment":"16 pages, 13 figures"},{"id":"http://arxiv.org/abs/2410.23178v2","updated":"2024-12-02T10:51:10Z","published":"2024-10-30T16:36:55Z","title":"Uncertainty quantification for fast reconstruction methods using\n augmented equivariant bootstrap: Application to radio interferometry","summary":" The advent of next-generation radio interferometers like the Square Kilometer\nArray promises to revolutionise our radio astronomy observational capabilities.\nThe unprecedented volume of data these devices generate requires fast and\naccurate image reconstruction algorithms to solve the ill-posed radio\ninterferometric imaging problem. Most state-of-the-art reconstruction methods\nlack trustworthy and scalable uncertainty quantification, which is critical for\nthe rigorous scientific interpretation of radio observations. We propose an\nunsupervised technique based on a conformalized version of a radio-augmented\nequivariant bootstrapping method, which allows us to quantify uncertainties for\nfast reconstruction methods. Noticeably, we rely on reconstructions from\nultra-fast unrolled algorithms. The proposed method brings more reliable\nuncertainty estimations to our problem than existing alternatives.\n","authors":["Mostafa Cherif","Tobías I. 
Liaudat","Jonathan Kern","Christophe Kervazo","Jérôme Bobin"],"pdf_url":"https://arxiv.org/pdf/2410.23178v2.pdf","comment":"14 pages, 7 figures. Accepted at the Machine Learning and the\n Physical Sciences Workshop, NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.04632v2","updated":"2024-12-02T10:48:28Z","published":"2024-11-07T11:35:31Z","title":"Improved Multi-Task Brain Tumour Segmentation with Synthetic Data\n Augmentation","summary":" This paper presents the winning solution of task 1 and the third-placed\nsolution of task 3 of the BraTS challenge. The use of automated tools in\nclinical practice has increased due to the development of more and more\nsophisticated and reliable algorithms. However, achieving clinical standards\nand developing tools for real-life scenarios is a major challenge. To this end,\nBraTS has organised tasks to find the most advanced solutions for specific\npurposes. In this paper, we propose the use of synthetic data to train\nstate-of-the-art frameworks in order to improve the segmentation of adult\ngliomas in a post-treatment scenario, and the segmentation of meningioma for\nradiotherapy planning. Our results suggest that the use of synthetic data leads\nto more robust algorithms, although the synthetic data generation pipeline is\nnot directly suited to the meningioma task. 
In task 1, we achieved a DSC of\n0.7900, 0.8076, 0.7760, 0.8926, 0.7874, 0.8938 and a HD95 of 35.63, 30.35,\n44.58, 16.87, 38.19, 17.95 for ET, NETC, RC, SNFH, TC and WT, respectively and,\nin task 3, we achieved a DSC of 0.801 and HD95 of 38.26, in the testing phase.\nThe code for these tasks is available at\nhttps://github.com/ShadowTwin41/BraTS_2023_2024_solutions.\n","authors":["André Ferreira","Tiago Jesus","Behrus Puladi","Jens Kleesiek","Victor Alves","Jan Egger"],"pdf_url":"https://arxiv.org/pdf/2411.04632v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19211v2","updated":"2024-12-02T10:44:08Z","published":"2024-03-28T08:19:33Z","title":"Dual-Personalizing Adapter for Federated Foundation Models","summary":" Recently, foundation models, particularly large language models (LLMs), have\ndemonstrated an impressive ability to adapt to various tasks by fine-tuning\ndiverse instruction data. Notably, federated foundation models (FedFM) emerge\nas a privacy preservation method to fine-tune models collaboratively under\nfederated learning (FL) settings by leveraging many distributed datasets with\nnon-IID data. To alleviate communication and computation overhead,\nparameter-efficient methods are introduced for efficiency, and some research\nadapted personalization methods to FedFM for better user preferences alignment.\nHowever, a critical gap in existing research is the neglect of test-time\ndistribution shifts in real-world applications, and conventional methods for\ntest-time distribution shifts in personalized FL are less effective for FedFM\ndue to their failure to adapt to complex distribution shift scenarios and the\nrequirement to train all parameters. To bridge this gap, we refine the setting\nin FedFM, termed test-time personalization, which aims to learn personalized\nfederated foundation models on clients while effectively handling test-time\ndistribution shifts simultaneously. 
To address challenges in this setting, we\nexplore a simple yet effective solution, a Federated Dual-Personalizing Adapter\n(FedDPA) architecture. By co-working with a foundation model, a global adapter\nand a local adapter jointly tackle the test-time distribution shifts and\nclient-specific personalization. Additionally, we introduce an instance-wise\ndynamic weighting mechanism that dynamically integrates the global and local\nadapters for each test instance during inference, facilitating effective\ntest-time personalization. The effectiveness of the proposed method has been\nevaluated on benchmark datasets across different NLP tasks.\n","authors":["Yiyuan Yang","Guodong Long","Tao Shen","Jing Jiang","Michael Blumenstein"],"pdf_url":"https://arxiv.org/pdf/2403.19211v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02595v3","updated":"2024-12-02T10:36:05Z","published":"2024-04-03T09:19:46Z","title":"QFNN-FFD: Quantum Federated Neural Network for Financial Fraud Detection","summary":" This study introduces the Quantum Federated Neural Network for Financial\nFraud Detection (QFNN-FFD), a cutting-edge framework merging Quantum Machine\nLearning (QML) and quantum computing with Federated Learning (FL) for financial\nfraud detection. Using quantum technologies' computational power and the robust\ndata privacy protections offered by FL, QFNN-FFD emerges as a secure and\nefficient method for identifying fraudulent transactions within the financial\nsector. Implementing a dual-phase training model across distributed clients\nenhances data integrity and enables superior performance metrics, achieving\nprecision rates consistently above 95%. Additionally, QFNN-FFD demonstrates\nexceptional resilience by maintaining an impressive 80% accuracy, highlighting\nits robustness and readiness for real-world applications. 
This combination of\nhigh performance, security, and robustness against noise positions QFNN-FFD as\na transformative advancement in financial technology solutions and establishes\nit as a new benchmark for privacy-focused fraud detection systems. This\nframework facilitates the broader adoption of secure, quantum-enhanced\nfinancial services and inspires future innovations that could use QML to tackle\ncomplex challenges in other areas requiring high confidentiality and accuracy.\n","authors":["Nouhaila Innan","Alberto Marchisio","Mohamed Bennai","Muhammad Shafique"],"pdf_url":"https://arxiv.org/pdf/2404.02595v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.09874v4","updated":"2024-12-02T10:33:19Z","published":"2023-03-17T10:38:27Z","title":"Image Statistics Predict the Sensitivity of Perceptual Quality Metrics","summary":" Previously, Barlow and Attneave hypothesised a link between biological vision\nand information maximisation. Following Shannon, information was defined using\nthe probability of natural images. Several physiological and psychophysical\nphenomena have been derived from principles like info-max, efficient coding, or\noptimal denoising. However, it remains unclear how this link is expressed in\nmathematical terms from image probability. Classical derivations were subjected\nto strong assumptions on the probability models and on the behaviour of the\nsensors. Moreover, the direct evaluation of the hypothesis was limited by the\ninability of classical image models to deliver accurate estimates of the\nprobability. Here, we directly evaluate image probabilities using a generative\nmodel for natural images, and analyse how probability-related factors can be\ncombined to predict the sensitivity of state-of-the-art subjective image\nquality metrics, a proxy for human perception. 
We use information theory and\nregression analysis to find a simple model that when combining just two\nprobability-related factors achieves 0.77 correlation with subjective metrics.\nThis probability-based model is validated in two ways: through direct\ncomparison with the opinion of real observers in a subjective quality\nexperiment, and by reproducing basic trends of classical psychophysical facts\nsuch as the Contrast Sensitivity Function, the Weber-law, and contrast masking.\n","authors":["Alexander Hepburn","Valero Laparra","Raúl Santos-Rodriguez","Jesús Malo"],"pdf_url":"https://arxiv.org/pdf/2303.09874v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.00115v2","updated":"2024-12-02T10:25:47Z","published":"2024-08-28T04:07:40Z","title":"Self-Adaptive Quantum Kernel Principal Components Analysis for Compact\n Readout of Chemiresistive Sensor Arrays","summary":" The rapid growth of Internet of Things (IoT) devices necessitates efficient\ndata compression techniques to handle the vast amounts of data generated by\nthese devices. Chemiresistive sensor arrays (CSAs), a simple-to-fabricate but\ncrucial component in IoT systems, generate large volumes of data due to their\nsimultaneous multi-sensor operations. Classical principal component analysis\n(cPCA) methods, a common solution to the data compression challenge, face\nlimitations in preserving critical information during dimensionality reduction.\nIn this study, we present self-adaptive quantum kernel (SAQK) PCA as a superior\nalternative to enhance information retention. Our findings demonstrate that\nSAQK PCA outperforms cPCA in various back-end machine-learning tasks,\nespecially in low-dimensional scenarios where access to quantum bits is\nlimited. 
These results highlight the potential of noisy intermediate-scale\nquantum (NISQ) computers to revolutionize data processing in real-world IoT\napplications by improving the efficiency and reliability of CSA data\ncompression and readout, despite the current constraints on qubit availability.\n","authors":["Zeheng Wang","Timothy van der Laan","Muhammad Usman"],"pdf_url":"https://arxiv.org/pdf/2409.00115v2.pdf","comment":"Version 2"},{"id":"http://arxiv.org/abs/2402.08711v3","updated":"2024-12-02T10:21:10Z","published":"2024-02-13T18:31:55Z","title":"Correction to \"Wasserstein distance estimates for the distributions of\n numerical approximations to ergodic stochastic differential equations\"","summary":" A method for analyzing non-asymptotic guarantees of numerical discretizations\nof ergodic SDEs in Wasserstein-2 distance is presented by Sanz-Serna and\nZygalakis in ``Wasserstein distance estimates for the distributions of\nnumerical approximations to ergodic stochastic differential equations\". They\nanalyze the UBU integrator which is strong order two and only requires one\ngradient evaluation per step, resulting in desirable non-asymptotic guarantees,\nin particular $\\mathcal{O}(d^{1/4}\\epsilon^{-1/2})$ steps to reach a distance\nof $\\epsilon > 0$ in Wasserstein-2 distance away from the target distribution.\nHowever, there is a mistake in the local error estimates in Sanz-Serna and\nZygalakis (2021), in particular, a stronger assumption is needed to achieve\nthese complexity estimates. This note reconciles the theory with the dimension\ndependence observed in practice in many applications of interest.\n","authors":["Daniel Paulin","Peter A. 
Whalley"],"pdf_url":"https://arxiv.org/pdf/2402.08711v3.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2409.06067v2","updated":"2024-12-02T10:18:38Z","published":"2024-09-09T21:04:16Z","title":"MLLM-LLaVA-FL: Multimodal Large Language Model Assisted Federated\n Learning","summary":" Previous studies on federated learning (FL) often encounter performance\ndegradation due to data heterogeneity among different clients. In light of the\nrecent advances in multimodal large language models (MLLMs), such as GPT-4v and\nLLaVA, which demonstrate their exceptional proficiency in multimodal tasks,\nsuch as image captioning and multimodal question answering. We introduce a\nnovel federated learning framework, named Multimodal Large Language Model\nAssisted Federated Learning (MLLM-LLaVA-FL), which employs powerful MLLMs at\nthe server end to address the heterogeneous and long-tailed challenges. Owing\nto the advanced cross-modality representation capabilities and the extensive\nopen-vocabulary prior knowledge of MLLMs, our framework is adept at harnessing\nthe extensive, yet previously underexploited, open-source data accessible from\nwebsites and powerful server-side computational resources. Hence, the\nMLLM-LLaVA-FL not only enhances the performance but also avoids increasing the\nrisk of privacy leakage and the computational burden on local devices,\ndistinguishing it from prior methodologies. Our framework has three key stages.\nInitially, we conduct global visual-text pretraining of the model. This\npretraining is facilitated by utilizing the extensive open-source data\navailable online, with the assistance of MLLMs. Subsequently, the pretrained\nmodel is distributed among various clients for local training. Finally, once\nthe locally trained models are transmitted back to the server, a global\nalignment is carried out under the supervision of MLLMs to further enhance the\nperformance. 
Experimental evaluations on established benchmarks, show that our\nframework delivers promising performance in the typical scenarios with data\nheterogeneity and long-tail distribution across different clients in FL.\n","authors":["Jianyi Zhang","Hao Frank Yang","Ang Li","Xin Guo","Pu Wang","Haiming Wang","Yiran Chen","Hai Li"],"pdf_url":"https://arxiv.org/pdf/2409.06067v2.pdf","comment":"Accepted to WACV 2025"},{"id":"http://arxiv.org/abs/2409.19437v3","updated":"2024-12-02T10:15:47Z","published":"2024-09-28T18:56:48Z","title":"Strongly-polynomial time and validation analysis of policy gradient\n methods","summary":" This paper proposes a novel termination criterion, termed the advantage gap\nfunction, for finite state and action Markov decision processes (MDP) and\nreinforcement learning (RL). By incorporating this advantage gap function into\nthe design of step size rules and deriving a new linear rate of convergence\nthat is independent of the stationary state distribution of the optimal policy,\nwe demonstrate that policy gradient methods can solve MDPs in\nstrongly-polynomial time. To the best of our knowledge, this is the first time\nthat such strong convergence properties have been established for policy\ngradient methods. Moreover, in the stochastic setting, where only stochastic\nestimates of policy gradients are available, we show that the advantage gap\nfunction provides close approximations of the optimality gap for each\nindividual state and exhibits a sublinear rate of convergence at every state.\nThe advantage gap function can be easily estimated in the stochastic case, and\nwhen coupled with easily computable upper bounds on policy values, they provide\na convenient way to validate the solutions generated by policy gradient\nmethods. 
Therefore, our developments offer a principled and computable measure\nof optimality for RL, whereas current practice tends to rely on\nalgorithm-to-algorithm or baselines comparisons with no certificate of\noptimality.\n","authors":["Caleb Ju","Guanghui Lan"],"pdf_url":"https://arxiv.org/pdf/2409.19437v3.pdf","comment":"Add numerical experiments"},{"id":"http://arxiv.org/abs/2406.17490v2","updated":"2024-12-02T09:49:23Z","published":"2024-06-25T12:17:44Z","title":"BricksRL: A Platform for Democratizing Robotics and Reinforcement\n Learning Research and Education with LEGO","summary":" We present BricksRL, a platform designed to democratize access to robotics\nfor reinforcement learning research and education. BricksRL facilitates the\ncreation, design, and training of custom LEGO robots in the real world by\ninterfacing them with the TorchRL library for reinforcement learning agents.\nThe integration of TorchRL with the LEGO hubs, via Bluetooth bidirectional\ncommunication, enables state-of-the-art reinforcement learning training on GPUs\nfor a wide variety of LEGO builds. This offers a flexible and cost-efficient\napproach for scaling and also provides a robust infrastructure for\nrobot-environment-algorithm communication. We present various experiments\nacross tasks and robot configurations, providing built plans and training\nresults. Furthermore, we demonstrate that inexpensive LEGO robots can be\ntrained end-to-end in the real world to achieve simple tasks, with training\ntimes typically under 120 minutes on a normal laptop. Moreover, we show how\nusers can extend the capabilities, exemplified by the successful integration of\nnon-LEGO sensors. 
By enhancing accessibility to both robotics and reinforcement\nlearning, BricksRL establishes a strong foundation for democratized robotic\nlearning in research and educational settings.\n","authors":["Sebastian Dittert","Vincent Moens","Gianni De Fabritiis"],"pdf_url":"https://arxiv.org/pdf/2406.17490v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14377v2","updated":"2024-12-02T09:48:21Z","published":"2024-05-23T09:52:15Z","title":"CoMERA: Computing- and Memory-Efficient Training via Rank-Adaptive\n Tensor Optimization","summary":" Training large AI models such as LLMs and DLRMs costs massive GPUs and\ncomputing time. The high training cost has become only affordable to big tech\ncompanies, meanwhile also causing increasing concerns about the environmental\nimpact. This paper presents CoMERA, a Computing- and Memory-Efficient training\nmethod via Rank-Adaptive tensor optimization. CoMERA achieves rank-adaptive\ntensor-compressed (pre)-training via a multi-objective optimization formulation\nand improves the training to provide both a high compression ratio and\nexcellent accuracy in the training process. Our optimized numerical computation\n(e.g., optimized tensorized embedding and tensor-network contractions) and GPU\nimplementation eliminate part of the run-time overhead in the tensorized\ntraining on GPU. This leads to, for the first time, $2-3\\times$ speedup per\ntraining epoch compared with standard training. CoMERA also outperforms the\nrecent GaLore in terms of both memory and computing efficiency. Specifically,\nCoMERA is $2\\times$ faster per training epoch and $9\\times$ more\nmemory-efficient than GaLore on a tested six-encoder transformer with\nsingle-batch training. Our method also shows $\\sim 2\\times$ speedup than\nstandard pre-training on a BERT-like code-generation LLM while achieving\n$4.23\\times$ compression ratio in pre-training. With further HPC optimization,\nCoMERA may reduce the pre-training cost of many other LLMs. 
An implementation\nof CoMERA is available at https://github.com/ziyangjoy/CoMERA.\n","authors":["Zi Yang","Ziyue Liu","Samridhi Choudhary","Xinfeng Xie","Cao Gao","Siegfried Kunzmann","Zheng Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.14377v2.pdf","comment":"Accepted by Neurips 2024"},{"id":"http://arxiv.org/abs/2409.15344v2","updated":"2024-12-02T09:45:07Z","published":"2024-09-10T07:04:48Z","title":"Video-Driven Graph Network-Based Simulators","summary":" Lifelike visualizations in design, cinematography, and gaming rely on precise\nphysics simulations, typically requiring extensive computational resources and\ndetailed physical input. This paper presents a method that can infer a system's\nphysical properties from a short video, eliminating the need for explicit\nparameter input, provided it is close to the training condition. The learned\nrepresentation is then used within a Graph Network-based Simulator to emulate\nthe trajectories of physical systems. We demonstrate that the video-derived\nencodings effectively capture the physical properties of the system and\nshowcase a linear dependence between some of the encodings and the system's\nmotion.\n","authors":["Franciszek Szewczyk","Gilles Louppe","Matthia Sabatelli"],"pdf_url":"https://arxiv.org/pdf/2409.15344v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04125v2","updated":"2024-12-02T09:42:24Z","published":"2024-07-04T18:54:30Z","title":"Query-Guided Self-Supervised Summarization of Nursing Notes","summary":" Nursing notes, an important part of Electronic Health Records (EHRs), track a\npatient's health during a care episode. Summarizing key information in nursing\nnotes can help clinicians quickly understand patients' conditions. However,\nexisting summarization methods in the clinical setting, especially abstractive\nmethods, have overlooked nursing notes and require reference summaries for\ntraining. 
We introduce QGSumm, a novel query-guided self-supervised domain\nadaptation approach for abstractive nursing note summarization. The method uses\npatient-related clinical queries for guidance, and hence does not need\nreference summaries for training. Through automatic experiments and manual\nevaluation by an expert clinician, we study our approach and other\nstate-of-the-art Large Language Models (LLMs) for nursing note summarization.\nOur experiments show: 1) GPT-4 is competitive in maintaining information in the\noriginal nursing notes, 2) QGSumm can generate high-quality summaries with a\ngood balance between recall of the original content and hallucination rate\nlower than other top methods. Ultimately, our work offers a new perspective on\nconditional text summarization, tailored to clinical applications.\n","authors":["Ya Gao","Hans Moen","Saila Koivusalo","Miika Koskinen","Pekka Marttinen"],"pdf_url":"https://arxiv.org/pdf/2407.04125v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11918v2","updated":"2024-12-02T09:21:40Z","published":"2024-11-18T03:37:33Z","title":"Artificial Intelligence Mangrove Monitoring System Based on Deep\n Learning and Sentinel-2 Satellite Data in the UAE (2017-2024)","summary":" Mangroves play a crucial role in maintaining coastal ecosystem health and\nprotecting biodiversity. Therefore, continuous mapping of mangroves is\nessential for understanding their dynamics. Earth observation imagery typically\nprovides a cost-effective way to monitor mangrove dynamics. However, there is a\nlack of regional studies on mangrove areas in the UAE. This study utilizes the\nUNet++ deep learning model combined with Sentinel-2 multispectral data and\nmanually annotated labels to monitor the spatiotemporal dynamics of densely\ndistributed mangroves (coverage greater than 70%) in the UAE from 2017 to 2024,\nachieving an mIoU of 87.8% on the validation set. 
Results show that the total\nmangrove area in the UAE in 2024 was approximately 9,142.21 hectares, an\nincrease of 2,061.33 hectares compared to 2017, with carbon sequestration\nincreasing by approximately 194,383.42 tons, equivalent to fixing about\n713,367.36 tons of carbon dioxide. Abu Dhabi has the largest mangrove area and\nplays a dominant role in the UAE's mangrove growth, increasing by 1,855.6\nhectares between 2017-2024, while other emirates have also contributed to\nmangrove expansion through stable and sustainable growth in mangrove areas.\nThis comprehensive growth pattern reflects the collective efforts of all\nemirates in mangrove restoration.\n","authors":["Linlin Tan","Haishan Wu"],"pdf_url":"https://arxiv.org/pdf/2411.11918v2.pdf","comment":"17 pages, 9 figures"},{"id":"http://arxiv.org/abs/2404.06400v2","updated":"2024-12-02T09:17:21Z","published":"2024-04-09T15:46:00Z","title":"Dynamic Deep Learning Based Super-Resolution For The Shallow Water\n Equations","summary":" Using the nonlinear shallow water equations as benchmark, we demonstrate that\na simulation with the ICON-O ocean model with a 20km resolution that is\nfrequently corrected by a U-net-type neural network can achieve discretization\nerrors of a simulation with 10km resolution. The network, originally developed\nfor image-based super-resolution in post-processing, is trained to compute the\ndifference between solutions on both meshes and is used to correct the coarse\nmesh every 12h. Our setup is the Galewsky test case, modeling transition of a\nbarotropic instability into turbulent flow. We show that the ML-corrected\ncoarse resolution run correctly maintains a balance flow and captures the\ntransition to turbulence in line with the higher resolution simulation. After 8\nday of simulation, the $L_2$-error of the corrected run is similar to a\nsimulation run on the finer mesh. 
While mass is conserved in the corrected\nruns, we observe some spurious generation of kinetic energy.\n","authors":["Maximilian Witte","Fabricio Rodrigues Lapolli","Philip Freese","Sebastian Götschel","Daniel Ruprecht","Peter Korn","Christopher Kadow"],"pdf_url":"https://arxiv.org/pdf/2404.06400v2.pdf","comment":"17 pages, 12 figures"},{"id":"http://arxiv.org/abs/2411.18810v2","updated":"2024-12-02T09:10:34Z","published":"2024-11-27T23:32:54Z","title":"Enhancing Compositional Text-to-Image Generation with Reliable Random\n Seeds","summary":" Text-to-image diffusion models have demonstrated remarkable capability in\ngenerating realistic images from arbitrary text prompts. However, they often\nproduce inconsistent results for compositional prompts such as \"two dogs\" or \"a\npenguin on the right of a bowl\". Understanding these inconsistencies is crucial\nfor reliable image generation. In this paper, we highlight the significant role\nof initial noise in these inconsistencies, where certain noise patterns are\nmore reliable for compositional prompts than others. Our analyses reveal that\ndifferent initial random seeds tend to guide the model to place objects in\ndistinct image areas, potentially adhering to specific patterns of camera\nangles and image composition associated with the seed. To improve the model's\ncompositional ability, we propose a method for mining these reliable cases,\nresulting in a curated training set of generated images without requiring any\nmanual annotation. By fine-tuning text-to-image models on these generated\nimages, we significantly enhance their compositional capabilities. For\nnumerical composition, we observe relative increases of 29.3% and 19.5% for\nStable Diffusion and PixArt-{\\alpha}, respectively. 
Spatial composition sees\neven larger gains, with 60.7% for Stable Diffusion and 21.1% for\nPixArt-{\\alpha}.\n","authors":["Shuangqi Li","Hieu Le","Jingyi Xu","Mathieu Salzmann"],"pdf_url":"https://arxiv.org/pdf/2411.18810v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17781v4","updated":"2024-12-02T08:27:06Z","published":"2024-07-25T05:22:08Z","title":"Ensemble data assimilation to diagnose AI-based weather prediction\n model: A case with ClimaX version 0.3.1","summary":" Artificial intelligence (AI)-based weather prediction research is growing\nrapidly and has shown to be competitive with the advanced dynamic numerical\nweather prediction models. However, research combining AI-based weather\nprediction models with data assimilation remains limited partially because\nlong-term sequential data assimilation cycles are required to evaluate data\nassimilation systems. This study proposes using ensemble data assimilation for\ndiagnosing AI-based weather prediction models, and marked the first successful\nimplementation of ensemble Kalman filter with AI-based weather prediction\nmodels. Our experiments with an AI-based model ClimaX demonstrated that the\nensemble data assimilation cycled stably for the AI-based weather prediction\nmodel using covariance inflation and localization techniques within the\nensemble Kalman filter. While ClimaX showed some limitations in capturing\nflow-dependent error covariance compared to dynamical models, the AI-based\nensemble forecasts provided reasonable and beneficial error covariance in\nsparsely observed regions. In addition, ensemble data assimilation revealed\nthat error growth based on ensemble ClimaX predictions was weaker than that of\ndynamical NWP models, leading to higher inflation factors. 
A series of\nexperiments demonstrated that ensemble data assimilation can be used to\ndiagnose properties of AI weather prediction models such as physical\nconsistency and accurate error growth representation.\n","authors":["Shunji Kotsuki","Kenta Shiraishi","Atsushi Okazaki"],"pdf_url":"https://arxiv.org/pdf/2407.17781v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.09710v2","updated":"2024-12-02T08:05:45Z","published":"2024-02-15T05:06:53Z","title":"Preserving Data Privacy for ML-driven Applications in Open Radio Access\n Networks","summary":" Deep learning offers a promising solution to improve spectrum access\ntechniques by utilizing data-driven approaches to manage and share limited\nspectrum resources for emerging applications. For several of these\napplications, the sensitive wireless data (such as spectrograms) are stored in\na shared database or multistakeholder cloud environment and are therefore prone\nto privacy leaks. This paper aims to address such privacy concerns by examining\nthe representative case study of shared database scenarios in 5G Open Radio\nAccess Network (O-RAN) networks where we have a shared database within the\nnear-real-time (near-RT) RAN intelligent controller. We focus on securing the\ndata that can be used by machine learning (ML) models for spectrum sharing and\ninterference mitigation applications without compromising the model and network\nperformances. The underlying idea is to leverage a (i) Shuffling-based\nlearnable encryption technique to encrypt the data, following which, (ii)\nemploy a custom Vision transformer (ViT) as the trained ML model that is\ncapable of performing accurate inferences on such encrypted data. The paper\noffers a thorough analysis and comparisons with analogous convolutional neural\nnetworks (CNN) as well as deeper architectures (such as ResNet-50) as\nbaselines. 
Our experiments showcase that the proposed approach significantly\noutperforms the baseline CNN with an improvement of 24.5% and 23.9% for the\npercent accuracy and F1-Score respectively when operated on encrypted data.\nThough deeper ResNet-50 architecture is obtained as a slightly more accurate\nmodel, with an increase of 4.4%, the proposed approach boasts a reduction of\nparameters by 99.32%, and thus, offers a much-improved prediction time by\nnearly 60%.\n","authors":["Pranshav Gajjar","Azuka Chiejina","Vijay K. Shah"],"pdf_url":"https://arxiv.org/pdf/2402.09710v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02408v2","updated":"2024-12-02T07:47:00Z","published":"2024-02-04T08:57:54Z","title":"GLaPE: Gold Label-agnostic Prompt Evaluation and Optimization for Large\n Language Model","summary":" Despite the rapid progress of large language models (LLMs), their task\nperformance remains sensitive to prompt design. Recent studies have explored\nleveraging the LLM itself as an optimizer to identify optimal prompts that\nmaximize task accuracy. However, when evaluating prompts, such approaches\nheavily rely on elusive manually annotated gold labels to calculate task\naccuracy for each candidate prompt, which hinders the widespread implementation\nand generality. To overcome the limitation, this work proposes a gold\nlabel-agnostic prompt evaluation (GLaPE) to alleviate dependence on gold\nlabels. Motivated by the observed correlation between self-consistency and the\naccuracy of the answer, we adopt self-consistency as the initial evaluation\nscore. Subsequently, we refine the scores of prompts producing identical\nanswers to be mutually consistent. Experimental results show that GLaPE\nprovides reliable evaluations uniform with accuracy, even in the absence of\ngold labels. Moreover, on six popular reasoning tasks, our GLaPE-based prompt\noptimization yields effective prompts comparable to accuracy-based ones. 
The\ncode is publicly available at https://github.com/thunderous77/GLaPE.\n","authors":["Xuanchang Zhang","Zhuosheng Zhang","Hai Zhao"],"pdf_url":"https://arxiv.org/pdf/2402.02408v2.pdf","comment":"EMNLP 2024"},{"id":"http://arxiv.org/abs/2410.13381v2","updated":"2024-12-02T07:18:46Z","published":"2024-10-17T09:36:01Z","title":"Learning Counterfactual Distributions via Kernel Nearest Neighbors","summary":" Consider a setting with multiple units (e.g., individuals, cohorts,\ngeographic locations) and outcomes (e.g., treatments, times, items), where the\ngoal is to learn a multivariate distribution for each unit-outcome entry, such\nas the distribution of a user's weekly spend and engagement under a specific\nmobile app version. A common challenge is the prevalence of missing not at\nrandom data, where observations are available only for certain unit-outcome\ncombinations and the observation availability can be correlated with the\nproperties of distributions themselves, i.e., there is unobserved confounding.\nAn additional challenge is that for any observed unit-outcome entry, we only\nhave a finite number of samples from the underlying distribution. We tackle\nthese two challenges by casting the problem into a novel distributional matrix\ncompletion framework and introduce a kernel based distributional generalization\nof nearest neighbors to estimate the underlying distributions. By leveraging\nmaximum mean discrepancies and a suitable factor model on the kernel mean\nembeddings of the underlying distributions, we establish consistent recovery of\nthe underlying distributions even when data is missing not at random and\npositivity constraints are violated. 
Furthermore, we demonstrate that our\nnearest neighbors approach is robust to heteroscedastic noise, provided we have\naccess to two or more measurements for the observed unit-outcome entries, a\nrobustness not present in prior works on nearest neighbors with single\nmeasurements.\n","authors":["Kyuseong Choi","Jacob Feitelberg","Caleb Chin","Anish Agarwal","Raaz Dwivedi"],"pdf_url":"https://arxiv.org/pdf/2410.13381v2.pdf","comment":"39 pages, 8 figures"},{"id":"http://arxiv.org/abs/2301.13516v3","updated":"2024-12-02T07:07:42Z","published":"2023-01-31T10:07:23Z","title":"Recurrences reveal shared causal drivers of complex time series","summary":" Unmeasured causal forces influence diverse experimental time series, such as\nthe transcription factors that regulate genes, or the descending neurons that\nsteer motor circuits. Combining the theory of skew-product dynamical systems\nwith topological data analysis, we show that simultaneous recurrence events\nacross multiple time series reveal the structure of their shared unobserved\ndriving signal. We introduce a physics-based unsupervised learning algorithm\nthat reconstructs causal drivers by iteratively building a recurrence graph\nwith glass-like structure. As the amount of data increases, a percolation\ntransition on this graph leads to weak ergodicity breaking for random walks --\nrevealing the shared driver's dynamics, even from strongly-corrupted\nmeasurements. We relate reconstruction accuracy to the rate of information\ntransfer from a chaotic driver to the response systems, and we find that\neffective reconstruction proceeds through gradual approximation of the driver's\ndynamical attractor. 
Through extensive benchmarks against classical signal\nprocessing and machine learning techniques, we demonstrate our method's ability\nto extract causal drivers from diverse experimental datasets spanning ecology,\ngenomics, fluid dynamics, and physiology.\n","authors":["William Gilpin"],"pdf_url":"https://arxiv.org/pdf/2301.13516v3.pdf","comment":"Physical Review X (to appear). Code available online at\n https://github.com/williamgilpin/shrec"},{"id":"http://arxiv.org/abs/2406.06594v2","updated":"2024-12-02T07:04:17Z","published":"2024-06-06T03:13:34Z","title":"Stock Movement Prediction with Multimodal Stable Fusion via Gated\n Cross-Attention Mechanism","summary":" The accurate prediction of stock movements is crucial for investment\nstrategies. Stock prices are subject to the influence of various forms of\ninformation, including financial indicators, sentiment analysis, news\ndocuments, and relational structures. Predominant analytical approaches,\nhowever, tend to address only unimodal or bimodal sources, neglecting the\ncomplexity of multimodal data. Further complicating the landscape are the\nissues of data sparsity and semantic conflicts between these modalities, which\nare frequently overlooked by current models, leading to unstable performance\nand limiting practical applicability. To address these shortcomings, this study\nintroduces a novel architecture, named Multimodal Stable Fusion with Gated\nCross-Attention (MSGCA), designed to robustly integrate multimodal input for\nstock movement prediction. 
The MSGCA framework consists of three integral\ncomponents: (1) a trimodal encoding module, responsible for processing\nindicator sequences, dynamic documents, and a relational graph, and\nstandardizing their feature representations; (2) a cross-feature fusion module,\nwhere primary and consistent features guide the multimodal fusion of the three\nmodalities via a pair of gated cross-attention networks; and (3) a prediction\nmodule, which refines the fused features through temporal and dimensional\nreduction to execute precise movement forecasting. Empirical evaluations\ndemonstrate that the MSGCA framework exceeds current leading methods, achieving\nperformance gains of 8.1%, 6.1%, 21.7% and 31.6% on four multimodal datasets,\nrespectively, attributed to its enhanced multimodal fusion stability.\n","authors":["Chang Zong","Hang Zhou"],"pdf_url":"https://arxiv.org/pdf/2406.06594v2.pdf","comment":"14 pages, 10 figures"},{"id":"http://arxiv.org/abs/2409.10825v3","updated":"2024-12-02T07:00:57Z","published":"2024-09-17T01:37:57Z","title":"Unveiling and Mitigating Bias in Large Language Model Recommendations: A\n Path to Fairness","summary":" excel in delivering comprehensive suggestions by deeply analyzing content and\nuser behavior. However, they often inherit biases from skewed training data,\nfavoring mainstream content while underrepresenting diverse or non-traditional\noptions. This study explores the interplay between bias and LLM-based\nrecommendation systems, focusing on music, song, and book recommendations\nacross diverse demographic and cultural groups. This paper analyzes bias in\nLLM-based recommendation systems across multiple models (GPT, LLaMA, and\nGemini), revealing its deep and pervasive impact on outcomes. Intersecting\nidentities and contextual factors, like socioeconomic status, further amplify\nbiases, complicating fair recommendations across diverse groups. 
Our findings\nreveal that bias in these systems is deeply ingrained, yet even simple\ninterventions like prompt engineering can significantly reduce it. We further\npropose a retrieval-augmented generation strategy to mitigate bias more\neffectively. Numerical experiments validate these strategies, demonstrating\nboth the pervasive nature of bias and the impact of the proposed solutions.\n","authors":["Anindya Bijoy Das","Shahnewaz Karim Sakib"],"pdf_url":"https://arxiv.org/pdf/2409.10825v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.10137v3","updated":"2024-12-02T06:55:10Z","published":"2024-10-14T04:07:45Z","title":"Variational autoencoders with latent high-dimensional steady geometric\n flows for dynamics","summary":" We develop Riemannian approaches to variational autoencoders (VAEs) for\nPDE-type ambient data with regularizing geometric latent dynamics, which we\nrefer to as VAE-DLM, or VAEs with dynamical latent manifolds. We redevelop the\nVAE framework such that manifold geometries, subject to our geometric flow,\nembedded in Euclidean space are learned in the intermediary latent space\ndeveloped by encoders and decoders. By tailoring the geometric flow in which\nthe latent space evolves, we induce latent geometric properties of our\nchoosing, which are reflected in empirical performance. We reformulate the\ntraditional evidence lower bound (ELBO) loss with a considerate choice of\nprior. We develop a linear geometric flow with a steady-state regularizing\nterm. This flow requires only automatic differentiation of one time derivative,\nand can be solved in moderately high dimensions in a physics-informed approach,\nallowing more expressive latent representations. We discuss how this flow can\nbe formulated as a gradient flow, and maintains entropy away from metric\nsingularity. 
This, along with an eigenvalue penalization condition, helps\nensure the manifold is sufficiently large in measure, nondegenerate, and a\ncanonical geometry, which contribute to a robust representation. Our methods\nfocus on the modified multi-layer perceptron architecture with tanh activations\nfor the manifold encoder-decoder. We demonstrate, on our datasets of interest,\nour methods perform at least as well as the traditional VAE, and oftentimes\nbetter. Our methods can outperform this and a VAE endowed with our proposed\narchitecture by up to 25% reduction in out-of-distribution (OOD) error and\npotentially greater. We highlight our method on ambient PDEs whose solutions\nmaintain minimal variation in late times. We provide empirical justification\ntowards how we can improve robust learning for external dynamics with VAEs.\n","authors":["Andrew Gracyk"],"pdf_url":"https://arxiv.org/pdf/2410.10137v3.pdf","comment":"Minor fixes; added details to proofs in the appendix"},{"id":"http://arxiv.org/abs/2411.19951v2","updated":"2024-12-02T06:54:47Z","published":"2024-11-29T18:59:54Z","title":"T2Vid: Translating Long Text into Multi-Image is the Catalyst for\n Video-LLMs","summary":" The success of Multimodal Large Language Models (MLLMs) in the image domain\nhas garnered wide attention from the research community. Drawing on previous\nsuccessful experiences, researchers have recently explored extending the\nsuccess to the video understanding realms. Apart from training from scratch, an\nefficient way is to utilize the pre-trained image-LLMs, leading to two\nmainstream approaches, i.e. zero-shot inference and further fine-tuning with\nvideo data. In this work, our study of these approaches harvests an effective\ndata augmentation method. We first make a deeper inspection of the zero-shot\ninference way and identify two limitations, i.e. limited generalization and\nlack of temporal understanding capabilities. 
Thus, we further investigate the\nfine-tuning approach and find a low learning efficiency when simply using all\nthe video data samples, which can be attributed to a lack of instruction\ndiversity. Aiming at this issue, we develop a method called T2Vid to synthesize\nvideo-like samples to enrich the instruction diversity in the training corpus.\nIntegrating these data enables a simple and efficient training scheme, which\nachieves performance comparable to or even superior to using full video\ndatasets by training with just 15% the sample size. Meanwhile, we find that the\nproposed scheme can boost the performance of long video understanding without\ntraining with long video samples. We hope our study will spark more thinking\nabout using MLLMs for video understanding and curation of high-quality data.\nThe code is released at https://github.com/xjtupanda/T2Vid.\n","authors":["Shukang Yin","Chaoyou Fu","Sirui Zhao","Yunhang Shen","Chunjiang Ge","Yan Yang","Zuwei Long","Yuhan Dai","Tong Xu","Xing Sun","Ran He","Caifeng Shan","Enhong Chen"],"pdf_url":"https://arxiv.org/pdf/2411.19951v2.pdf","comment":"Project page: https://github.com/xjtupanda/T2Vid"},{"id":"http://arxiv.org/abs/2410.13025v2","updated":"2024-12-02T06:40:50Z","published":"2024-10-16T20:33:06Z","title":"LoRA Soups: Merging LoRAs for Practical Skill Composition Tasks","summary":" Low-Rank Adaptation (LoRA) is a popular technique for parameter-efficient\nfine-tuning of Large Language Models (LLMs). We study how different LoRA\nmodules can be merged to achieve skill composition -- testing the performance\nof the merged model on a target task that involves combining multiple skills,\neach skill coming from a single LoRA. This setup is favorable when it is\ndifficult to obtain training data for the target task and when it can be\ndecomposed into multiple skills. 
First, we identify practically occurring\nuse-cases that can be studied under the realm of skill composition, e.g.\nsolving hard math-word problems with code, creating a bot to answer questions\non proprietary manuals or about domain-specialized corpora. Our main\ncontribution is to show that concatenation of LoRAs (CAT), which optimally\nweights LoRAs that were individually trained on different skills, outperforms\nexisting model- and data- merging techniques; for instance on math-word\nproblems, CAT beats these methods by an average of 43% and 12% respectively.\nThus, this paper advocates model merging as an efficient way to solve\ncompositional tasks and underscores CAT as a simple, compute-friendly and\neffective procedure. To our knowledge, this is the first work demonstrating the\nsuperiority of model merging over data mixing for binary skill composition\ntasks. Code and data are available at https://github.com/aksh555/LoRA-Soups\n","authors":["Akshara Prabhakar","Yuanzhi Li","Karthik Narasimhan","Sham Kakade","Eran Malach","Samy Jelassi"],"pdf_url":"https://arxiv.org/pdf/2410.13025v2.pdf","comment":"COLING 2025 Industry track; 9 pages plus references and appendices"},{"id":"http://arxiv.org/abs/2411.16698v2","updated":"2024-12-02T06:31:31Z","published":"2024-11-10T18:28:30Z","title":"Universal on-chip polarization handling with deep photonic networks","summary":" We propose a novel design paradigm for arbitrarily capable deep photonic\nnetworks of cascaded Mach-Zehnder Interferometers (MZIs) for on-chip universal\npolarization handling. 
Using a device architecture made of cascaded\nMach-Zehnder interferometers, we modify and train the phase difference between\ninterferometer arms for both polarizations through wide operation bandwidths.\nThree proof-of-concept polarization handling devices are illustrated using a\nsoftware-defined, physics-informed neural framework, to achieve user-specified\ntarget device responses as functions of polarization and wavelength. These\ndevices include a polarization splitter, a polarization-independent power\nsplitter, and an arbitrary polarization-dependent splitter to illustrate the\ncapabilities of the design framework. The performance for all three devices is\noptimized using transfer matrix calculations; and their final responses are\nverified through 3D-FDTD simulations. All devices demonstrate state-of-the-art\nperformance metrics with over 20 dB extinction, and flat-top transmission bands\nthrough bandwidths of 120 nm. In addition to the functional diversity enabled,\nthe optimization for each device is completed in under a minute, highlighting\nthe computational efficiency of the design paradigm presented. These results\ndemonstrate the versatility of the deep photonic network design ecosystem in\npolarization management, unveiling promising prospects for advanced on-chip\napplications in optical communications, sensing, and computing.\n","authors":["Aycan Deniz Vit","Ujal Rzayev","Bahrem Serhat Danis","Ali Najjar Amiri","Kazim Gorgulu","Emir Salih Magden"],"pdf_url":"https://arxiv.org/pdf/2411.16698v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.19943v2","updated":"2024-12-02T06:26:38Z","published":"2024-11-29T18:58:22Z","title":"Critical Tokens Matter: Token-Level Contrastive Estimation Enhances\n LLM's Reasoning Capability","summary":" Large Language Models (LLMs) have exhibited remarkable performance on\nreasoning tasks. 
They utilize autoregressive token generation to construct\nreasoning trajectories, enabling the development of a coherent chain of\nthought. In this work, we explore the impact of individual tokens on the final\noutcomes of reasoning tasks. We identify the existence of ``critical tokens''\nthat lead to incorrect reasoning trajectories in LLMs. Specifically, we find\nthat LLMs tend to produce positive outcomes when forced to decode other tokens\ninstead of critical tokens. Motivated by this observation, we propose a novel\napproach - cDPO - designed to automatically recognize and conduct token-level\nrewards for the critical tokens during the alignment process. Specifically, we\ndevelop a contrastive estimation approach to automatically identify critical\ntokens. It is achieved by comparing the generation likelihood of positive and\nnegative models. To achieve this, we separately fine-tune the positive and\nnegative models on various reasoning trajectories, consequently, they are\ncapable of identifying identify critical tokens within incorrect trajectories\nthat contribute to erroneous outcomes. 
Moreover, to further align the model\nwith the critical token information during the alignment process, we extend the\nconventional DPO algorithms to token-level DPO and utilize the differential\nlikelihood from the aforementioned positive and negative model as important\nweight for token-level DPO learning.Experimental results on GSM8K and MATH500\nbenchmarks with two-widely used models Llama-3 (8B and 70B) and deepseek-math\n(7B) demonstrate the effectiveness of the propsoed approach cDPO.\n","authors":["Zicheng Lin","Tian Liang","Jiahao Xu","Xing Wang","Ruilin Luo","Chufan Shi","Siheng Li","Yujiu Yang","Zhaopeng Tu"],"pdf_url":"https://arxiv.org/pdf/2411.19943v2.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2411.18122v3","updated":"2024-12-02T06:15:54Z","published":"2024-11-27T08:02:31Z","title":"Fighting Bias with Bias: A Machine Learning Approach to Assess Human\n Bias","summary":" Biased human decisions have consequential impacts across various domains,\nyielding unfair treatment of individuals and resulting in suboptimal outcomes\nfor organizations and society. In recognition of this fact, organizations\nregularly design and deploy interventions aimed at mitigating these biases.\nHowever, measuring human decision biases remains an important but elusive task.\nOrganizations are frequently concerned with mistaken decisions\ndisproportionately affecting one group. In practice, however, this is typically\nnot possible to assess due to the scarcity of a gold standard: a label that\nindicates what the correct decision would have been. In this work, we propose a\nmachine learning-based framework to assess bias in human-generated decisions\nwhen gold standard labels are scarce. We provide theoretical guarantees and\nempirical evidence demonstrating the superiority of our method over existing\nalternatives. 
This proposed methodology establishes a foundation for\ntransparency in human decision-making, carrying substantial implications for\nmanagerial duties, and offering potential for alleviating algorithmic biases\nwhen human decisions are used as labels to train algorithms.\n","authors":["Wanxue Dong","Maria De-arteaga","Maytal Saar-Tsechansky"],"pdf_url":"https://arxiv.org/pdf/2411.18122v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14778v5","updated":"2024-12-02T05:02:11Z","published":"2024-08-27T04:56:45Z","title":"GPU-Accelerated Counterfactual Regret Minimization","summary":" Counterfactual regret minimization is a family of algorithms of no-regret\nlearning dynamics capable of solving large-scale imperfect information games.\nWe propose implementing this algorithm as a series of dense and sparse matrix\nand vector operations, thereby making it highly parallelizable for a graphical\nprocessing unit, at a cost of higher memory usage. Our experiments show that\nour implementation performs up to about 401.2 times faster than OpenSpiel's\nPython implementation and, on an expanded set of games, up to about 203.6 times\nfaster than OpenSpiel's C++ implementation and the speedup becomes more\npronounced as the size of the game being solved grows.\n","authors":["Juho Kim"],"pdf_url":"https://arxiv.org/pdf/2408.14778v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.02038v2","updated":"2024-12-02T04:20:10Z","published":"2024-10-02T21:08:11Z","title":"Realizable Continuous-Space Shields for Safe Reinforcement Learning","summary":" While Deep Reinforcement Learning (DRL) has achieved remarkable success\nacross various domains, it remains vulnerable to occasional catastrophic\nfailures without additional safeguards. An effective solution to prevent these\nfailures is to use a shield that validates and adjusts the agent's actions to\nensure compliance with a provided set of safety specifications. 
For real-world\nrobotic domains, it is essential to define safety specifications over\ncontinuous state and action spaces to accurately account for system dynamics\nand compute new actions that minimally deviate from the agent's original\ndecision. In this paper, we present the first shielding approach specifically\ndesigned to ensure the satisfaction of safety requirements in continuous state\nand action spaces, making it suitable for practical robotic applications. Our\nmethod builds upon realizability, an essential property that confirms the\nshield will always be able to generate a safe action for any state in the\nenvironment. We formally prove that realizability can be verified for stateful\nshields, enabling the incorporation of non-Markovian safety requirements, such\nas loop avoidance. Finally, we demonstrate the effectiveness of our approach in\nensuring safety without compromising the policy's success rate by applying it\nto a navigation problem and a multi-agent particle environment.\n","authors":["Kyungmin Kim","Davide Corsi","Andoni Rodriguez","JB Lanier","Benjami Parellada","Pierre Baldi","Cesar Sanchez","Roy Fox"],"pdf_url":"https://arxiv.org/pdf/2410.02038v2.pdf","comment":"Kim, Corsi, and Rodriguez contributed equally"},{"id":"http://arxiv.org/abs/2406.12336v2","updated":"2024-12-02T04:08:49Z","published":"2024-06-18T07:03:34Z","title":"Towards Understanding Domain Adapted Sentence Embeddings for Document\n Retrieval","summary":" A plethora of sentence embedding models makes it challenging to choose one,\nespecially for technical domains rich with specialized vocabulary. In this\nwork, we domain adapt embeddings using telecom, health and science datasets for\nquestion answering. We evaluate embeddings obtained from publicly available\nmodels and their domain-adapted variants, on both point retrieval accuracies,\nas well as their (95\\%) confidence intervals. 
We establish a systematic method\nto obtain thresholds for similarity scores for different embeddings. As\nexpected, we observe that fine-tuning improves mean bootstrapped accuracies. We\nalso observe that it results in tighter confidence intervals, which further\nimprove when pre-training is preceded by fine-tuning. We introduce metrics\nwhich measure the distributional overlaps of top-$K$, correct and random\ndocument similarities with the question. Further, we show that these metrics\nare correlated with retrieval accuracy and similarity thresholds. Recent\nliterature shows conflicting effects of isotropy on retrieval accuracies. Our\nexperiments establish that the isotropy of embeddings (as measured by two\nindependent state-of-the-art isotropy metric definitions) is poorly correlated\nwith retrieval performance. We show that embeddings for domain-specific\nsentences have little overlap with those for domain-agnostic ones, and\nfine-tuning moves them further apart. Based on our results, we provide\nrecommendations for use of our methodology and metrics by researchers and\npractitioners.\n","authors":["Sujoy Roychowdhury","Sumit Soman","H. G. Ranjani","Vansh Chhabra","Neeraj Gunda","Shashank Gautam","Subhadip Bandyopadhyay","Sai Krishna Bala"],"pdf_url":"https://arxiv.org/pdf/2406.12336v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.12707v3","updated":"2024-12-02T03:45:42Z","published":"2024-07-17T16:30:27Z","title":"TTSDS -- Text-to-Speech Distribution Score","summary":" Many recently published Text-to-Speech (TTS) systems produce audio close to\nreal speech. However, TTS evaluation needs to be revisited to make sense of the\nresults obtained with the new architectures, approaches and datasets. We\npropose evaluating the quality of synthetic speech as a combination of multiple\nfactors such as prosody, speaker identity, and intelligibility. 
Our approach\nassesses how well synthetic speech mirrors real speech by obtaining correlates\nof each factor and measuring their distance from both real speech datasets and\nnoise datasets. We benchmark 35 TTS systems developed between 2008 and 2024 and\nshow that our score computed as an unweighted average of factors strongly\ncorrelates with the human evaluations from each time period.\n","authors":["Christoph Minixhofer","Ondřej Klejch","Peter Bell"],"pdf_url":"https://arxiv.org/pdf/2407.12707v3.pdf","comment":"SLT 2024"},{"id":"http://arxiv.org/abs/2410.09640v3","updated":"2024-12-02T03:41:51Z","published":"2024-10-12T20:33:37Z","title":"Provable Acceleration of Nesterov's Accelerated Gradient for Rectangular\n Matrix Factorization and Linear Neural Networks","summary":" We study the convergence rate of first-order methods for rectangular matrix\nfactorization, which is a canonical nonconvex optimization problem.\nSpecifically, given a rank-$r$ matrix $\\mathbf{A}\\in\\mathbb{R}^{m\\times n}$, we\nprove that gradient descent (GD) can find a pair of $\\epsilon$-optimal\nsolutions $\\mathbf{X}_T\\in\\mathbb{R}^{m\\times d}$ and\n$\\mathbf{Y}_T\\in\\mathbb{R}^{n\\times d}$, where $d\\geq r$, satisfying\n$\\lVert\\mathbf{X}_T\\mathbf{Y}_T^\\top-\\mathbf{A}\\rVert_\\mathrm{F}\\leq\\epsilon\\lVert\\mathbf{A}\\rVert_\\mathrm{F}$\nin $T=O(\\kappa^2\\log\\frac{1}{\\epsilon})$ iterations with high probability,\nwhere $\\kappa$ denotes the condition number of $\\mathbf{A}$. Furthermore, we\nprove that Nesterov's accelerated gradient (NAG) attains an iteration\ncomplexity of $O(\\kappa\\log\\frac{1}{\\epsilon})$, which is the best-known bound\nof first-order methods for rectangular matrix factorization. Different from\nsmall balanced random initialization in the existing literature, we adopt an\nunbalanced initialization, where $\\mathbf{X}_0$ is large and $\\mathbf{Y}_0$ is\n$0$. 
Moreover, our initialization and analysis can be further extended to\nlinear neural networks, where we prove that NAG can also attain an accelerated\nlinear convergence rate. In particular, we only require the width of the\nnetwork to be greater than or equal to the rank of the output label matrix. In\ncontrast, previous results achieving the same rate require excessive widths\nthat additionally depend on the condition number and the rank of the input data\nmatrix.\n","authors":["Zhenghao Xu","Yuqing Wang","Tuo Zhao","Rachel Ward","Molei Tao"],"pdf_url":"https://arxiv.org/pdf/2410.09640v3.pdf","comment":"30 pages (checklist included)"},{"id":"http://arxiv.org/abs/2411.19527v2","updated":"2024-12-02T03:34:45Z","published":"2024-11-29T07:54:56Z","title":"DisCoRD: Discrete Tokens to Continuous Motion via Rectified Flow\n Decoding","summary":" Human motion, inherently continuous and dynamic, presents significant\nchallenges for generative models. Despite their dominance, discrete\nquantization methods, such as VQ-VAEs, suffer from inherent limitations,\nincluding restricted expressiveness and frame-wise noise artifacts. Continuous\napproaches, while producing smoother and more natural motions, often falter due\nto high-dimensional complexity and limited training data. To resolve this\n\"discord\" between discrete and continuous representations, we introduce\nDisCoRD: Discrete Tokens to Continuous Motion via Rectified Flow Decoding, a\nnovel method that decodes discrete motion tokens into continuous motion through\nrectified flow. By employing an iterative refinement process in the continuous\nspace, DisCoRD captures fine-grained dynamics and ensures smoother and more\nnatural motions. Compatible with any discrete-based framework, our method\nenhances naturalness without compromising faithfulness to the conditioning\nsignals. Extensive evaluations demonstrate that DisCoRD achieves\nstate-of-the-art performance, with FID of 0.032 on HumanML3D and 0.169 on\nKIT-ML. 
These results solidify DisCoRD as a robust solution for bridging the\ndivide between discrete efficiency and continuous realism. Our project page is\navailable at: https://whwjdqls.github.io/discord.github.io/.\n","authors":["Jungbin Cho","Junwan Kim","Jisoo Kim","Minseo Kim","Mingu Kang","Sungeun Hong","Tae-Hyun Oh","Youngjae Yu"],"pdf_url":"https://arxiv.org/pdf/2411.19527v2.pdf","comment":"20 pages 18 figures"},{"id":"http://arxiv.org/abs/2404.01245v3","updated":"2024-12-02T03:27:10Z","published":"2024-04-01T17:03:41Z","title":"A Statistical Framework of Watermarks for Large Language Models: Pivot,\n Detection Efficiency and Optimal Rules","summary":" Since ChatGPT was introduced in November 2022, embedding (nearly)\nunnoticeable statistical signals into text generated by large language models\n(LLMs), also known as watermarking, has been used as a principled approach to\nprovable detection of LLM-generated text from its human-written counterpart. In\nthis paper, we introduce a general and flexible framework for reasoning about\nthe statistical efficiency of watermarks and designing powerful detection\nrules. Inspired by the hypothesis testing formulation of watermark detection,\nour framework starts by selecting a pivotal statistic of the text and a secret\nkey -- provided by the LLM to the verifier -- to enable controlling the false\npositive rate (the error of mistakenly detecting human-written text as\nLLM-generated). Next, this framework allows one to evaluate the power of\nwatermark detection rules by obtaining a closed-form expression of the\nasymptotic false negative rate (the error of incorrectly classifying\nLLM-generated text as human-written). Our framework further reduces the problem\nof determining the optimal detection rule to solving a minimax optimization\nprogram. 
We apply this framework to two representative watermarks -- one of\nwhich has been internally implemented at OpenAI -- and obtain several findings\nthat can be instrumental in guiding the practice of implementing watermarks. In\nparticular, we derive optimal detection rules for these watermarks under our\nframework. These theoretically derived detection rules are demonstrated to be\ncompetitive and sometimes enjoy a higher power than existing detection\napproaches through numerical experiments.\n","authors":["Xiang Li","Feng Ruan","Huiyuan Wang","Qi Long","Weijie J. Su"],"pdf_url":"https://arxiv.org/pdf/2404.01245v3.pdf","comment":"To appear in the Annals of Statistics"},{"id":"http://arxiv.org/abs/2405.10515v2","updated":"2024-12-02T02:16:35Z","published":"2024-05-17T03:47:30Z","title":"Improved AdaBoost for Virtual Reality Experience Prediction Based on\n Long Short-Term Memory Network","summary":" A classification prediction algorithm based on Long Short-Term Memory Network\n(LSTM) improved AdaBoost is used to predict virtual reality (VR) user\nexperience. The dataset is randomly divided into training and test sets in the\nratio of 7:3.During the training process, the model's loss value decreases from\n0.65 to 0.31, which shows that the model gradually reduces the discrepancy\nbetween the prediction results and the actual labels, and improves the accuracy\nand generalisation ability.The final loss value of 0.31 indicates that the\nmodel fits the training data well, and is able to make predictions and\nclassifications more accurately. The confusion matrix for the training set\nshows a total of 177 correct predictions and 52 incorrect predictions, with an\naccuracy of 77%, precision of 88%, recall of 77% and f1 score of 82%. The\nconfusion matrix for the test set shows a total of 167 correct and 53 incorrect\npredictions with 75% accuracy, 87% precision, 57% recall and 69% f1 score. 
In\nsummary, the classification prediction algorithm based on LSTM with improved\nAdaBoost shows good prediction ability for virtual reality user experience.\nThis study is of great significance to enhance the application of virtual\nreality technology in user experience. By combining LSTM and AdaBoost\nalgorithms, significant progress has been made in user experience prediction,\nwhich not only improves the accuracy and generalisation ability of the model,\nbut also provides useful insights for related research in the field of virtual\nreality. This approach can help developers better understand user requirements,\noptimise virtual reality product design, and enhance user satisfaction,\npromoting the wide application of virtual reality technology in various fields.\n","authors":["Wenhan Fan","Zhicheng Ding","Ruixin Huang","Chang Zhou","Xuyang Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.10515v2.pdf","comment":"This work has been peer-reviewed in The 2nd International Conference\n on Software Engineering and Machine Learning and published in Applied and\n Computational Engineering, DOI:\n https://doi.org/10.54254/2755-2721/77/20240678"},{"id":"http://arxiv.org/abs/2405.02326v2","updated":"2024-12-02T01:59:30Z","published":"2024-04-23T18:55:49Z","title":"Evaluating LLMs for Hardware Design and Test","summary":" Large Language Models (LLMs) have demonstrated capabilities for producing\ncode in Hardware Description Languages (HDLs). However, most of the focus\nremains on their abilities to write functional code, not test code. The\nhardware design process consists of both design and test, and so eschewing\nvalidation and verification leaves considerable potential benefit unexplored,\ngiven that a design and test framework may allow for progress towards full\nautomation of the digital design pipeline. In this work, we perform one of the\nfirst studies exploring how a LLM can both design and test hardware modules\nfrom provided specifications. 
Using a suite of 8 representative benchmarks, we\nexamined the capabilities and limitations of the state-of-the-art\nconversational LLMs when producing Verilog for functional and verification\npurposes. We taped out the benchmarks on a Skywater 130nm shuttle and received\nthe functional chip.\n","authors":["Jason Blocklove","Siddharth Garg","Ramesh Karri","Hammond Pearce"],"pdf_url":"https://arxiv.org/pdf/2405.02326v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04374v2","updated":"2024-12-02T01:38:05Z","published":"2023-12-07T15:44:56Z","title":"Deep Dynamics: Vehicle Dynamics Modeling with a Physics-Constrained\n Neural Network for Autonomous Racing","summary":" Autonomous racing is a critical research area for autonomous driving,\npresenting significant challenges in vehicle dynamics modeling, such as\nbalancing model precision and computational efficiency at high speeds\n(>280km/h), where minor errors in modeling have severe consequences. Existing\nphysics-based models for vehicle dynamics require elaborate testing setups and\ntuning, which are hard to implement, time-intensive, and cost-prohibitive.\nConversely, purely data-driven approaches do not generalize well and cannot\nadequately ensure physical constraints on predictions. This paper introduces\nDeep Dynamics, a physics-constrained neural network (PCNN) for vehicle dynamics\nmodeling of an autonomous racecar. It combines physics coefficient estimation\nand dynamical equations to accurately predict vehicle states at high speeds and\nincludes a unique Physics Guard layer to ensure internal coefficient estimates\nremain within their nominal physical ranges. 
Open-loop and closed-loop\nperformance assessments, using a physics-based simulator and full-scale\nautonomous Indy racecar data, highlight Deep Dynamics as a promising approach\nfor modeling racecar vehicle dynamics.\n","authors":["John Chrosniak","Jingyun Ning","Madhur Behl"],"pdf_url":"https://arxiv.org/pdf/2312.04374v2.pdf","comment":"Published in the IEEE Robotics and Automation Letters and presented\n at the IEEE International Conference on Intelligent Robots and Systems"},{"id":"http://arxiv.org/abs/2208.08287v2","updated":"2024-12-02T01:00:34Z","published":"2022-08-17T13:29:14Z","title":"Noisy Nonnegative Tucker Decomposition with Sparse Factors and Missing\n Data","summary":" Tensor decomposition is a powerful tool for extracting physically meaningful\nlatent factors from multi-dimensional nonnegative data, and has been an\nincreasing interest in a variety of fields such as image processing, machine\nlearning, and computer vision. In this paper, we propose a sparse nonnegative\nTucker decomposition and completion method for the recovery of underlying\nnonnegative data under noisy observations. Here the underlying nonnegative data\ntensor is decomposed into a core tensor and several factor matrices with all\nentries being nonnegative and the factor matrices being sparse. The loss\nfunction is derived by the maximum likelihood estimation of the noisy\nobservations, and the $\\ell_0$ norm is employed to enhance the sparsity of the\nfactor matrices. We establish the error bound of the estimator of the proposed\nmodel under generic noise scenarios, which is then specified to the\nobservations with additive Gaussian noise, additive Laplace noise, and Poisson\nobservations, respectively. Our theoretical results are better than those by\nexisting tensor-based or matrix-based methods. Moreover, the minimax lower\nbounds are shown to be matched with the derived upper bounds up to logarithmic\nfactors. 
Numerical examples on both synthetic and real-world data sets\ndemonstrate the superiority of the proposed method for nonnegative tensor data\ncompletion.\n","authors":["Xiongjun Zhang","Michael K. Ng"],"pdf_url":"https://arxiv.org/pdf/2208.08287v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.03641v3","updated":"2024-12-02T00:54:47Z","published":"2023-04-07T13:44:59Z","title":"A Block Coordinate Descent Method for Nonsmooth Composite Optimization\n under Orthogonality Constraints","summary":" Nonsmooth composite optimization with orthogonality constraints has a wide\nrange of applications in statistical learning and data science. However, this\nproblem is challenging due to its nonsmooth objective and computationally\nexpensive, non-convex constraints. In this paper, we propose a new approach\ncalled \\textbf{OBCD}, which leverages Block Coordinate Descent to address these\nchallenges. \\textbf{OBCD} is a feasible method with a small computational\nfootprint. In each iteration, it updates $k$ rows of the solution matrix, where\n$k \\geq 2$, by globally solving a small nonsmooth optimization problem under\northogonality constraints. We prove that the limiting points of \\textbf{OBCD},\nreferred to as (global) block-$k$ stationary points, offer stronger optimality\nthan standard critical points. Furthermore, we show that \\textbf{OBCD}\nconverges to $\\epsilon$-block-$k$ stationary points with an ergodic convergence\nrate of $\\mathcal{O}(1/\\epsilon)$. Additionally, under the Kurdyka-Lojasiewicz\n(KL) inequality, we establish the non-ergodic convergence rate of\n\\textbf{OBCD}. We also extend \\textbf{OBCD} by incorporating breakpoint\nsearching methods for subproblem solving and greedy strategies for working set\nselection. 
Comprehensive experiments demonstrate the superior performance of\nour approach across various tasks.\n","authors":["Ganzhao Yuan"],"pdf_url":"https://arxiv.org/pdf/2304.03641v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.16791v2","updated":"2024-12-02T00:12:35Z","published":"2024-06-24T16:55:03Z","title":"Enabling more efficient and cost-effective AI/ML systems with Collective\n Mind, virtualized MLOps, MLPerf, Collective Knowledge Playground and\n reproducible optimization tournaments","summary":" This white paper introduces my educational community initiative to learn how\nto run AI, ML and other emerging workloads in the most efficient and\ncost-effective way across diverse models, data sets, software and hardware.\nThis project leverages Collective Mind (CM), virtualized MLOps and DevOps\n(CM4MLOps), MLPerf benchmarks, and the Collective Knowledge playground (CK),\nwhich I have developed in collaboration with the community and MLCommons.\n I created Collective Mind as a small and portable Python package with minimal\ndependencies, a unified CLI and Python API to help researchers and engineers\nautomate repetitive, tedious, and time-consuming tasks. I also designed CM as a\ndistributed framework, continuously enhanced by the community through the CM4*\nrepositories, which function as the unified interface for organizing and\nmanaging various collections of automations and artifacts. 
For example,\nCM4MLOps repository includes many automations, also known as CM scripts, to\nstreamline the process of building, running, benchmarking, and optimizing AI,\nML, and other workflows across ever-evolving models, data, and systems.\n I donated CK, CM and CM4MLOps to MLCommons to foster collaboration between\nacademia and industry to learn how to co-design more efficient and\ncost-effective AI systems while capturing and encoding knowledge within\nCollective Mind, protecting intellectual property, enabling portable skills,\nand accelerating the transition of the state-of-the-art research into\nproduction. My ultimate goal is to collaborate with the community to complete\nmy two-decade journey toward creating self-optimizing software and hardware\nthat can automatically learn how to run any workload in the most efficient and\ncost-effective manner based on user requirements and constraints such as cost,\nlatency, throughput, accuracy, power consumption, size, and other critical\nfactors.\n","authors":["Grigori Fursin"],"pdf_url":"https://arxiv.org/pdf/2406.16791v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01401v4","updated":"2024-12-02T00:03:53Z","published":"2024-02-02T13:33:30Z","title":"An Information Theoretic Approach to Machine Unlearning","summary":" To comply with AI and data regulations, the need to forget private or\ncopyrighted information from trained machine learning models is increasingly\nimportant. The key challenge in unlearning is forgetting the necessary data in\na timely manner, while preserving model performance. In this work, we address\nthe zero-shot unlearning scenario, whereby an unlearning algorithm must be able\nto remove data given only a trained model and the data to be forgotten. 
We\nexplore unlearning from an information theoretic perspective, connecting the\ninfluence of a sample to the information gain a model receives by observing it.\nFrom this, we derive a simple but principled zero-shot unlearning method based\non the geometry of the model. Our approach takes the form of minimising the\ngradient of a learned function with respect to a small neighbourhood around a\ntarget forget point. This induces a smoothing effect, causing forgetting by\nmoving the boundary of the classifier. We explore the intuition behind why this\napproach can jointly unlearn forget samples while preserving general model\nperformance through a series of low-dimensional experiments. We perform\nextensive empirical evaluation of our method over a range of contemporary\nbenchmarks, verifying that our method is competitive with state-of-the-art\nperformance under the strict constraints of zero-shot unlearning. Code for the\nproject can be found at\nhttps://github.com/jwf40/Information-Theoretic-Unlearning\n","authors":["Jack Foster","Kyle Fogarty","Stefan Schoepf","Zack Dugue","Cengiz Öztireli","Alexandra Brintrup"],"pdf_url":"https://arxiv.org/pdf/2402.01401v4.pdf","comment":"Updated, new low-dimensional experiments and updated perspective on\n unlearning from an information theoretic view"}],"Multimedia":[{"id":"http://arxiv.org/abs/2303.17550v6","updated":"2024-12-02T10:06:28Z","published":"2023-03-30T17:18:31Z","title":"DAE-Talker: High Fidelity Speech-Driven Talking Face Generation with\n Diffusion Autoencoder","summary":" While recent research has made significant progress in speech-driven talking\nface generation, the quality of the generated video still lags behind that of\nreal recordings. One reason for this is the use of handcrafted intermediate\nrepresentations like facial landmarks and 3DMM coefficients, which are designed\nbased on human knowledge and are insufficient to precisely describe facial\nmovements. 
Additionally, these methods require an external pretrained model for\nextracting these representations, whose performance sets an upper bound on\ntalking face generation. To address these limitations, we propose a novel\nmethod called DAE-Talker that leverages data-driven latent representations\nobtained from a diffusion autoencoder (DAE). DAE contains an image encoder that\nencodes an image into a latent vector and a DDIM image decoder that\nreconstructs the image from it. We train our DAE on talking face video frames\nand then extract their latent representations as the training target for a\nConformer-based speech2latent model. This allows DAE-Talker to synthesize full\nvideo frames and produce natural head movements that align with the content of\nspeech, rather than relying on a predetermined head pose from a template video.\nWe also introduce pose modelling in speech2latent for pose controllability.\nAdditionally, we propose a novel method for generating continuous video frames\nwith the DDIM image decoder trained on individual frames, eliminating the need\nfor modelling the joint distribution of consecutive frames directly. Our\nexperiments show that DAE-Talker outperforms existing popular methods in\nlip-sync, video fidelity, and pose naturalness. 
We also conduct ablation\nstudies to analyze the effectiveness of the proposed techniques and demonstrate\nthe pose controllability of DAE-Talker.\n","authors":["Chenpeng Du","Qi Chen","Tianyu He","Xu Tan","Xie Chen","Kai Yu","Sheng Zhao","Jiang Bian"],"pdf_url":"https://arxiv.org/pdf/2303.17550v6.pdf","comment":"Accepted to ACM Multimedia 2023"},{"id":"http://arxiv.org/abs/2412.01986v1","updated":"2024-12-02T21:35:33Z","published":"2024-12-02T21:35:33Z","title":"HybridMQA: Exploring Geometry-Texture Interactions for Colored Mesh\n Quality Assessment","summary":" Mesh quality assessment (MQA) models play a critical role in the design,\noptimization, and evaluation of mesh operation systems in a wide variety of\napplications. Current MQA models, whether model-based methods using\ntopology-aware features or projection-based approaches working on rendered 2D\nprojections, often fail to capture the intricate interactions between texture\nand 3D geometry. We introduce HybridMQA, a first-of-its-kind hybrid\nfull-reference colored MQA framework that integrates model-based and\nprojection-based approaches, capturing complex interactions between textural\ninformation and 3D structures for enriched quality representations. Our method\nemploys graph learning to extract detailed 3D representations, which are then\nprojected to 2D using a novel feature rendering process that precisely aligns\nthem with colored projections. This enables the exploration of geometry-texture\ninteractions via cross-attention, producing comprehensive mesh quality\nrepresentations. Extensive experiments demonstrate HybridMQA's superior\nperformance across diverse datasets, highlighting its ability to effectively\nleverage geometry-texture interactions for a thorough understanding of mesh\nquality. 
Our implementation will be made publicly available.\n","authors":["Armin Shafiee Sarvestani","Sheyang Tang","Zhou Wang"],"pdf_url":"https://arxiv.org/pdf/2412.01986v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.01824v1","updated":"2024-12-02T18:59:26Z","published":"2024-12-02T18:59:26Z","title":"X-Prompt: Towards Universal In-Context Image Generation in\n Auto-Regressive Vision Language Foundation Models","summary":" In-context generation is a key component of large language models' (LLMs)\nopen-task generalization capability. By leveraging a few examples as context,\nLLMs can perform both in-domain and out-of-domain tasks. Recent advancements in\nauto-regressive vision-language models (VLMs) built upon LLMs have showcased\nimpressive performance in text-to-image generation. However, the potential of\nin-context learning for general image generation tasks remains largely\nunexplored. To address this, we introduce X-Prompt, a purely auto-regressive\nlarge-vision language model designed to deliver competitive performance across\na wide range of both seen and unseen image generation tasks, all within a\nunified in-context learning framework. X-Prompt incorporates a specialized\ndesign that efficiently compresses valuable features from in-context examples,\nsupporting longer in-context token sequences and improving its ability to\ngeneralize to unseen tasks. A unified training task for both text and image\nprediction enables X-Prompt to handle general image generation with enhanced\ntask awareness from in-context examples. 
Extensive experiments validate the\nmodel's performance across diverse seen image generation tasks and its capacity\nto generalize to previously unseen tasks.\n","authors":["Zeyi Sun","Ziyang Chu","Pan Zhang","Tong Wu","Xiaoyi Dong","Yuhang Zang","Yuanjun Xiong","Dahua Lin","Jiaqi Wang"],"pdf_url":"https://arxiv.org/pdf/2412.01824v1.pdf","comment":"code: https://github.com/SunzeY/X-Prompt"},{"id":"http://arxiv.org/abs/2412.01556v1","updated":"2024-12-02T14:44:39Z","published":"2024-12-02T14:44:39Z","title":"Divide-and-Conquer: Confluent Triple-Flow Network for RGB-T Salient\n Object Detection","summary":" RGB-Thermal Salient Object Detection aims to pinpoint prominent objects\nwithin aligned pairs of visible and thermal infrared images. Traditional\nencoder-decoder architectures, while designed for cross-modality feature\ninteractions, may not have adequately considered the robustness against noise\noriginating from defective modalities. Inspired by hierarchical human visual\nsystems, we propose the ConTriNet, a robust Confluent Triple-Flow Network\nemploying a Divide-and-Conquer strategy. Specifically, ConTriNet comprises\nthree flows: two modality-specific flows explore cues from RGB and Thermal\nmodalities, and a third modality-complementary flow integrates cues from both\nmodalities. ConTriNet presents several notable advantages. It incorporates a\nModality-induced Feature Modulator in the modality-shared union encoder to\nminimize inter-modality discrepancies and mitigate the impact of defective\nsamples. Additionally, a foundational Residual Atrous Spatial Pyramid Module in\nthe separated flows enlarges the receptive field, allowing for the capture of\nmulti-scale contextual information. Furthermore, a Modality-aware Dynamic\nAggregation Module in the modality-complementary flow dynamically aggregates\nsaliency-related cues from both modality-specific flows. 
Leveraging the\nproposed parallel triple-flow framework, we further refine saliency maps\nderived from different flows through a flow-cooperative fusion strategy,\nyielding a high-quality, full-resolution saliency map for the final prediction.\nTo evaluate the robustness and stability of our approach, we collect a\ncomprehensive RGB-T SOD benchmark, VT-IMAG, covering various real-world\nchallenging scenarios. Extensive experiments on public benchmarks and our\nVT-IMAG dataset demonstrate that ConTriNet consistently outperforms\nstate-of-the-art competitors in both common and challenging scenarios.\n","authors":["Hao Tang","Zechao Li","Dong Zhang","Shengfeng He","Jinhui Tang"],"pdf_url":"https://arxiv.org/pdf/2412.01556v1.pdf","comment":"Accepted by IEEE TPAMI. Project page:\n https://cser-tang-hao.github.io/contrinet.html"},{"id":"http://arxiv.org/abs/2412.01316v1","updated":"2024-12-02T09:32:36Z","published":"2024-12-02T09:32:36Z","title":"Long Video Diffusion Generation with Segmented Cross-Attention and\n Content-Rich Video Data Curation","summary":" We introduce Presto, a novel video diffusion model designed to generate\n15-second videos with long-range coherence and rich content. Extending video\ngeneration methods to maintain scenario diversity over long durations presents\nsignificant challenges. To address this, we propose a Segmented Cross-Attention\n(SCA) strategy, which splits hidden states into segments along the temporal\ndimension, allowing each segment to cross-attend to a corresponding\nsub-caption. SCA requires no additional parameters, enabling seamless\nincorporation into current DiT-based architectures. To facilitate high-quality\nlong video generation, we build the LongTake-HD dataset, consisting of 261k\ncontent-rich videos with scenario coherence, annotated with an overall video\ncaption and five progressive sub-captions. 
Experiments show that our Presto\nachieves 78.5% on the VBench Semantic Score and 100% on the Dynamic Degree,\noutperforming existing state-of-the-art video generation methods. This\ndemonstrates that our proposed Presto significantly enhances content richness,\nmaintains long-range coherence, and captures intricate textual details. More\ndetails are displayed on our project page: https://presto-video.github.io/.\n","authors":["Xin Yan","Yuxuan Cai","Qiuyue Wang","Yuan Zhou","Wenhao Huang","Huan Yang"],"pdf_url":"https://arxiv.org/pdf/2412.01316v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.01202v1","updated":"2024-12-02T07:14:15Z","published":"2024-12-02T07:14:15Z","title":"Neuron Abandoning Attention Flow: Visual Explanation of Dynamics inside\n CNN Models","summary":" In this paper, we present a Neuron Abandoning Attention Flow (NAFlow) method\nto address the open problem of visually explaining the attention evolution\ndynamics inside CNNs when making their classification decisions. A novel\ncascading neuron abandoning back-propagation algorithm is designed to trace\nneurons in all layers of a CNN that involve in making its prediction to address\nthe problem of significant interference from abandoned neurons. Firstly, a\nNeuron Abandoning Back-Propagation (NA-BP) module is proposed to generate\nBack-Propagated Feature Maps (BPFM) by using the inverse function of the\nintermediate layers of CNN models, on which the neurons not used for\ndecision-making are abandoned. Meanwhile, the cascading NA-BP modules calculate\nthe tensors of importance coefficients which are linearly combined with the\ntensors of BPFMs to form the NAFlow. Secondly, to be able to visualize\nattention flow for similarity metric-based CNN models, a new channel\ncontribution weights module is proposed to calculate the importance\ncoefficients via Jacobian Matrix. 
The effectiveness of the proposed NAFlow is\nvalidated on nine widely-used CNN models for various tasks of general image\nclassification, contrastive learning classification, few-shot image\nclassification, and image retrieval.\n","authors":["Yi Liao","Yongsheng Gao","Weichuan Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.01202v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.01169v1","updated":"2024-12-02T06:13:01Z","published":"2024-12-02T06:13:01Z","title":"OmniFlow: Any-to-Any Generation with Multi-Modal Rectified Flows","summary":" We introduce OmniFlow, a novel generative model designed for any-to-any\ngeneration tasks such as text-to-image, text-to-audio, and audio-to-image\nsynthesis. OmniFlow advances the rectified flow (RF) framework used in\ntext-to-image models to handle the joint distribution of multiple modalities.\nIt outperforms previous any-to-any models on a wide range of tasks, such as\ntext-to-image and text-to-audio synthesis. Our work offers three key\ncontributions: First, we extend RF to a multi-modal setting and introduce a\nnovel guidance mechanism, enabling users to flexibly control the alignment\nbetween different modalities in the generated outputs. Second, we propose a\nnovel architecture that extends the text-to-image MMDiT architecture of Stable\nDiffusion 3 and enables audio and text generation. The extended modules can be\nefficiently pretrained individually and merged with the vanilla text-to-image\nMMDiT for fine-tuning. Lastly, we conduct a comprehensive study on the design\nchoices of rectified flow transformers for large-scale audio and text\ngeneration, providing valuable insights into optimizing performance across\ndiverse modalities. 
The Code will be available at\nhttps://github.com/jacklishufan/OmniFlows.\n","authors":["Shufan Li","Konstantinos Kallidromitis","Akash Gokul","Zichun Liao","Yusuke Kato","Kazuki Kozuka","Aditya Grover"],"pdf_url":"https://arxiv.org/pdf/2412.01169v1.pdf","comment":"12 pages, 14 figures"},{"id":"http://arxiv.org/abs/2412.01064v1","updated":"2024-12-02T02:50:07Z","published":"2024-12-02T02:50:07Z","title":"FLOAT: Generative Motion Latent Flow Matching for Audio-driven Talking\n Portrait","summary":" With the rapid advancement of diffusion-based generative models, portrait\nimage animation has achieved remarkable results. However, it still faces\nchallenges in temporally consistent video generation and fast sampling due to\nits iterative sampling nature. This paper presents FLOAT, an audio-driven\ntalking portrait video generation method based on flow matching generative\nmodel. We shift the generative modeling from the pixel-based latent space to a\nlearned motion latent space, enabling efficient design of temporally consistent\nmotion. To achieve this, we introduce a transformer-based vector field\npredictor with a simple yet effective frame-wise conditioning mechanism.\nAdditionally, our method supports speech-driven emotion enhancement, enabling a\nnatural incorporation of expressive motions. 
Extensive experiments demonstrate\nthat our method outperforms state-of-the-art audio-driven talking portrait\nmethods in terms of visual quality, motion fidelity, and efficiency.\n","authors":["Taekyung Ki","Dongchan Min","Gyoungsu Chae"],"pdf_url":"https://arxiv.org/pdf/2412.01064v1.pdf","comment":"Project page: https://deepbrainai-research.github.io/float/"}],"Genomics":[{"id":"http://arxiv.org/abs/2412.01649v1","updated":"2024-12-02T15:59:16Z","published":"2024-12-02T15:59:16Z","title":"Microbial Mat Metagenomes from Waikite Valley, Aotearoa New Zealand","summary":" The rise of complex multicellular ecosystems Neoproterozoic time was preceded\nby a microbial Proterozoic biosphere, where productivity may have been largely\nrestricted to microbial mats made up of bacteria including oxygenic\nphotosynthetic Cyanobacteria, anoxygenic phototrophs, and heterotrophs. In\nmodern environments, analogous microbial mats can be found in restricted\nenvironments such as carbonate tidal flats and terrestrial hot springs. Here,\nwe report metagenomic sequence data from an analog in the hot springs of\nWaikite Valley, Aotearoa New Zealand, where carbon-rich, slightly-alkaline\ngeothermal waters support diverse phototrophic microbial mats.\n The Waikite Valley hot spring in the Taupo Volcanic Zone of Aotearoa New\nZealand was sampled in duplicate at 8 points along a temperature gradient\ntransect of the outflow, from ~62 C (near the source) to ~37 C (~100 meters\ndownstream). ~686 Gb of shotgun metagenomic sequence data was generated by\nIllumina Novaseq. Each sample was assembled using SPAdes, followed by binning\nof metagenome-assembled genomes (MAGs) by MetaBAT. These data are useful for\nthe genomic analysis of novel phototrophic bacteria, as well as for ecological\ncomparisons between thermophilic communities with varying temperatures but\notherwise similar conditions.\n","authors":["Beatrice Tauer","Elizabeth Trembath-Reichert","L. M. 
Ward"],"pdf_url":"https://arxiv.org/pdf/2412.01649v1.pdf","comment":"55 pages, 1 table, 3 data sets"},{"id":"http://arxiv.org/abs/2412.01561v1","updated":"2024-12-02T14:50:13Z","published":"2024-12-02T14:50:13Z","title":"pasta: Pattern Analysis for Spatial Omics Data","summary":" Spatial omics assays allow for the molecular characterisation of cells in\ntheir spatial context. Notably, the two main technological streams,\nimaging-based and high-throughput sequencing-based, can give rise to very\ndifferent data modalities. The characteristics of the two data types are well\nknown in adjacent fields such as spatial statistics as point patterns and\nlattice data, and there is a wide range of tools available. This paper\ndiscusses the application of spatial statistics to spatially-resolved omics\ndata and in particular, discusses various advantages, challenges, and nuances.\nThis work is accompanied by a vignette, pasta, that showcases the usefulness of\nspatial statistics in biology using several R packages.\n","authors":["Martin Emons","Samuel Gunz","Helena L. Crowell","Izaskun Mallona","Reinhard Furrer","Mark D. Robinson"],"pdf_url":"https://arxiv.org/pdf/2412.01561v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.01124v1","updated":"2024-12-02T05:02:18Z","published":"2024-12-02T05:02:18Z","title":"SUICA: Learning Super-high Dimensional Sparse Implicit Neural\n Representations for Spatial Transcriptomics","summary":" Spatial Transcriptomics (ST) is a method that captures spatial gene\nexpression profiles within histological sections. The discrete spatial\ndistribution and the super-high dimensional sequencing results make ST data\nchallenging to be modeled effectively. In this paper, we manage to model ST in\na continuous and compact manner by the proposed tool, SUICA, empowered by the\ngreat approximation capability of Implicit Neural Representations (INRs) that\ncan improve both the spatial resolution and the gene expression. 
Concretely\nwithin the proposed SUICA, we incorporate a graph-augmented Autoencoder to\neffectively model the context information of the unstructured spots and provide\ninformative embeddings that are structure-aware for spatial mapping. We also\ntackle the extremely skewed distribution in a regression-by-classification\nfashion and enforce classification-based loss functions for the optimization of\nSUICA. By extensive experiments of a wide range of common ST platforms, SUICA\noutperforms both conventional INR variants and SOTA methods for ST\nsuper-resolution regarding numerical fidelity, statistical correlation, and\nbio-conservation. The prediction by SUICA also showcases amplified gene\nsignatures that enriches the bio-conservation of the raw data and benefits\nsubsequent analysis. The code is available at https://github.com/Szym29/SUICA.\n","authors":["Qingtian Zhu","Yumin Zheng","Yuling Sang","Yifan Zhan","Ziyan Zhu","Jun Ding","Yinqiang Zheng"],"pdf_url":"https://arxiv.org/pdf/2412.01124v1.pdf","comment":null}]},"2024-12-01T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2411.17994v2","updated":"2024-12-01T22:35:05Z","published":"2024-11-27T02:22:14Z","title":"Differentiable Inverse Rendering with Interpretable Basis BRDFs","summary":" Inverse rendering seeks to reconstruct both geometry and spatially varying\nBRDFs (SVBRDFs) from captured images. To address the inherent ill-posedness of\ninverse rendering, basis BRDF representations are commonly used, modeling\nSVBRDFs as spatially varying blends of a set of basis BRDFs. However, existing\nmethods often yield basis BRDFs that lack intuitive separation and have limited\nscalability to scenes of varying complexity. In this paper, we introduce a\ndifferentiable inverse rendering method that produces interpretable basis\nBRDFs. Our approach models a scene using 2D Gaussians, where the reflectance of\neach Gaussian is defined by a weighted blend of basis BRDFs. 
We efficiently\nrender an image from the 2D Gaussians and basis BRDFs using differentiable\nrasterization and impose a rendering loss with the input images. During this\nanalysis-by-synthesis optimization process of differentiable inverse rendering,\nwe dynamically adjust the number of basis BRDFs to fit the target scene while\nencouraging sparsity in the basis weights. This ensures that the reflectance of\neach Gaussian is represented by only a few basis BRDFs. This approach enables\nthe reconstruction of accurate geometry and interpretable basis BRDFs that are\nspatially separated. Consequently, the resulting scene representation,\ncomprising basis BRDFs and 2D Gaussians, supports physically-based novel-view\nrelighting and intuitive scene editing.\n","authors":["Hoon-Gyu Chung","Seokjun Choi","Seung-Hwan Baek"],"pdf_url":"https://arxiv.org/pdf/2411.17994v2.pdf","comment":"This is a different paper from my previous paper \"Differentiable\n Point-based Inverse Rendering\". It must not be removed automatically"},{"id":"http://arxiv.org/abs/2311.09614v3","updated":"2024-12-01T22:01:58Z","published":"2023-11-16T06:58:46Z","title":"Comprehensive framework for evaluation of deep neural networks in\n detection and quantification of lymphoma from PET/CT images: clinical\n insights, pitfalls, and observer agreement analyses","summary":" This study addresses critical gaps in automated lymphoma segmentation from\nPET/CT images, focusing on issues often overlooked in existing literature.\nWhile deep learning has been applied for lymphoma lesion segmentation, few\nstudies incorporate out-of-distribution testing, raising concerns about model\ngeneralizability across diverse imaging conditions and patient populations. We\nhighlight the need to compare model performance with expert human annotators,\nincluding intra- and inter-observer variability, to understand task difficulty\nbetter. 
Most approaches focus on overall segmentation accuracy but overlook\nlesion-specific metrics important for precise lesion detection and disease\nquantification.To address these gaps, we propose a clinically-relevant\nframework for evaluating deep neural networks. Using this lesion-specific\nevaluation, we assess the performance of four deep segmentation networks\n(ResUNet, SegResNet, DynUNet, and SwinUNETR) across 611 cases from\nmulti-institutional datasets, covering various lymphoma subtypes and lesion\ncharacteristics. Beyond standard metrics like the Dice similarity coefficient\n(DSC), we evaluate clinical lesion measures and their prediction errors. We\nalso introduce detection criteria for lesion localization and propose a new\ndetection Criterion 3 based on metabolic characteristics. We show that networks\nperform better on large, intense lesions with higher metabolic\nactivity.Finally, we compare network performance to expert human observers via\nintra- and inter-observer variability analyses, demonstrating that network\nerrors closely resemble those made by experts. Some small, faint lesions remain\nchallenging for both humans and networks. This study aims to improve automated\nlesion segmentation's clinical relevance, supporting better treatment decisions\nfor lymphoma patients. The code is available at:\nhttps://github.com/microsoft/lymphoma-segmentation-dnn\n","authors":["Shadab Ahamed","Yixi Xu","Sara Kurkowska","Claire Gowdy","Joo H. O","Ingrid Bloise","Don Wilson","Patrick Martineau","François Bénard","Fereshteh Yousefirizi","Rahul Dodhia","Juan M. Lavista","William B. Weeks","Carlos F. Uribe","Arman Rahmim"],"pdf_url":"https://arxiv.org/pdf/2311.09614v3.pdf","comment":"32 pages, 15 figures, 5 tables"},{"id":"http://arxiv.org/abs/2403.09905v2","updated":"2024-12-01T21:42:37Z","published":"2024-03-14T22:33:22Z","title":"Right Place, Right Time! 
Generalizing ObjectNav to Dynamic Environments\n with Portable Targets","summary":" ObjectNav is a popular task in Embodied AI, where an agent navigates to a\ntarget object in an unseen environment. Prior literature makes the assumption\nof a static environment with stationary objects, which lacks realism. To\naddress this, we present a novel formulation to generalize ObjectNav to dynamic\nenvironments with non-stationary objects, and refer to it as Portable ObjectNav\nor P-ObjectNav. In our formulation, we first address several challenging issues\nwith dynamizing existing topological scene graphs by developing a novel method\nthat introduces multiple transition behaviors to portable objects in the scene.\nWe use this technique to dynamize Matterport3D, a popular simulator for\nevaluating embodied tasks. We then present a benchmark for P-ObjectNav using a\ncombination of heuristic, reinforcement learning, and Large Language Model\n(LLM)-based navigation approaches on the dynamized environment, while\nintroducing novel evaluation metrics tailored for our task. Our work\nfundamentally challenges the \"static-environment\" notion of prior ObjectNav\nwork; the code and dataset for P-ObjectNav will be made publicly available to\nfoster research on embodied navigation in dynamic scenes. We provide an\nanonymized repository for our code and dataset:\nhttps://anonymous.4open.science/r/PObjectNav-1C6D.\n","authors":["Vishnu Sashank Dorbala","Bhrij Patel","Amrit Singh Bedi","Dinesh Manocha"],"pdf_url":"https://arxiv.org/pdf/2403.09905v2.pdf","comment":"19"},{"id":"http://arxiv.org/abs/2302.10883v2","updated":"2024-12-01T20:17:28Z","published":"2023-02-21T18:58:32Z","title":"Combining Blockchain and Biometrics: A Survey on Technical Aspects and a\n First Legal Analysis","summary":" Biometric recognition as a unique, hard-to-forge, and efficient way of\nidentification and verification has become an indispensable part of the current\ndigital world. 
The fast evolution of this technology has been a strong\nincentive for integrating it into many applications. Meanwhile, blockchain, the\nvery attractive decentralized ledger technology, has been widely received both\nby the research and industry in the past years and it is being increasingly\ndeployed nowadays in many different applications, such as money transfer, IoT,\nhealthcare, or logistics. Recently, researchers have started to speculate what\nwould be the pros and cons and what would be the best applications when these\ntwo technologies cross paths. This paper provides a survey of technical\nliterature research on the combination of blockchain and biometrics and\nincludes a first legal analysis of this integration to shed light on challenges\nand potentials. While this combination is still in its infancy and a growing\nbody of literature discusses specific blockchain applications and solutions in\nan advanced technological set-up, this paper presents a holistic understanding\nof blockchains applicability in the biometric sector. This study demonstrates\nthat combining blockchain and biometrics would be beneficial for novel\napplications in biometrics such as the PKI mechanism, distributed trusted\nservice, and identity management. However, blockchain networks at their current\nstage are not efficient and economical for real-time applications. From a legal\npoint of view, the allocation of accountability remains a main issue, while\nother difficulties remain, such as conducting a proper Data Protection Impact\nAssessment. 
Finally, it supplies technical and legal recommendations to reap\nthe benefits and mitigate the risks of the combination.\n","authors":["Mahdi Ghafourian","Bilgesu Sumer","Ruben Vera-Rodriguez","Julian Fierrez","Ruben Tolosana","Aythami Moralez","Els Kindt"],"pdf_url":"https://arxiv.org/pdf/2302.10883v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.04346v2","updated":"2024-12-01T20:10:57Z","published":"2023-11-07T21:06:06Z","title":"SaFL: Sybil-aware Federated Learning with Application to Face\n Recognition","summary":" Federated Learning (FL) is a machine learning paradigm to conduct\ncollaborative learning among clients on a joint model. The primary goal is to\nshare clients' local training parameters with an integrating server while\npreserving their privacy. This method permits to exploit the potential of\nmassive mobile users' data for the benefit of machine learning models'\nperformance while keeping sensitive data on local devices. On the downside, FL\nraises security and privacy concerns that have just started to be studied. To\naddress some of the key threats in FL, researchers have proposed to use secure\naggregation methods (e.g. homomorphic encryption, secure multiparty\ncomputation, etc.). These solutions improve some security and privacy metrics,\nbut at the same time bring about other serious threats such as poisoning\nattacks, backdoor attacks, and free running attacks. 
This paper proposes a new\ndefense method against poisoning attacks in FL called SaFL (Sybil-aware\nFederated Learning) that minimizes the effect of sybils with a novel\ntime-variant aggregation scheme.\n","authors":["Mahdi Ghafourian","Julian Fierrez","Ruben Vera-Rodriguez","Ruben Tolosana","Aythami Morales"],"pdf_url":"https://arxiv.org/pdf/2311.04346v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.16740v2","updated":"2024-12-01T19:13:25Z","published":"2024-11-23T18:14:42Z","title":"Document Haystacks: Vision-Language Reasoning Over Piles of 1000+\n Documents","summary":" Large multimodal models (LMMs) have achieved impressive progress in\nvision-language understanding, yet they face limitations in real-world\napplications requiring complex reasoning over a large number of images.\nExisting benchmarks for multi-image question-answering are limited in scope,\neach question is paired with only up to 30 images, which does not fully capture\nthe demands of large-scale retrieval tasks encountered in the real-world\nusages. To reduce these gaps, we introduce two document haystack benchmarks,\ndubbed DocHaystack and InfoHaystack, designed to evaluate LMM performance on\nlarge-scale visual document retrieval and understanding. Additionally, we\npropose V-RAG, a novel, vision-centric retrieval-augmented generation (RAG)\nframework that leverages a suite of multimodal vision encoders, each optimized\nfor specific strengths, and a dedicated question-document relevance module.\nV-RAG sets a new standard, with a 9% and 11% improvement in Recall@1 on the\nchallenging DocHaystack-1000 and InfoHaystack-1000 benchmarks, respectively,\ncompared to the previous best baseline models. Additionally, integrating V-RAG\nwith LMMs enables them to efficiently operate across thousands of images,\nyielding significant improvements on our DocHaystack and InfoHaystack\nbenchmarks. 
Our code and datasets are available at\nhttps://github.com/Vision-CAIR/dochaystacks\n","authors":["Jun Chen","Dannong Xu","Junjie Fei","Chun-Mei Feng","Mohamed Elhoseiny"],"pdf_url":"https://arxiv.org/pdf/2411.16740v2.pdf","comment":"the correct arxiv version"},{"id":"http://arxiv.org/abs/2404.07410v2","updated":"2024-12-01T18:48:37Z","published":"2024-04-11T00:49:38Z","title":"Improving Shift Invariance in Convolutional Neural Networks with\n Translation Invariant Polyphase Sampling","summary":" Downsampling operators break the shift invariance of convolutional neural\nnetworks (CNNs) and this affects the robustness of features learned by CNNs\nwhen dealing with even small pixel-level shift. Through a large-scale\ncorrelation analysis framework, we study shift invariance of CNNs by inspecting\nexisting downsampling operators in terms of their maximum-sampling bias (MSB),\nand find that MSB is negatively correlated with shift invariance. Based on this\ncrucial insight, we propose a learnable pooling operator called Translation\nInvariant Polyphase Sampling (TIPS) and two regularizations on the intermediate\nfeature maps of TIPS to reduce MSB and learn translation-invariant\nrepresentations. TIPS can be integrated into any CNN and can be trained\nend-to-end with marginal computational overhead. Our experiments demonstrate\nthat TIPS results in consistent performance gains in terms of accuracy, shift\nconsistency, and shift fidelity on multiple benchmarks for image classification\nand semantic segmentation compared to previous methods and also leads to\nimprovements in adversarial and distributional robustness. 
TIPS results in the\nlowest MSB compared to all previous methods, thus explaining our strong\nempirical results.\n","authors":["Sourajit Saha","Tejas Gokhale"],"pdf_url":"https://arxiv.org/pdf/2404.07410v2.pdf","comment":"Accepted to WACV 2025"},{"id":"http://arxiv.org/abs/2409.09566v2","updated":"2024-12-01T17:48:24Z","published":"2024-09-15T00:53:44Z","title":"Learning Transferable Features for Implicit Neural Representations","summary":" Implicit neural representations (INRs) have demonstrated success in a variety\nof applications, including inverse problems and neural rendering. An INR is\ntypically trained to capture one signal of interest, resulting in learned\nneural features that are highly attuned to that signal. Assumed to be less\ngeneralizable, we explore the aspect of transferability of such learned neural\nfeatures for fitting similar signals. We introduce a new INR training\nframework, STRAINER that learns transferrable features for fitting INRs to new\nsignals from a given distribution, faster and with better reconstruction\nquality. Owing to the sequential layer-wise affine operations in an INR, we\npropose to learn transferable representations by sharing initial encoder layers\nacross multiple INRs with independent decoder layers. At test time, the learned\nencoder representations are transferred as initialization for an otherwise\nrandomly initialized INR. We find STRAINER to yield extremely powerful\ninitialization for fitting images from the same domain and allow for $\\approx\n+10dB$ gain in signal quality early on compared to an untrained INR itself.\nSTRAINER also provides a simple way to encode data-driven priors in INRs. We\nevaluate STRAINER on multiple in-domain and out-of-domain signal fitting tasks\nand inverse problems and further provide detailed analysis and discussion on\nthe transferability of STRAINER's features. 
Our demo can be accessed at\nhttps://colab.research.google.com/drive/1fBZAwqE8C_lrRPAe-hQZJTWrMJuAKtG2?usp=sharing .\n","authors":["Kushal Vyas","Ahmed Imtiaz Humayun","Aniket Dashpute","Richard G. Baraniuk","Ashok Veeraraghavan","Guha Balakrishnan"],"pdf_url":"https://arxiv.org/pdf/2409.09566v2.pdf","comment":"Project Website: https://kushalvyas.github.io/strainer.html"},{"id":"http://arxiv.org/abs/2209.05227v5","updated":"2024-12-01T16:50:02Z","published":"2022-09-12T13:26:26Z","title":"DUET: A Tuning-Free Device-Cloud Collaborative Parameters Generation\n Framework for Efficient Device Model Generalization","summary":" Device Model Generalization (DMG) is a practical yet under-investigated\nresearch topic for on-device machine learning applications. It aims to improve\nthe generalization ability of pre-trained models when deployed on\nresource-constrained devices, such as improving the performance of pre-trained\ncloud models on smart mobiles. While quite a lot of works have investigated the\ndata distribution shift across clouds and devices, most of them focus on model\nfine-tuning on personalized data for individual devices to facilitate DMG.\nDespite their promising, these approaches require on-device re-training, which\nis practically infeasible due to the overfitting problem and high time delay\nwhen performing gradient calculation on real-time data. In this paper, we argue\nthat the computational cost brought by fine-tuning can be rather unnecessary.\nWe consequently present a novel perspective to improving DMG without increasing\ncomputational cost, i.e., device-specific parameter generation which directly\nmaps data distribution to parameters. Specifically, we propose an efficient\nDevice-cloUd collaborative parametErs generaTion framework DUET. DUET is\ndeployed on a powerful cloud server that only requires the low cost of\nforwarding propagation and low time delay of data transmission between the\ndevice and the cloud. 
By doing so, DUET can rehearse the device-specific model\nweight realizations conditioned on the personalized real-time data for an\nindividual device. Importantly, our DUET elegantly connects the cloud and\ndevice as a 'duet' collaboration, frees the DMG from fine-tuning, and enables a\nfaster and more accurate DMG paradigm. We conduct an extensive experimental\nstudy of DUET on three public datasets, and the experimental results confirm\nour framework's effectiveness and generalisability for different DMG tasks.\n","authors":["Zheqi Lv","Wenqiao Zhang","Shengyu Zhang","Kun Kuang","Feng Wang","Yongwei Wang","Zhengyu Chen","Tao Shen","Hongxia Yang","Beng Chin Ooi","Fei Wu"],"pdf_url":"https://arxiv.org/pdf/2209.05227v5.pdf","comment":"Published on WWW'23: Proceedings of the ACM on Web Conference 2023\n (pp. 3077 - 3085)"},{"id":"http://arxiv.org/abs/2411.18499v2","updated":"2024-12-01T16:07:41Z","published":"2024-11-27T16:39:04Z","title":"GATE OpenING: A Comprehensive Benchmark for Judging Open-ended\n Interleaved Image-Text Generation","summary":" Multimodal Large Language Models (MLLMs) have made significant strides in\nvisual understanding and generation tasks. However, generating interleaved\nimage-text content remains a challenge, which requires integrated multimodal\nunderstanding and generation abilities. While the progress in unified models\noffers new solutions, existing benchmarks are insufficient for evaluating these\nmethods due to data size and diversity limitations. To bridge this gap, we\nintroduce GATE OpenING (OpenING), a comprehensive benchmark comprising 5,400\nhigh-quality human-annotated instances across 56 real-world tasks. OpenING\ncovers diverse daily scenarios such as travel guide, design, and brainstorming,\noffering a robust platform for challenging interleaved generation methods. In\naddition, we present IntJudge, a judge model for evaluating open-ended\nmultimodal generation methods. 
Trained with a novel data pipeline, our IntJudge\nachieves an agreement rate of 82. 42% with human judgments, outperforming\nGPT-based evaluators by 11.34%. Extensive experiments on OpenING reveal that\ncurrent interleaved generation methods still have substantial room for\nimprovement. Key findings on interleaved image-text generation are further\npresented to guide the development of next-generation models. The OpenING is\nopen-sourced at https://opening-benchmark.github.io.\n","authors":["Pengfei Zhou","Xiaopeng Peng","Jiajun Song","Chuanhao Li","Zhaopan Xu","Yue Yang","Ziyao Guo","Hao Zhang","Yuqi Lin","Yefei He","Lirui Zhao","Shuo Liu","Tianhua Li","Yuxuan Xie","Xiaojun Chang","Yu Qiao","Wenqi Shao","Kaipeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.18499v2.pdf","comment":"53 pages, 19 figures"},{"id":"http://arxiv.org/abs/2407.04041v2","updated":"2024-12-01T15:58:23Z","published":"2024-07-04T16:29:05Z","title":"Towards Cross-View-Consistent Self-Supervised Surround Depth Estimation","summary":" Depth estimation is a cornerstone for autonomous driving, yet acquiring\nper-pixel depth ground truth for supervised learning is challenging.\nSelf-Supervised Surround Depth Estimation (SSSDE) from consecutive images\noffers an economical alternative. While previous SSSDE methods have proposed\ndifferent mechanisms to fuse information across images, few of them explicitly\nconsider the cross-view constraints, leading to inferior performance,\nparticularly in overlapping regions. This paper proposes an efficient and\nconsistent pose estimation design and two loss functions to enhance cross-view\nconsistency for SSSDE. For pose estimation, we propose to use only front-view\nimages to reduce training memory and sustain pose estimation consistency. The\nfirst loss function is the dense depth consistency loss, which penalizes the\ndifference between predicted depths in overlapping regions. 
The second one is\nthe multi-view reconstruction consistency loss, which aims to maintain\nconsistency between reconstruction from spatial and spatial-temporal contexts.\nAdditionally, we introduce a novel flipping augmentation to improve the\nperformance further. Our techniques enable a simple neural model to achieve\nstate-of-the-art performance on the DDAD and nuScenes datasets. Last but not\nleast, our proposed techniques can be easily applied to other methods. The code\nwill be made public.\n","authors":["Laiyan Ding","Hualie Jiang","Jie Li","Yongquan Chen","Rui Huang"],"pdf_url":"https://arxiv.org/pdf/2407.04041v2.pdf","comment":"Accepted by IROS2024"},{"id":"http://arxiv.org/abs/2403.03077v4","updated":"2024-12-01T14:57:40Z","published":"2024-03-05T16:01:55Z","title":"MiKASA: Multi-Key-Anchor & Scene-Aware Transformer for 3D Visual\n Grounding","summary":" 3D visual grounding involves matching natural language descriptions with\ntheir corresponding objects in 3D spaces. Existing methods often face\nchallenges with accuracy in object recognition and struggle in interpreting\ncomplex linguistic queries, particularly with descriptions that involve\nmultiple anchors or are view-dependent. In response, we present the MiKASA\n(Multi-Key-Anchor Scene-Aware) Transformer. Our novel end-to-end trained model\nintegrates a self-attention-based scene-aware object encoder and an original\nmulti-key-anchor technique, enhancing object recognition accuracy and the\nunderstanding of spatial relationships. Furthermore, MiKASA improves the\nexplainability of decision-making, facilitating error diagnosis. 
Our model\nachieves the highest overall accuracy in the Referit3D challenge for both the\nSr3D and Nr3D datasets, particularly excelling by a large margin in categories\nthat require viewpoint-dependent descriptions.\n","authors":["Chun-Peng Chang","Shaoxiang Wang","Alain Pagani","Didier Stricker"],"pdf_url":"https://arxiv.org/pdf/2403.03077v4.pdf","comment":"CVPR 2024"},{"id":"http://arxiv.org/abs/2408.08855v2","updated":"2024-12-01T14:53:07Z","published":"2024-08-16T17:30:27Z","title":"DPA: Dual Prototypes Alignment for Unsupervised Adaptation of\n Vision-Language Models","summary":" Vision-language models (VLMs), e.g., CLIP, have shown remarkable potential in\nzero-shot image classification. However, adapting these models to new domains\nremains challenging, especially in unsupervised settings where labeled data is\nunavailable. Recent research has proposed pseudo-labeling approaches to adapt\nCLIP in an unsupervised manner using unlabeled target data. Nonetheless, these\nmethods struggle due to noisy pseudo-labels resulting from the misalignment\nbetween CLIP's visual and textual representations. This study introduces DPA,\nan unsupervised domain adaptation method for VLMs. DPA introduces the concept\nof dual prototypes, acting as distinct classifiers, along with the convex\ncombination of their outputs, thereby leading to accurate pseudo-label\nconstruction. Next, it ranks pseudo-labels to facilitate robust self-training,\nparticularly during early training. Finally, it addresses visual-textual\nmisalignment by aligning textual prototypes with image prototypes to further\nimprove the adaptation performance. 
Experiments on 13 downstream vision tasks\ndemonstrate that DPA significantly outperforms zero-shot CLIP and the\nstate-of-the-art unsupervised adaptation baselines.\n","authors":["Eman Ali","Sathira Silva","Muhammad Haris Khan"],"pdf_url":"https://arxiv.org/pdf/2408.08855v2.pdf","comment":"Accepted at WACV 2025"},{"id":"http://arxiv.org/abs/2411.17017v2","updated":"2024-12-01T14:37:22Z","published":"2024-11-26T01:00:09Z","title":"TED-VITON: Transformer-Empowered Diffusion Models for Virtual Try-On","summary":" Recent advancements in Virtual Try-On (VTO) have demonstrated exceptional\nefficacy in generating realistic images and preserving garment details, largely\nattributed to the robust generative capabilities of text-to-image (T2I)\ndiffusion backbones. However, the T2I models that underpin these methods have\nbecome outdated, thereby limiting the potential for further improvement in VTO.\nAdditionally, current methods face notable challenges in accurately rendering\ntext on garments without distortion and preserving fine-grained details, such\nas textures and material fidelity. The emergence of Diffusion Transformer (DiT)\nbased T2I models has showcased impressive performance and offers a promising\nopportunity for advancing VTO. Directly applying existing VTO techniques to\ntransformer-based T2I models is ineffective due to substantial architectural\ndifferences, which hinder their ability to fully leverage the models' advanced\ncapabilities for improved text generation. To address these challenges and\nunlock the full potential of DiT-based T2I models for VTO, we propose\nTED-VITON, a novel framework that integrates a Garment Semantic (GS) Adapter\nfor enhancing garment-specific features, a Text Preservation Loss to ensure\naccurate and distortion-free text rendering, and a constraint mechanism to\ngenerate prompts by optimizing Large Language Model (LLM). 
These innovations\nenable state-of-the-art (SOTA) performance in visual quality and text fidelity,\nestablishing a new benchmark for VTO task. Project page:\n\\url{https://zhenchenwan.github.io/TED-VITON/}\n","authors":["Zhenchen Wan","Yanwu Xu","Zhaoqing Wang","Feng Liu","Tongliang Liu","Mingming Gong"],"pdf_url":"https://arxiv.org/pdf/2411.17017v2.pdf","comment":"Project page: \\href{https://github.com/ZhenchenWan/TED-VITON}{this\n URL}"},{"id":"http://arxiv.org/abs/2408.02555v3","updated":"2024-12-01T14:34:01Z","published":"2024-08-05T15:33:45Z","title":"MeshAnything V2: Artist-Created Mesh Generation With Adjacent Mesh\n Tokenization","summary":" Meshes are the de facto 3D representation in the industry but are\nlabor-intensive to produce. Recently, a line of research has focused on\nautoregressively generating meshes. This approach processes meshes into a\nsequence composed of vertices and then generates them vertex by vertex, similar\nto how a language model generates text. These methods have achieved some\nsuccess but still struggle to generate complex meshes. One primary reason for\nthis limitation is their inefficient tokenization methods. To address this\nissue, we introduce MeshAnything V2, an advanced mesh generation model designed\nto create Artist-Created Meshes that align precisely with specified shapes. A\nkey innovation behind MeshAnything V2 is our novel Adjacent Mesh Tokenization\n(AMT) method. Unlike traditional approaches that represent each face using\nthree vertices, AMT optimizes this by employing a single vertex wherever\nfeasible, effectively reducing the token sequence length by about half on\naverage. This not only streamlines the tokenization process but also results in\nmore compact and well-structured sequences, enhancing the efficiency of mesh\ngeneration. With these improvements, MeshAnything V2 effectively doubles the\nface limit compared to previous models, delivering superior performance without\nincreasing computational costs. 
We will make our code and models publicly\navailable. Project Page: https://buaacyw.github.io/meshanything-v2/\n","authors":["Yiwen Chen","Yikai Wang","Yihao Luo","Zhengyi Wang","Zilong Chen","Jun Zhu","Chi Zhang","Guosheng Lin"],"pdf_url":"https://arxiv.org/pdf/2408.02555v3.pdf","comment":"Project Page: https://buaacyw.github.io/meshanything-v2/ Github:\n https://github.com/buaacyw/MeshAnythingV2"},{"id":"http://arxiv.org/abs/2310.13026v2","updated":"2024-12-01T14:27:28Z","published":"2023-10-19T07:16:54Z","title":"Weakly-Supervised Semantic Segmentation with Image-Level Labels: from\n Traditional Models to Foundation Models","summary":" The rapid development of deep learning has driven significant progress in\nimage semantic segmentation - a fundamental task in computer vision. Semantic\nsegmentation algorithms often depend on the availability of pixel-level labels\n(i.e., masks of objects), which are expensive, time-consuming, and\nlabor-intensive. Weakly-supervised semantic segmentation (WSSS) is an effective\nsolution to avoid such labeling. It utilizes only partial or incomplete\nannotations and provides a cost-effective alternative to fully-supervised\nsemantic segmentation. In this journal, our focus is on the WSSS with\nimage-level labels, which is the most challenging form of WSSS. Our work has\ntwo parts. First, we conduct a comprehensive survey on traditional methods,\nprimarily focusing on those presented at premier research conferences. We\ncategorize them into four groups based on where their methods operate:\npixel-wise, image-wise, cross-image, and external data. Second, we investigate\nthe applicability of visual foundation models, such as the Segment Anything\nModel (SAM), in the context of WSSS. We scrutinize SAM in two intriguing\nscenarios: text prompting and zero-shot learning. 
We provide insights into the\npotential and challenges of deploying visual foundational models for WSSS,\nfacilitating future developments in this exciting research area.\n","authors":["Zhaozheng Chen","Qianru Sun"],"pdf_url":"https://arxiv.org/pdf/2310.13026v2.pdf","comment":"Accepted to ACM Computing Surveys"},{"id":"http://arxiv.org/abs/2303.13495v2","updated":"2024-12-01T14:04:22Z","published":"2023-03-23T17:56:10Z","title":"ReVersion: Diffusion-Based Relation Inversion from Images","summary":" Diffusion models gain increasing popularity for their generative\ncapabilities. Recently, there have been surging needs to generate customized\nimages by inverting diffusion models from exemplar images, and existing\ninversion methods mainly focus on capturing object appearances (i.e., the\n\"look\"). However, how to invert object relations, another important pillar in\nthe visual world, remains unexplored. In this work, we propose the Relation\nInversion task, which aims to learn a specific relation (represented as\n\"relation prompt\") from exemplar images. Specifically, we learn a relation\nprompt with a frozen pre-trained text-to-image diffusion model. The learned\nrelation prompt can then be applied to generate relation-specific images with\nnew objects, backgrounds, and styles.\n To tackle the Relation Inversion task, we propose the ReVersion Framework.\nSpecifically, we propose a novel \"relation-steering contrastive learning\"\nscheme to steer the relation prompt towards relation-dense regions, and\ndisentangle it away from object appearances. We further devise \"relation-focal\nimportance sampling\" to emphasize high-level interactions over low-level\nappearances (e.g., texture, color). To comprehensively evaluate this new task,\nwe contribute the ReVersion Benchmark, which provides various exemplar images\nwith diverse relations. Extensive experiments validate the superiority of our\napproach over existing methods across a wide range of visual relations. 
Our\nproposed task and method could be good inspirations for future research in\nvarious domains like generative inversion, few-shot learning, and visual\nrelation detection.\n","authors":["Ziqi Huang","Tianxing Wu","Yuming Jiang","Kelvin C. K. Chan","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2303.13495v2.pdf","comment":"SIGGRAPH Asia (Conference Track) 2024, Project page:\n https://ziqihuangg.github.io/projects/reversion.html Code:\n https://github.com/ziqihuangg/ReVersion"},{"id":"http://arxiv.org/abs/2403.06090v4","updated":"2024-12-01T12:13:57Z","published":"2024-03-10T04:23:24Z","title":"What Matters When Repurposing Diffusion Models for General Dense\n Perception Tasks?","summary":" Extensive pre-training with large data is indispensable for downstream\ngeometry and semantic visual perception tasks. Thanks to large-scale\ntext-to-image (T2I) pretraining, recent works show promising results by simply\nfine-tuning T2I diffusion models for dense perception tasks. However, several\ncrucial design decisions in this process still lack comprehensive\njustification, encompassing the necessity of the multi-step stochastic\ndiffusion mechanism, training strategy, inference ensemble strategy, and\nfine-tuning data quality. In this work, we conduct a thorough investigation\ninto critical factors that affect transfer efficiency and performance when\nusing diffusion priors. Our key findings are: 1) High-quality fine-tuning data\nis paramount for both semantic and geometry perception tasks. 2) The stochastic\nnature of diffusion models has a slightly negative impact on deterministic\nvisual perception tasks. 3) Apart from fine-tuning the diffusion model with\nonly latent space supervision, task-specific image-level supervision is\nbeneficial to enhance fine-grained details. These observations culminate in the\ndevelopment of GenPercept, an effective deterministic one-step fine-tuning\nparadigm tailed for dense visual perception tasks. 
Different from the previous\nmulti-step methods, our paradigm has a much faster inference speed, and can be\nseamlessly integrated with customized perception decoders and loss functions\nfor image-level supervision, which is critical to improving the fine-grained\ndetails of predictions. Comprehensive experiments on diverse dense visual\nperceptual tasks, including monocular depth estimation, surface normal\nestimation, image segmentation, and matting, are performed to demonstrate the\nremarkable adaptability and effectiveness of our proposed method.\n","authors":["Guangkai Xu","Yongtao Ge","Mingyu Liu","Chengxiang Fan","Kangyang Xie","Zhiyue Zhao","Hao Chen","Chunhua Shen"],"pdf_url":"https://arxiv.org/pdf/2403.06090v4.pdf","comment":"Code is at: https://github.com/aim-uofa/GenPercept"},{"id":"http://arxiv.org/abs/2305.03614v5","updated":"2024-12-01T12:06:46Z","published":"2023-05-05T15:20:27Z","title":"Denoising-Contrastive Alignment for Continuous Sign Language Recognition","summary":" Continuous sign language recognition (CSLR) aims to recognize signs in\nuntrimmed sign language videos to textual glosses. A key challenge of CSLR is\nachieving effective cross-modality alignment between video and gloss sequences\nto enhance video representation. However, current cross-modality alignment\nparadigms often neglect the role of textual grammar to guide the video\nrepresentation in learning global temporal context, which adversely affects\nrecognition performance. To tackle this limitation, we propose a\nDenoising-Contrastive Alignment (DCA) paradigm. DCA creatively leverages\ntextual grammar to enhance video representations through two complementary\napproaches: modeling the instance correspondence between signs and glosses from\na discrimination perspective and aligning their global context from a\ngenerative perspective. Specifically, DCA accomplishes flexible instance-level\ncorrespondence between signs and glosses using a contrastive loss. 
Building on\nthis, DCA models global context alignment between the video and gloss sequences\nby denoising the gloss representation from noise, guided by video\nrepresentation. Additionally, DCA introduces gradient modulation to optimize\nthe alignment and recognition gradients, ensuring a more effective learning\nprocess. By integrating gloss-wise and global context knowledge, DCA\nsignificantly enhances video representations for CSLR tasks. Experimental\nresults across public benchmarks validate the effectiveness of DCA and confirm\nits video representation enhancement feasibility.\n","authors":["Leming Guo","Wanli Xue","Shengyong Chen"],"pdf_url":"https://arxiv.org/pdf/2305.03614v5.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2410.18879v2","updated":"2024-12-01T11:58:37Z","published":"2024-10-24T16:13:06Z","title":"Multi-Class Abnormality Classification in Video Capsule Endoscopy Using\n Deep Learning","summary":" This report outlines Team Seq2Cure's deep learning approach for the Capsule\nVision 2024 Challenge, leveraging an ensemble of convolutional neural networks\n(CNNs) and transformer-based architectures for multi-class abnormality\nclassification in video capsule endoscopy frames. The dataset comprised over\n50,000 frames from three public sources and one private dataset, labeled across\n10 abnormality classes. To overcome the limitations of traditional CNNs in\ncapturing global context, we integrated CNN and transformer models within a\nmulti-model ensemble. Our approach achieved a balanced accuracy of 86.34\npercent and a mean AUC-ROC score of 0.9908 on the validation set, earning our\nsubmission 5th place in the challenge. 
Code is available at\nhttp://github.com/arnavs04/capsule-vision-2024 .\n","authors":["Arnav Samal","Ranya Batsyas"],"pdf_url":"https://arxiv.org/pdf/2410.18879v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07671v2","updated":"2024-12-01T11:49:00Z","published":"2024-04-11T12:06:50Z","title":"Deep learning-driven pulmonary artery and vein segmentation reveals\n demography-associated vasculature anatomical differences","summary":" Pulmonary artery-vein segmentation is crucial for disease diagnosis and\nsurgical planning and is traditionally achieved by Computed Tomography\nPulmonary Angiography (CTPA). However, concerns regarding adverse health\neffects from contrast agents used in CTPA have constrained its clinical\nutility. In contrast, identifying arteries and veins using non-contrast CT, a\nconventional and low-cost clinical examination routine, has long been\nconsidered impossible. Here we propose a High-abundant Pulmonary Artery-vein\nSegmentation (HiPaS) framework achieving accurate artery-vein segmentation on\nboth non-contrast CT and CTPA across various spatial resolutions. HiPaS first\nperforms spatial normalization on raw CT volumes via a super-resolution module,\nand then iteratively achieves segmentation results at different branch levels\nby utilizing the lower-level vessel segmentation as a prior for higher-level\nvessel segmentation. We trained and validated HiPaS on our established\nmulti-centric dataset comprising 1,073 CT volumes with meticulous manual\nannotations. Both quantitative experiments and clinical evaluation demonstrated\nthe superior performance of HiPaS, achieving an average dice score of 91.8% and\na sensitivity of 98.0%. Further experiments showed the non-inferiority of HiPaS\nsegmentation on non-contrast CT compared to segmentation on CTPA. 
Employing\nHiPaS, we have conducted an anatomical study of pulmonary vasculature on 11,784\nparticipants in China (six sites), discovering a new association of pulmonary\nvessel anatomy with sex, age, and disease states: vessel abundance suggests a\nsignificantly higher association with females than males with slightly\ndecreasing with age, and is also influenced by certain diseases, under the\ncontrolling of lung volumes.\n","authors":["Yuetan Chu","Gongning Luo","Longxi Zhou","Shaodong Cao","Guolin Ma","Xianglin Meng","Juexiao Zhou","Changchun Yang","Dexuan Xie","Dan Mu","Ricardo Henao","Gianluca Setti","Xigang Xiao","Lianming Wu","Zhaowen Qiu","Xin Gao"],"pdf_url":"https://arxiv.org/pdf/2404.07671v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.10462v2","updated":"2024-12-01T11:39:46Z","published":"2024-06-15T01:27:58Z","title":"CoMM: A Coherent Interleaved Image-Text Dataset for Multimodal\n Understanding and Generation","summary":" Interleaved image-text generation has emerged as a crucial multimodal task,\naiming at creating sequences of interleaved visual and textual content given a\nquery. Despite notable advancements in recent multimodal large language models\n(MLLMs), generating integrated image-text sequences that exhibit narrative\ncoherence and entity and style consistency remains challenging due to poor\ntraining data quality. To address this gap, we introduce CoMM, a high-quality\nCoherent interleaved image-text MultiModal dataset designed to enhance the\ncoherence, consistency, and alignment of generated multimodal content.\nInitially, CoMM harnesses raw data from diverse sources, focusing on\ninstructional content and visual storytelling, establishing a foundation for\ncoherent and consistent content. To further refine the data quality, we devise\na multi-perspective filter strategy that leverages advanced pre-trained models\nto ensure the development of sentences, consistency of inserted images, and\nsemantic alignment between them. 
Various quality evaluation metrics are\ndesigned to prove the high quality of the filtered dataset. Meanwhile,\nextensive few-shot experiments on various downstream tasks demonstrate CoMM's\neffectiveness in significantly enhancing the in-context learning capabilities\nof MLLMs. Moreover, we propose four new tasks to evaluate MLLMs' interleaved\ngeneration abilities, supported by a comprehensive evaluation framework. We\nbelieve CoMM opens a new avenue for advanced MLLMs with superior multimodal\nin-context learning and understanding ability.\n","authors":["Wei Chen","Lin Li","Yongqi Yang","Bin Wen","Fan Yang","Tingting Gao","Yu Wu","Long Chen"],"pdf_url":"https://arxiv.org/pdf/2406.10462v2.pdf","comment":"22 pages"},{"id":"http://arxiv.org/abs/2410.22059v2","updated":"2024-12-01T11:38:07Z","published":"2024-10-29T14:10:28Z","title":"PACA: Perspective-Aware Cross-Attention Representation for Zero-Shot\n Scene Rearrangement","summary":" Scene rearrangement, like table tidying, is a challenging task in robotic\nmanipulation due to the complexity of predicting diverse object arrangements.\nWeb-scale trained generative models such as Stable Diffusion can aid by\ngenerating natural scenes as goals. To facilitate robot execution, object-level\nrepresentations must be extracted to match the real scenes with the generated\ngoals and to calculate object pose transformations. Current methods typically\nuse a multi-step design that involves separate models for generation,\nsegmentation, and feature encoding, which can lead to a low success rate due to\nerror accumulation. Furthermore, they lack control over the viewing\nperspectives of the generated goals, restricting the tasks to 3-DoF settings.\nIn this paper, we propose PACA, a zero-shot pipeline for scene rearrangement\nthat leverages perspective-aware cross-attention representation derived from\nStable Diffusion. 
Specifically, we develop a representation that integrates\ngeneration, segmentation, and feature encoding into a single step to produce\nobject-level representations. Additionally, we introduce perspective control,\nthus enabling the matching of 6-DoF camera views and extending past approaches\nthat were limited to 3-DoF top-down views. The efficacy of our method is\ndemonstrated through its zero-shot performance in real robot experiments across\nvarious scenes, achieving an average matching accuracy and execution success\nrate of 87% and 67%, respectively.\n","authors":["Shutong Jin","Ruiyu Wang","Kuangyi Chen","Florian T. Pokorny"],"pdf_url":"https://arxiv.org/pdf/2410.22059v2.pdf","comment":"Accepted by WACV2025"},{"id":"http://arxiv.org/abs/2408.15829v3","updated":"2024-12-01T10:30:10Z","published":"2024-08-28T14:44:42Z","title":"SITransformer: Shared Information-Guided Transformer for Extreme\n Multimodal Summarization","summary":" Extreme Multimodal Summarization with Multimodal Output (XMSMO) becomes an\nattractive summarization approach by integrating various types of information\nto create extremely concise yet informative summaries for individual\nmodalities. Existing methods overlook the issue that multimodal data often\ncontains more topic irrelevant information, which can mislead the model into\nproducing inaccurate summaries especially for extremely short ones. In this\npaper, we propose SITransformer, a Shared Information-guided Transformer for\nextreme multimodal summarization. It has a shared information guided pipeline\nwhich involves a cross-modal shared information extractor and a cross-modal\ninteraction module. The extractor formulates semantically shared salient\ninformation from different modalities by devising a novel filtering process\nconsisting of a differentiable top-k selector and a shared-information guided\ngating unit. As a result, the common, salient, and relevant contents across\nmodalities are identified. 
Next, a transformer with cross-modal attentions is\ndeveloped for intra- and inter-modality learning with the shared information\nguidance to produce the extreme summary. Comprehensive experiments demonstrate\nthat SITransformer significantly enhances the summarization quality for both\nvideo and text summaries for XMSMO. Our code will be publicly available at\nhttps://github.com/SichengLeoLiu/MMAsia24-XMSMO.\n","authors":["Sicheng Liu","Lintao Wang","Xiaogang Zhu","Xuequan Lu","Zhiyong Wang","Kun Hu"],"pdf_url":"https://arxiv.org/pdf/2408.15829v3.pdf","comment":"8 pages, 5 figures, submitted to ACM Multimedia Asia 2024"},{"id":"http://arxiv.org/abs/2411.18207v2","updated":"2024-12-01T10:23:18Z","published":"2024-11-27T10:33:51Z","title":"From Open Vocabulary to Open World: Teaching Vision Language Models to\n Detect Novel Objects","summary":" Traditional object detection methods operate under the closed-set assumption,\nwhere models can only detect a fixed number of objects predefined in the\ntraining set. Recent works on open vocabulary object detection (OVD) enable the\ndetection of objects defined by an unbounded vocabulary, which reduces the cost\nof training models for specific tasks. However, OVD heavily relies on accurate\nprompts provided by an ''oracle'', which limits their use in critical\napplications such as driving scene perception. OVD models tend to misclassify\nnear-out-of-distribution (NOOD) objects that have similar semantics to known\nclasses, and ignore far-out-of-distribution (FOOD) objects. To address theses\nlimitations, we propose a framework that enables OVD models to operate in open\nworld settings, by identifying and incrementally learning novel objects. To\ndetect FOOD objects, we propose Open World Embedding Learning (OWEL) and\nintroduce the concept of Pseudo Unknown Embedding which infers the location of\nunknown classes in a continuous semantic space based on the information of\nknown classes. 
We also propose Multi-Scale Contrastive Anchor Learning (MSCAL),\nwhich enables the identification of misclassified unknown objects by promoting\nthe intra-class consistency of object embeddings at different scales. The\nproposed method achieves state-of-the-art performance in common open world\nobject detection and autonomous driving benchmarks.\n","authors":["Zizhao Li","Zhengkang Xiang","Joseph West","Kourosh Khoshelham"],"pdf_url":"https://arxiv.org/pdf/2411.18207v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.04788v5","updated":"2024-12-01T08:24:11Z","published":"2024-05-08T03:43:58Z","title":"SemiCD-VL: Visual-Language Model Guidance Makes Better Semi-supervised\n Change Detector","summary":" Change Detection (CD) aims to identify pixels with semantic changes between\nimages. However, annotating massive numbers of pixel-level images is\nlabor-intensive and costly, especially for multi-temporal images, which require\npixel-wise comparisons by human experts. Considering the excellent performance\nof visual language models (VLMs) for zero-shot, open-vocabulary, etc. with\nprompt-based reasoning, it is promising to utilize VLMs to make better CD under\nlimited labeled data. In this paper, we propose a VLM guidance-based\nsemi-supervised CD method, namely SemiCD-VL. The insight of SemiCD-VL is to\nsynthesize free change labels using VLMs to provide additional supervision\nsignals for unlabeled data. However, almost all current VLMs are designed for\nsingle-temporal images and cannot be directly applied to bi- or multi-temporal\nimages. Motivated by this, we first propose a VLM-based mixed change event\ngeneration (CEG) strategy to yield pseudo labels for unlabeled CD data. Since\nthe additional supervised signals provided by these VLM-driven pseudo labels\nmay conflict with the pseudo labels from the consistency regularization\nparadigm (e.g. FixMatch), we propose the dual projection head for de-entangling\ndifferent signal sources. 
Further, we explicitly decouple the bi-temporal\nimages semantic representation through two auxiliary segmentation decoders,\nwhich are also guided by VLM. Finally, to make the model more adequately\ncapture change representations, we introduce metric-aware supervision by\nfeature-level contrastive loss in auxiliary branches. Extensive experiments\nshow the advantage of SemiCD-VL. For instance, SemiCD-VL improves the FixMatch\nbaseline by +5.3 IoU on WHU-CD and by +2.4 IoU on LEVIR-CD with 5% labels. In\naddition, our CEG strategy, in an un-supervised manner, can achieve performance\nfar superior to state-of-the-art un-supervised CD methods.\n","authors":["Kaiyu Li","Xiangyong Cao","Yupeng Deng","Jiayi Song","Junmin Liu","Deyu Meng","Zhi Wang"],"pdf_url":"https://arxiv.org/pdf/2405.04788v5.pdf","comment":"13 pages, 6 figures"},{"id":"http://arxiv.org/abs/2411.17788v2","updated":"2024-12-01T08:00:56Z","published":"2024-11-26T15:29:38Z","title":"Geometric Point Attention Transformer for 3D Shape Reassembly","summary":" Shape assembly, which aims to reassemble separate parts into a complete\nobject, has gained significant interest in recent years. Existing methods\nprimarily rely on networks to predict the poses of individual parts, but often\nfail to effectively capture the geometric interactions between the parts and\ntheir poses. In this paper, we present the Geometric Point Attention\nTransformer (GPAT), a network specifically designed to address the challenges\nof reasoning about geometric relationships. In the geometric point attention\nmodule, we integrate both global shape information and local pairwise geometric\nfeatures, along with poses represented as rotation and translation vectors for\neach part. To enable iterative updates and dynamic reasoning, we introduce a\ngeometric recycling scheme, where each prediction is fed into the next\niteration for refinement. 
We evaluate our model on both the semantic and\ngeometric assembly tasks, showing that it outperforms previous methods in\nabsolute pose estimation, achieving accurate pose predictions and high\nalignment accuracy.\n","authors":["Jiahan Li","Chaoran Cheng","Jianzhu Ma","Ge Liu"],"pdf_url":"https://arxiv.org/pdf/2411.17788v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.18003v2","updated":"2024-12-01T07:20:36Z","published":"2024-11-27T02:47:17Z","title":"HAAT: Hybrid Attention Aggregation Transformer for Image\n Super-Resolution","summary":" In the research area of image super-resolution, Swin-transformer-based models\nare favored for their global spatial modeling and shifting window attention\nmechanism. However, existing methods often limit self-attention to non\noverlapping windows to cut costs and ignore the useful information that exists\nacross channels. To address this issue, this paper introduces a novel model,\nthe Hybrid Attention Aggregation Transformer (HAAT), designed to better\nleverage feature information. HAAT is constructed by integrating\nSwin-Dense-Residual-Connected Blocks (SDRCB) with Hybrid Grid Attention Blocks\n(HGAB). SDRCB expands the receptive field while maintaining a streamlined\narchitecture, resulting in enhanced performance. HGAB incorporates channel\nattention, sparse attention, and window attention to improve nonlocal feature\nfusion and achieve more visually compelling results. 
Experimental evaluations\ndemonstrate that HAAT surpasses state-of-the-art methods on benchmark datasets.\n Keywords: Image super-resolution, Computer vision, Attention mechanism,\nTransformer\n","authors":["Song-Jiang Lai","Tsun-Hin Cheung","Ka-Chun Fung","Kai-wen Xue","Kin-Man Lam"],"pdf_url":"https://arxiv.org/pdf/2411.18003v2.pdf","comment":"6 pages, 2 figures, 1 table"},{"id":"http://arxiv.org/abs/2406.19048v2","updated":"2024-12-01T07:07:02Z","published":"2024-06-27T09:56:38Z","title":"BiCo-Fusion: Bidirectional Complementary LiDAR-Camera Fusion for\n Semantic- and Spatial-Aware 3D Object Detection","summary":" 3D object detection is an important task that has been widely applied in\nautonomous driving. To perform this task, a new trend is to fuse multi-modal\ninputs, i.e., LiDAR and camera. Under such a trend, recent methods fuse these\ntwo modalities by unifying them in the same 3D space. However, during direct\nfusion in a unified space, the drawbacks of both modalities (LiDAR features\nstruggle with detailed semantic information and the camera lacks accurate 3D\nspatial information) are also preserved, diluting semantic and spatial\nawareness of the final unified representation. To address the issue, this\nletter proposes a novel bidirectional complementary LiDAR-camera fusion\nframework, called BiCo-Fusion that can achieve robust semantic- and\nspatial-aware 3D object detection. The key insight is to fuse LiDAR and camera\nfeatures in a bidirectional complementary way to enhance the semantic awareness\nof the LiDAR and the 3D spatial awareness of the camera. The enhanced features\nfrom both modalities are then adaptively fused to build a semantic- and\nspatial-aware unified representation. 
Specifically, we introduce Pre-Fusion\nconsisting of a Voxel Enhancement Module (VEM) to enhance the semantic\nawareness of voxel features from 2D camera features and Image Enhancement\nModule (IEM) to enhance the 3D spatial awareness of camera features from 3D\nvoxel features. We then introduce Unified Fusion (U-Fusion) to adaptively fuse\nthe enhanced features from the last stage to build a unified representation.\nExtensive experiments demonstrate the superiority of our BiCo-Fusion against\nthe prior arts. Project page: https://t-ys.github.io/BiCo-Fusion/.\n","authors":["Yang Song","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2406.19048v2.pdf","comment":"8 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.00765v2","updated":"2024-12-01T06:08:00Z","published":"2024-08-01T17:59:54Z","title":"MM-Vet v2: A Challenging Benchmark to Evaluate Large Multimodal Models\n for Integrated Capabilities","summary":" MM-Vet, with open-ended vision-language questions targeting at evaluating\nintegrated capabilities, has become one of the most popular benchmarks for\nlarge multimodal model evaluation. MM-Vet assesses six core vision-language\n(VL) capabilities: recognition, knowledge, spatial awareness, language\ngeneration, OCR, and math. However, its question format is restricted to single\nimage-text pairs, lacking the interleaved image and text sequences prevalent in\nreal-world scenarios. To address this limitation, we introduce MM-Vet v2, which\nincludes a new VL capability called \"image-text sequence understanding\",\nevaluating models' ability to process VL sequences. Furthermore, we maintain\nthe high quality of evaluation samples while further expanding the evaluation\nset size. Using MM-Vet v2 to benchmark large multimodal models, we found that\nClaude 3.5 Sonnet is the best model with a score of 71.8, slightly\noutperforming GPT-4o which scored 71.0. Among open-weight models,\nInternVL2-Llama3-76B leads with a score of 68.4. 
The code, data, and\nleaderboard are accessible at https://github.com/yuweihao/MM-Vet.\n","authors":["Weihao Yu","Zhengyuan Yang","Lingfeng Ren","Linjie Li","Jianfeng Wang","Kevin Lin","Chung-Ching Lin","Zicheng Liu","Lijuan Wang","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2408.00765v2.pdf","comment":"Code, data and leaderboard: https://github.com/yuweihao/MM-Vet"},{"id":"http://arxiv.org/abs/2308.02490v4","updated":"2024-12-01T05:46:03Z","published":"2023-08-04T17:59:47Z","title":"MM-Vet: Evaluating Large Multimodal Models for Integrated Capabilities","summary":" We propose MM-Vet, an evaluation benchmark that examines large multimodal\nmodels (LMMs) on complicated multimodal tasks. Recent LMMs have shown various\nintriguing abilities, such as solving math problems written on the blackboard,\nreasoning about events and celebrities in news images, and explaining visual\njokes. Rapid model advancements pose challenges to evaluation benchmark\ndevelopment. Problems include: (1) How to systematically structure and evaluate\nthe complicated multimodal tasks; (2) How to design evaluation metrics that\nwork well across question and answer types; and (3) How to give model insights\nbeyond a simple performance ranking. To this end, we present MM-Vet, designed\nbased on the insight that the intriguing ability to solve complicated tasks is\noften achieved by a generalist model being able to integrate different core\nvision-language (VL) capabilities. MM-Vet defines 6 core VL capabilities and\nexamines the 16 integrations of interest derived from the capability\ncombination. For evaluation metrics, we propose an LLM-based evaluator for\nopen-ended outputs. The evaluator enables the evaluation across different\nquestion types and answer styles, resulting in a unified scoring metric. 
We\nevaluate representative LMMs on MM-Vet, providing insights into the\ncapabilities of different LMM system paradigms and models.\n","authors":["Weihao Yu","Zhengyuan Yang","Linjie Li","Jianfeng Wang","Kevin Lin","Zicheng Liu","Xinchao Wang","Lijuan Wang"],"pdf_url":"https://arxiv.org/pdf/2308.02490v4.pdf","comment":"ICML 2024. Code, data and leaderboard:\n https://github.com/yuweihao/MM-Vet"},{"id":"http://arxiv.org/abs/2405.18560v2","updated":"2024-12-01T05:22:22Z","published":"2024-05-28T20:10:06Z","title":"Potential Field Based Deep Metric Learning","summary":" Deep metric learning (DML) involves training a network to learn a\nsemantically meaningful representation space. Many current approaches mine\nn-tuples of examples and model interactions within each tuplets. We present a\nnovel, compositional DML model, inspired by electrostatic fields in physics\nthat, instead of in tuples, represents the influence of each example\n(embedding) by a continuous potential field, and superposes the fields to\nobtain their combined global potential field. We use attractive/repulsive\npotential fields to represent interactions among embeddings from images of the\nsame/different classes. Contrary to typical learning methods, where mutual\ninfluence of samples is proportional to their distance, we enforce reduction in\nsuch influence with distance, leading to a decaying field. We show that such\ndecay helps improve performance on real world datasets with large intra-class\nvariations and label noise. Like other proxy-based methods, we also use proxies\nto succinctly represent sub-populations of examples. 
We evaluate our method on\nthree standard DML benchmarks- Cars-196, CUB-200-2011, and SOP datasets where\nit outperforms state-of-the-art baselines.\n","authors":["Shubhang Bhatnagar","Narendra Ahuja"],"pdf_url":"https://arxiv.org/pdf/2405.18560v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.05846v2","updated":"2024-12-01T05:03:30Z","published":"2024-05-09T15:32:00Z","title":"An Inversion-based Measure of Memorization for Diffusion Models","summary":" The past few years have witnessed substantial advances in image generation\npowered by diffusion models. However, it was shown that diffusion models are\nvulnerable to training data memorization, raising concerns regarding copyright\ninfringement and privacy invasion. This study delves into a rigorous analysis\nof memorization in diffusion models. We introduce an inversion-based measure of\nmemorization, InvMM, which searches for a sensitive latent noise distribution\naccounting for the replication of an image. For accurate estimation of the\nmemorization score, we propose an adaptive algorithm that balances the\nnormality and sensitivity of the inverted distribution. Comprehensive\nexperiments, conducted on both unconditional and text-guided diffusion models,\ndemonstrate that InvMM is capable of detecting heavily memorized images and\nelucidating the effect of various factors on memorization. Additionally, we\ndiscuss how memorization differs from membership. 
In practice, InvMM serves as\na useful tool for model developers to reliably assess the risk of memorization,\nthereby contributing to the enhancement of trustworthiness and\nprivacy-preserving capabilities of diffusion models.\n","authors":["Zhe Ma","Qingming Li","Xuhong Zhang","Tianyu Du","Ruixiao Lin","Zonghui Wang","Shouling Ji","Wenzhi Chen"],"pdf_url":"https://arxiv.org/pdf/2405.05846v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08149v2","updated":"2024-12-01T03:29:46Z","published":"2024-08-15T13:35:59Z","title":"Unsupervised Variational Translator for Bridging Image Restoration and\n High-Level Vision Tasks","summary":" Recent research tries to extend image restoration capabilities from human\nperception to machine perception, thereby enhancing the performance of\nhigh-level vision tasks in degraded environments. These methods, primarily\nbased on supervised learning, typically involve the retraining of restoration\nnetworks or high-level vision networks. However, collecting paired data in\nreal-world scenarios and retraining large-scale models are challenge. To this\nend, we propose an unsupervised learning method called \\textbf{Va}riational\n\\textbf{T}ranslator (VaT), which does not require retraining existing\nrestoration and high-level vision networks. Instead, it establishes a\nlightweight network that serves as an intermediate bridge between them. By\nvariational inference, VaT approximates the joint distribution of restoration\noutput and high-level vision input, dividing the optimization objective into\npreserving content and maximizing marginal likelihood associated with\nhigh-level vision tasks. By cleverly leveraging self-training paradigms, VaT\nachieves the above optimization objective without requiring labels. As a\nresult, the translated images maintain a close resemblance to their original\ncontent while also demonstrating exceptional performance on high-level vision\ntasks. 
Extensive experiments in dehazing and low-light enhancement for\ndetection and classification show the superiority of our method over other\nstate-of-the-art unsupervised counterparts, even significantly surpassing\nsupervised methods in some complex real-world scenarios.\n","authors":["Jiawei Wu","Zhi Jin"],"pdf_url":"https://arxiv.org/pdf/2408.08149v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.12279v3","updated":"2024-12-01T02:12:08Z","published":"2024-11-19T06:57:45Z","title":"HouseLLM: LLM-Assisted Two-Phase Text-to-Floorplan Generation","summary":" This paper proposes a two-phase text-to-floorplan generation method, which\nguides a Large Language Model (LLM) to generate an initial layout (Layout-LLM)\nand refines them into the final floorplans through conditional diffusion model.\nWe incorporate a Chain-of-Thought approach to prompt the LLM based on user text\nspecifications, enabling a more user-friendly and intuitive house layout\ndesign. This method allows users to describe their needs in natural language,\nenhancing accessibility and providing clearer geometric constraints. The final\nfloorplans generated by Layout-LLM through conditional diffusion refinement are\nmore accurate and better meet user requirements. Experimental results\ndemonstrate that our approach achieves state-of-the-art performance across all\nmetrics, validating its effectiveness in practical home design applications. We\nplan to release our code for public use.\n","authors":["Ziyang Zong","Zhaohuan Zhan","Guang Tan"],"pdf_url":"https://arxiv.org/pdf/2411.12279v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.23904v2","updated":"2024-12-01T01:35:19Z","published":"2024-10-31T13:06:29Z","title":"EZ-HOI: VLM Adaptation via Guided Prompt Learning for Zero-Shot HOI\n Detection","summary":" Detecting Human-Object Interactions (HOI) in zero-shot settings, where models\nmust handle unseen classes, poses significant challenges. 
Existing methods that\nrely on aligning visual encoders with large Vision-Language Models (VLMs) to\ntap into the extensive knowledge of VLMs, require large, computationally\nexpensive models and encounter training difficulties. Adapting VLMs with prompt\nlearning offers an alternative to direct alignment. However, fine-tuning on\ntask-specific datasets often leads to overfitting to seen classes and\nsuboptimal performance on unseen classes, due to the absence of unseen class\nlabels. To address these challenges, we introduce a novel prompt learning-based\nframework for Efficient Zero-Shot HOI detection (EZ-HOI). First, we introduce\nLarge Language Model (LLM) and VLM guidance for learnable prompts, integrating\ndetailed HOI descriptions and visual semantics to adapt VLMs to HOI tasks.\nHowever, because training datasets contain seen-class labels alone, fine-tuning\nVLMs on such datasets tends to optimize learnable prompts for seen classes\ninstead of unseen ones. Therefore, we design prompt learning for unseen classes\nusing information from related seen classes, with LLMs utilized to highlight\nthe differences between unseen and related seen classes. Quantitative\nevaluations on benchmark datasets demonstrate that our EZ-HOI achieves\nstate-of-the-art performance across various zero-shot settings with only 10.35%\nto 33.95% of the trainable parameters compared to existing methods. Code is\navailable at https://github.com/ChelsieLei/EZ-HOI.\n","authors":["Qinqian Lei","Bo Wang","Robby T. Tan"],"pdf_url":"https://arxiv.org/pdf/2410.23904v2.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2406.17146v2","updated":"2024-12-01T00:19:01Z","published":"2024-06-24T21:36:01Z","title":"Vastextures: Vast repository of textures and PBR materials extracted\n from real-world images using unsupervised methods","summary":" Vastextures is a vast repository of 500,000 textures and PBR materials\nextracted from real-world images using an unsupervised process. 
The extracted\nmaterials and textures are extremely diverse and cover a vast range of\nreal-world patterns, but at the same time less refined compared to existing\nrepositories. The repository is composed of 2D textures cropped from natural\nimages and SVBRDF/PBR materials generated from these textures. Textures and PBR\nmaterials are essential for CGI. Existing materials repositories focus on\ngames, animation, and arts, that demand a limited amount of high-quality\nassets. However, virtual worlds and synthetic data are becoming increasingly\nimportant for training A.I systems for computer vision. This application\ndemands a huge amount of diverse assets but at the same time less affected by\nnoisy and unrefined assets. Vastexture aims to address this need by creating a\nfree, huge, and diverse assets repository that covers as many real-world\nmaterials as possible. The materials are automatically extracted from natural\nimages in two steps: 1) Automatically scanning a giant amount of images to\nidentify and crop regions with uniform textures. This is done by splitting the\nimage into a grid of cells and identifying regions in which all of the cells\nshare a similar statistical distribution. 2) Extracting the properties of the\nPBR material from the cropped texture. This is done by randomly guessing every\ncorrelation between the properties of the texture image and the properties of\nthe PBR material. The resulting PBR materials exhibit a vast amount of\nreal-world patterns as well as unexpected emergent properties. Neutral nets\ntrained on this repository outperformed nets trained using handcrafted assets.\n","authors":["Sagi Eppel"],"pdf_url":"https://arxiv.org/pdf/2406.17146v2.pdf","comment":"Vastexture was published as part of Learning Zero-Shot Material\n States Segmentation, by Implanting Natural Image Patterns in Synthetic Data,\n refer to this work in citations. 
This document gives a more detailed and\n technical discussion of this repository"},{"id":"http://arxiv.org/abs/2309.03468v2","updated":"2024-12-01T00:09:44Z","published":"2023-09-07T03:33:49Z","title":"Support-Set Context Matters for Bongard Problems","summary":" Current machine learning methods struggle to solve Bongard problems, which\nare a type of IQ test that requires deriving an abstract \"concept\" from a set\nof positive and negative \"support\" images, and then classifying whether or not\na new query image depicts the key concept. On Bongard-HOI, a benchmark for\nnatural-image Bongard problems, most existing methods have reached at best 69%\naccuracy (where chance is 50%). Low accuracy is often attributed to neural\nnets' lack of ability to find human-like symbolic rules. In this work, we point\nout that many existing methods are forfeiting accuracy due to a much simpler\nproblem: they do not adapt image features given information contained in the\nsupport set as a whole, and rely instead on information extracted from\nindividual supports. This is a critical issue, because the \"key concept\" in a\ntypical Bongard problem can often only be distinguished using multiple\npositives and multiple negatives. We explore simple methods to incorporate this\ncontext and show substantial gains over prior works, leading to new\nstate-of-the-art accuracy on Bongard-LOGO (75.3%) and Bongard-HOI (76.4%)\ncompared to methods with equivalent vision backbone architectures and strong\nperformance on the original Bongard problem set (60.8%).\n","authors":["Nikhil Raghuraman","Adam W. Harley","Leonidas Guibas"],"pdf_url":"https://arxiv.org/pdf/2309.03468v2.pdf","comment":"TMLR October 2024. 
Code:\n https://github.com/nraghuraman/bongard-context"}],"Artificial Intelligence":[{"id":"http://arxiv.org/abs/2311.09614v3","updated":"2024-12-01T22:01:58Z","published":"2023-11-16T06:58:46Z","title":"Comprehensive framework for evaluation of deep neural networks in\n detection and quantification of lymphoma from PET/CT images: clinical\n insights, pitfalls, and observer agreement analyses","summary":" This study addresses critical gaps in automated lymphoma segmentation from\nPET/CT images, focusing on issues often overlooked in existing literature.\nWhile deep learning has been applied for lymphoma lesion segmentation, few\nstudies incorporate out-of-distribution testing, raising concerns about model\ngeneralizability across diverse imaging conditions and patient populations. We\nhighlight the need to compare model performance with expert human annotators,\nincluding intra- and inter-observer variability, to understand task difficulty\nbetter. Most approaches focus on overall segmentation accuracy but overlook\nlesion-specific metrics important for precise lesion detection and disease\nquantification.To address these gaps, we propose a clinically-relevant\nframework for evaluating deep neural networks. Using this lesion-specific\nevaluation, we assess the performance of four deep segmentation networks\n(ResUNet, SegResNet, DynUNet, and SwinUNETR) across 611 cases from\nmulti-institutional datasets, covering various lymphoma subtypes and lesion\ncharacteristics. Beyond standard metrics like the Dice similarity coefficient\n(DSC), we evaluate clinical lesion measures and their prediction errors. We\nalso introduce detection criteria for lesion localization and propose a new\ndetection Criterion 3 based on metabolic characteristics. 
We show that networks\nperform better on large, intense lesions with higher metabolic\nactivity.Finally, we compare network performance to expert human observers via\nintra- and inter-observer variability analyses, demonstrating that network\nerrors closely resemble those made by experts. Some small, faint lesions remain\nchallenging for both humans and networks. This study aims to improve automated\nlesion segmentation's clinical relevance, supporting better treatment decisions\nfor lymphoma patients. The code is available at:\nhttps://github.com/microsoft/lymphoma-segmentation-dnn\n","authors":["Shadab Ahamed","Yixi Xu","Sara Kurkowska","Claire Gowdy","Joo H. O","Ingrid Bloise","Don Wilson","Patrick Martineau","François Bénard","Fereshteh Yousefirizi","Rahul Dodhia","Juan M. Lavista","William B. Weeks","Carlos F. Uribe","Arman Rahmim"],"pdf_url":"https://arxiv.org/pdf/2311.09614v3.pdf","comment":"32 pages, 15 figures, 5 tables"},{"id":"http://arxiv.org/abs/2308.10792v8","updated":"2024-12-01T22:01:51Z","published":"2023-08-21T15:35:16Z","title":"Instruction Tuning for Large Language Models: A Survey","summary":" This paper surveys research works in the quickly advancing field of\ninstruction tuning (IT), which can also be referred to as supervised\nfine-tuning (SFT)\\footnote{In this paper, unless specified otherwise,\nsupervised fine-tuning (SFT) and instruction tuning (IT) are used\ninterchangeably.}, a crucial technique to enhance the capabilities and\ncontrollability of large language models (LLMs). Instruction tuning refers to\nthe process of further training LLMs on a dataset consisting of\n\\textsc{(instruction, output)} pairs in a supervised fashion, which bridges the\ngap between the next-word prediction objective of LLMs and the users' objective\nof having LLMs adhere to human instructions. 
In this work, we make a systematic\nreview of the literature, including the general methodology of SFT, the\nconstruction of SFT datasets, the training of SFT models, and applications to\ndifferent modalities, domains and application, along with analysis on aspects\nthat influence the outcome of SFT (e.g., generation of instruction outputs,\nsize of the instruction dataset, etc). We also review the potential pitfalls of\nSFT along with criticism against it, along with efforts pointing out current\ndeficiencies of existing strategies and suggest some avenues for fruitful\nresearch. Project Page: github.com/xiaoya-li/Instruction-Tuning-Survey\n","authors":["Shengyu Zhang","Linfeng Dong","Xiaoya Li","Sen Zhang","Xiaofei Sun","Shuhe Wang","Jiwei Li","Runyi Hu","Tianwei Zhang","Fei Wu","Guoyin Wang"],"pdf_url":"https://arxiv.org/pdf/2308.10792v8.pdf","comment":"V5; Last update: Dec. 1, 2024"},{"id":"http://arxiv.org/abs/2303.03106v3","updated":"2024-12-01T19:15:57Z","published":"2023-03-03T10:53:30Z","title":"Rotation Invariant Quantization for Model Compression","summary":" Post-training Neural Network (NN) model compression is an attractive approach\nfor deploying large, memory-consuming models on devices with limited memory\nresources. In this study, we investigate the rate-distortion tradeoff for NN\nmodel compression. First, we suggest a Rotation-Invariant Quantization (RIQ)\ntechnique that utilizes a single parameter to quantize the entire NN model,\nyielding a different rate at each layer, i.e., mixed-precision quantization.\nThen, we prove that our rotation-invariant approach is optimal in terms of\ncompression. We rigorously evaluate RIQ and demonstrate its capabilities on\nvarious models and tasks. For example, RIQ facilitates $\\times 19.4$ and\n$\\times 52.9$ compression ratios on pre-trained VGG dense and pruned models,\nrespectively, with $<0.4\\%$ accuracy degradation. 
Code is available in\n\\href{https://github.com/ehaleva/RIQ}{github.com/ehaleva/RIQ}.\n","authors":["Joseph Kampeas","Yury Nahshan","Hanoch Kremer","Gil Lederman","Shira Zaloshinski","Zheng Li","Emir Haleva"],"pdf_url":"https://arxiv.org/pdf/2303.03106v3.pdf","comment":"20 pages, 5 figures"},{"id":"http://arxiv.org/abs/2411.16740v2","updated":"2024-12-01T19:13:25Z","published":"2024-11-23T18:14:42Z","title":"Document Haystacks: Vision-Language Reasoning Over Piles of 1000+\n Documents","summary":" Large multimodal models (LMMs) have achieved impressive progress in\nvision-language understanding, yet they face limitations in real-world\napplications requiring complex reasoning over a large number of images.\nExisting benchmarks for multi-image question-answering are limited in scope,\neach question is paired with only up to 30 images, which does not fully capture\nthe demands of large-scale retrieval tasks encountered in the real-world\nusages. To reduce these gaps, we introduce two document haystack benchmarks,\ndubbed DocHaystack and InfoHaystack, designed to evaluate LMM performance on\nlarge-scale visual document retrieval and understanding. Additionally, we\npropose V-RAG, a novel, vision-centric retrieval-augmented generation (RAG)\nframework that leverages a suite of multimodal vision encoders, each optimized\nfor specific strengths, and a dedicated question-document relevance module.\nV-RAG sets a new standard, with a 9% and 11% improvement in Recall@1 on the\nchallenging DocHaystack-1000 and InfoHaystack-1000 benchmarks, respectively,\ncompared to the previous best baseline models. Additionally, integrating V-RAG\nwith LMMs enables them to efficiently operate across thousands of images,\nyielding significant improvements on our DocHaystack and InfoHaystack\nbenchmarks. 
Our code and datasets are available at\nhttps://github.com/Vision-CAIR/dochaystacks\n","authors":["Jun Chen","Dannong Xu","Junjie Fei","Chun-Mei Feng","Mohamed Elhoseiny"],"pdf_url":"https://arxiv.org/pdf/2411.16740v2.pdf","comment":"the correct arxiv version"},{"id":"http://arxiv.org/abs/2303.17614v2","updated":"2024-12-01T19:02:51Z","published":"2023-03-30T02:54:59Z","title":"Estimating Continuous Muscle Fatigue For Multi-Muscle Coordinated\n Exercise: A Pilot Study on Walking","summary":" Assessing the progression of muscle fatigue for daily exercises provides\nvital indicators for precise rehabilitation, personalized training dose,\nespecially under the context of Metaverse. Assessing fatigue of multi-muscle\ncoordination-involved daily exercises requires the neuromuscular features that\nrepresent the fatigue-induced characteristics of spatiotemporal adaptions of\nmultiple muscles and the estimator that captures the time-evolving progression\nof fatigue. In this paper, we propose to depict fatigue by the features of\nmuscle compensation and spinal module activation changes and estimate\ncontinuous fatigue by a physiological rationale model. First, we extract muscle\nsynergy fractionation and the variance of spinal module spikings as features\ninspired by the prior of fatigue-induced neuromuscular adaptations. Second, we\ntreat the features as observations and develop a Bayesian Gaussian process to\ncapture the time-evolving progression. Third, we solve the issue of lacking\nsupervision information by mathematically formulating the time-evolving\ncharacteristics of fatigue as the loss function. Finally, we adapt the metrics\nthat follow the physiological principles of fatigue to quantitatively evaluate\nthe performance. Our extensive experiments present a 0.99 similarity between\ndays, a over 0.7 similarity with other views of fatigue and a nearly 1 weak\nmonotonicity, which outperform other methods. 
This study would aim the\nobjective assessment of muscle fatigue.\n","authors":["Chunzhi Yi","Xiaolei Sun","Chunyu Zhang","Wei Jin","Jianfei Zhu","Haiqi Zhu","Baichun Wei"],"pdf_url":"https://arxiv.org/pdf/2303.17614v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.17855v2","updated":"2024-12-01T19:02:28Z","published":"2024-11-26T20:11:46Z","title":"\"Give me the code\" -- Log Analysis of First-Year CS Students'\n Interactions With GPT","summary":" The impact of Large Language Models (LLMs) like GPT-3, GPT-4, and Bard in\ncomputer science (CS) education is expected to be profound. Students now have\nthe power to generate code solutions for a wide array of programming\nassignments. For first-year students, this may be particularly problematic\nsince the foundational skills are still in development and an over-reliance on\ngenerative AI tools can hinder their ability to grasp essential programming\nconcepts. This paper analyzes the prompts used by 69 freshmen undergraduate\nstudents to solve a certain programming problem within a project assignment,\nwithout giving them prior prompt training. We also present the rules of the\nexercise that motivated the prompts, designed to foster critical thinking\nskills during the interaction. Despite using unsophisticated prompting\ntechniques, our findings suggest that the majority of students successfully\nleveraged GPT, incorporating the suggested solutions into their projects.\nAdditionally, half of the students demonstrated the ability to exercise\njudgment in selecting from multiple GPT-generated solutions, showcasing the\ndevelopment of their critical thinking skills in evaluating AI-generated code.\n","authors":["Pedro Alves","Bruno Pereira Cipriano"],"pdf_url":"https://arxiv.org/pdf/2411.17855v2.pdf","comment":"This is the author's version of the work. It is posted here for your\n personal use. 
Not for redistribution"},{"id":"http://arxiv.org/abs/1802.07228v2","updated":"2024-12-01T17:59:04Z","published":"2018-02-20T18:07:50Z","title":"The Malicious Use of Artificial Intelligence: Forecasting, Prevention,\n and Mitigation","summary":" This report surveys the landscape of potential security threats from\nmalicious uses of AI, and proposes ways to better forecast, prevent, and\nmitigate these threats. After analyzing the ways in which AI may influence the\nthreat landscape in the digital, physical, and political domains, we make four\nhigh-level recommendations for AI researchers and other stakeholders. We also\nsuggest several promising areas for further research that could expand the\nportfolio of defenses, or make attacks less effective or harder to execute.\nFinally, we discuss, but do not conclusively resolve, the long-term equilibrium\nof attackers and defenders.\n","authors":["Miles Brundage","Shahar Avin","Jack Clark","Helen Toner","Peter Eckersley","Ben Garfinkel","Allan Dafoe","Paul Scharre","Thomas Zeitzoff","Bobby Filar","Hyrum Anderson","Heather Roff","Gregory C. Allen","Jacob Steinhardt","Carrick Flynn","Seán Ó hÉigeartaigh","SJ Beard","Haydn Belfield","Sebastian Farquhar","Clare Lyle","Rebecca Crootof","Owain Evans","Michael Page","Joanna Bryson","Roman Yampolskiy","Dario Amodei"],"pdf_url":"https://arxiv.org/pdf/1802.07228v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.09566v2","updated":"2024-12-01T17:48:24Z","published":"2024-09-15T00:53:44Z","title":"Learning Transferable Features for Implicit Neural Representations","summary":" Implicit neural representations (INRs) have demonstrated success in a variety\nof applications, including inverse problems and neural rendering. An INR is\ntypically trained to capture one signal of interest, resulting in learned\nneural features that are highly attuned to that signal. 
Assumed to be less\ngeneralizable, we explore the aspect of transferability of such learned neural\nfeatures for fitting similar signals. We introduce a new INR training\nframework, STRAINER that learns transferrable features for fitting INRs to new\nsignals from a given distribution, faster and with better reconstruction\nquality. Owing to the sequential layer-wise affine operations in an INR, we\npropose to learn transferable representations by sharing initial encoder layers\nacross multiple INRs with independent decoder layers. At test time, the learned\nencoder representations are transferred as initialization for an otherwise\nrandomly initialized INR. We find STRAINER to yield extremely powerful\ninitialization for fitting images from the same domain and allow for $\\approx\n+10dB$ gain in signal quality early on compared to an untrained INR itself.\nSTRAINER also provides a simple way to encode data-driven priors in INRs. We\nevaluate STRAINER on multiple in-domain and out-of-domain signal fitting tasks\nand inverse problems and further provide detailed analysis and discussion on\nthe transferability of STRAINER's features. Our demo can be accessed at\nhttps://colab.research.google.com/drive/1fBZAwqE8C_lrRPAe-hQZJTWrMJuAKtG2?usp=sharing .\n","authors":["Kushal Vyas","Ahmed Imtiaz Humayun","Aniket Dashpute","Richard G. 
Baraniuk","Ashok Veeraraghavan","Guha Balakrishnan"],"pdf_url":"https://arxiv.org/pdf/2409.09566v2.pdf","comment":"Project Website: https://kushalvyas.github.io/strainer.html"},{"id":"http://arxiv.org/abs/2406.12644v3","updated":"2024-12-01T17:45:28Z","published":"2024-06-18T14:12:27Z","title":"Hierarchical Prompting Taxonomy: A Universal Evaluation Framework for\n Large Language Models","summary":" Assessing the effectiveness of large language models (LLMs) in performing\ndifferent tasks is crucial for understanding their strengths and weaknesses.\nThis paper presents the Hierarchical Prompting Taxonomy (HPT), grounded on\nhuman cognitive principles and designed to assess LLMs by examining the\ncognitive demands of various tasks. The HPT uses the Hierarchical Prompting\nFramework (HPF), a prompt selection framework that organizes five distinct\nprompting strategies by their cognitive load on LLMs. This study introduces the\nHierarchical Prompting Index (HPI) to measure task complexity, which\ndemonstrates LLMs' abilities across different datasets and serves as a\nuniversal metric for task complexity. The HPT offers a reliable method for\nevaluating LLMs' problem-solving skills in diverse scenarios, leading to\nclearer conclusions. Extensive experiments with multiple datasets and LLMs show\nthat the HPF enhances LLM performance by 2\\% to 63\\% compared to standard\nbenchmark datasets, confirming the effectiveness of the HPT. 
To support future\nresearch in this domain, the implementations of HPT and HPF are publicly\navailable\n","authors":["Devichand Budagam","Ashutosh Kumar","Mahsa Khoshnoodi","Sankalp KJ","Vinija Jain","Aman Chadha"],"pdf_url":"https://arxiv.org/pdf/2406.12644v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.05227v5","updated":"2024-12-01T16:50:02Z","published":"2022-09-12T13:26:26Z","title":"DUET: A Tuning-Free Device-Cloud Collaborative Parameters Generation\n Framework for Efficient Device Model Generalization","summary":" Device Model Generalization (DMG) is a practical yet under-investigated\nresearch topic for on-device machine learning applications. It aims to improve\nthe generalization ability of pre-trained models when deployed on\nresource-constrained devices, such as improving the performance of pre-trained\ncloud models on smart mobiles. While quite a lot of works have investigated the\ndata distribution shift across clouds and devices, most of them focus on model\nfine-tuning on personalized data for individual devices to facilitate DMG.\nDespite their promising, these approaches require on-device re-training, which\nis practically infeasible due to the overfitting problem and high time delay\nwhen performing gradient calculation on real-time data. In this paper, we argue\nthat the computational cost brought by fine-tuning can be rather unnecessary.\nWe consequently present a novel perspective to improving DMG without increasing\ncomputational cost, i.e., device-specific parameter generation which directly\nmaps data distribution to parameters. Specifically, we propose an efficient\nDevice-cloUd collaborative parametErs generaTion framework DUET. DUET is\ndeployed on a powerful cloud server that only requires the low cost of\nforwarding propagation and low time delay of data transmission between the\ndevice and the cloud. 
By doing so, DUET can rehearse the device-specific model\nweight realizations conditioned on the personalized real-time data for an\nindividual device. Importantly, our DUET elegantly connects the cloud and\ndevice as a 'duet' collaboration, frees the DMG from fine-tuning, and enables a\nfaster and more accurate DMG paradigm. We conduct an extensive experimental\nstudy of DUET on three public datasets, and the experimental results confirm\nour framework's effectiveness and generalisability for different DMG tasks.\n","authors":["Zheqi Lv","Wenqiao Zhang","Shengyu Zhang","Kun Kuang","Feng Wang","Yongwei Wang","Zhengyu Chen","Tao Shen","Hongxia Yang","Beng Chin Ooi","Fei Wu"],"pdf_url":"https://arxiv.org/pdf/2209.05227v5.pdf","comment":"Published on WWW'23: Proceedings of the ACM on Web Conference 2023\n (pp. 3077 - 3085)"},{"id":"http://arxiv.org/abs/2302.07335v3","updated":"2024-12-01T16:41:49Z","published":"2023-02-14T20:44:12Z","title":"Intelligent Model Update Strategy for Sequential Recommendation","summary":" Modern online platforms are increasingly employing recommendation systems to\naddress information overload and improve user engagement. There is an evolving\nparadigm in this research field that recommendation network learning occurs\nboth on the cloud and on edges with knowledge transfer in between (i.e.,\nedge-cloud collaboration). Recent works push this field further by enabling\nedge-specific context-aware adaptivity, where model parameters are updated in\nreal-time based on incoming on-edge data. However, we argue that frequent data\nexchanges between the cloud and edges often lead to inefficiency and waste of\ncommunication/computation resources, as considerable parameter updates might be\nredundant. 
To investigate this problem, we introduce Intelligent Edge-Cloud\nParameter Request Model, abbreviated as IntellectReq.\n IntellectReq is designed to operate on edge, evaluating the cost-benefit\nlandscape of parameter requests with minimal computation and communication\noverhead. We formulate this as a novel learning task, aimed at the detection of\nout-of-distribution data, thereby fine-tuning adaptive communication\nstrategies. Further, we employ statistical mapping techniques to convert\nreal-time user behavior into a normal distribution, thereby employing\nmulti-sample outputs to quantify the model's uncertainty and thus its\ngeneralization capabilities. Rigorous empirical validation on four\nwidely-adopted benchmarks evaluates our approach, evidencing a marked\nimprovement in the efficiency and generalizability of edge-cloud collaborative\nand dynamic recommendation systems.\n","authors":["Zheqi Lv","Wenqiao Zhang","Zhengyu Chen","Shengyu Zhang","Kun Kuang"],"pdf_url":"https://arxiv.org/pdf/2302.07335v3.pdf","comment":"Published on WWW'24(Oral): Proceedings of the ACM on Web Conference\n 2024 (pp. 3117-3128)"},{"id":"http://arxiv.org/abs/2408.01444v2","updated":"2024-12-01T16:11:18Z","published":"2024-07-21T16:11:00Z","title":"No Size Fits All: The Perils and Pitfalls of Leveraging LLMs Vary with\n Company Size","summary":" Large language models (LLMs) are playing a pivotal role in deploying\nstrategic use cases across a range of organizations, from large pan-continental\ncompanies to emerging startups. The issues and challenges involved in the\nsuccessful utilization of LLMs can vary significantly depending on the size of\nthe organization. It is important to study and discuss these pertinent issues\nof LLM adaptation with a focus on the scale of the industrial concerns and\nbrainstorm possible solutions and prospective directions. Such a study has not\nbeen prominently featured in the current research literature. 
In this study, we\nadopt a threefold strategy: first, we conduct a case study with industry\npractitioners to formulate the key research questions; second, we examine\nexisting industrial publications to address these questions; and finally, we\nprovide a practical guide for industries to utilize LLMs more efficiently. We\nrelease the\nGitHub\\footnote{\\url{https://github.com/vinayakcse/IndustrialLLMsPapers}}\nrepository with the most recent papers in the field.\n","authors":["Ashok Urlana","Charaka Vinayak Kumar","Bala Mallikarjunarao Garlapati","Ajeet Kumar Singh","Rahul Mishra"],"pdf_url":"https://arxiv.org/pdf/2408.01444v2.pdf","comment":"COLING2025 Industry track"},{"id":"http://arxiv.org/abs/2410.10578v4","updated":"2024-12-01T15:49:16Z","published":"2024-10-14T14:52:23Z","title":"Burning RED: Unlocking Subtask-Driven Reinforcement Learning and\n Risk-Awareness in Average-Reward Markov Decision Processes","summary":" Average-reward Markov decision processes (MDPs) provide a foundational\nframework for sequential decision-making under uncertainty. However,\naverage-reward MDPs have remained largely unexplored in reinforcement learning\n(RL) settings, with the majority of RL-based efforts having been allocated to\nepisodic and discounted MDPs. In this work, we study a unique structural\nproperty of average-reward MDPs and utilize it to introduce Reward-Extended\nDifferential (or RED) reinforcement learning: a novel RL framework that can be\nused to effectively and efficiently solve various subtasks simultaneously in\nthe average-reward setting. We introduce a family of RED learning algorithms\nfor prediction and control, including proven-convergent algorithms for the\ntabular case. 
We then showcase the power of these algorithms by demonstrating\nhow they can be used to learn a policy that optimizes, for the first time, the\nwell-known conditional value-at-risk (CVaR) risk measure in a fully-online\nmanner, without the use of an explicit bi-level optimization scheme or an\naugmented state-space.\n","authors":["Juan Sebastian Rojas","Chi-Guhn Lee"],"pdf_url":"https://arxiv.org/pdf/2410.10578v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05037v3","updated":"2024-12-01T15:17:03Z","published":"2023-08-09T16:09:44Z","title":"Separate Anything You Describe","summary":" Language-queried audio source separation (LASS) is a new paradigm for\ncomputational auditory scene analysis (CASA). LASS aims to separate a target\nsound from an audio mixture given a natural language query, which provides a\nnatural and scalable interface for digital audio applications. Recent works on\nLASS, despite attaining promising separation performance on specific sources\n(e.g., musical instruments, limited classes of audio events), are unable to\nseparate audio concepts in the open domain. In this work, we introduce\nAudioSep, a foundation model for open-domain audio source separation with\nnatural language queries. We train AudioSep on large-scale multimodal datasets\nand extensively evaluate its capabilities on numerous tasks including audio\nevent separation, musical instrument separation, and speech enhancement.\nAudioSep demonstrates strong separation performance and impressive zero-shot\ngeneralization ability using audio captions or text labels as queries,\nsubstantially outperforming previous audio-queried and language-queried sound\nseparation models. For reproducibility of this work, we will release the source\ncode, evaluation benchmark and pre-trained model at:\nhttps://github.com/Audio-AGI/AudioSep.\n","authors":["Xubo Liu","Qiuqiang Kong","Yan Zhao","Haohe Liu","Yi Yuan","Yuzhuo Liu","Rui Xia","Yuxuan Wang","Mark D. 
Plumbley","Wenwu Wang"],"pdf_url":"https://arxiv.org/pdf/2308.05037v3.pdf","comment":"Code, benchmark and pre-trained models:\n https://github.com/Audio-AGI/AudioSep"},{"id":"http://arxiv.org/abs/2409.07154v2","updated":"2024-12-01T15:07:43Z","published":"2024-09-11T09:59:56Z","title":"Recurrent Aggregators in Neural Algorithmic Reasoning","summary":" Neural algorithmic reasoning (NAR) is an emerging field that seeks to design\nneural networks that mimic classical algorithmic computations. Today, graph\nneural networks (GNNs) are widely used in neural algorithmic reasoners due to\ntheir message passing framework and permutation equivariance. In this extended\nabstract, we challenge this design choice, and replace the equivariant\naggregation function with a recurrent neural network. While seemingly\ncounter-intuitive, this approach has appropriate grounding when nodes have a\nnatural ordering -- and this is the case frequently in established reasoning\nbenchmarks like CLRS-30. Indeed, our recurrent NAR (RNAR) model performs very\nstrongly on such tasks, while handling many others gracefully. A notable\nachievement of RNAR is its decisive state-of-the-art result on the Heapsort and\nQuickselect tasks, both deemed as a significant challenge for contemporary\nneural algorithmic reasoners -- especially the latter, where RNAR achieves a\nmean micro-F1 score of 87%.\n","authors":["Kaijia Xu","Petar Veličković"],"pdf_url":"https://arxiv.org/pdf/2409.07154v2.pdf","comment":"Presented at the Third Learning on Graphs Conference (LoG 2024). 10\n pages, 1 figure"},{"id":"http://arxiv.org/abs/2408.02555v3","updated":"2024-12-01T14:34:01Z","published":"2024-08-05T15:33:45Z","title":"MeshAnything V2: Artist-Created Mesh Generation With Adjacent Mesh\n Tokenization","summary":" Meshes are the de facto 3D representation in the industry but are\nlabor-intensive to produce. Recently, a line of research has focused on\nautoregressively generating meshes. 
This approach processes meshes into a\nsequence composed of vertices and then generates them vertex by vertex, similar\nto how a language model generates text. These methods have achieved some\nsuccess but still struggle to generate complex meshes. One primary reason for\nthis limitation is their inefficient tokenization methods. To address this\nissue, we introduce MeshAnything V2, an advanced mesh generation model designed\nto create Artist-Created Meshes that align precisely with specified shapes. A\nkey innovation behind MeshAnything V2 is our novel Adjacent Mesh Tokenization\n(AMT) method. Unlike traditional approaches that represent each face using\nthree vertices, AMT optimizes this by employing a single vertex wherever\nfeasible, effectively reducing the token sequence length by about half on\naverage. This not only streamlines the tokenization process but also results in\nmore compact and well-structured sequences, enhancing the efficiency of mesh\ngeneration. With these improvements, MeshAnything V2 effectively doubles the\nface limit compared to previous models, delivering superior performance without\nincreasing computational costs. We will make our code and models publicly\navailable. 
Project Page: https://buaacyw.github.io/meshanything-v2/\n","authors":["Yiwen Chen","Yikai Wang","Yihao Luo","Zhengyi Wang","Zilong Chen","Jun Zhu","Chi Zhang","Guosheng Lin"],"pdf_url":"https://arxiv.org/pdf/2408.02555v3.pdf","comment":"Project Page: https://buaacyw.github.io/meshanything-v2/ Github:\n https://github.com/buaacyw/MeshAnythingV2"},{"id":"http://arxiv.org/abs/2409.15100v3","updated":"2024-12-01T13:04:28Z","published":"2024-09-23T15:11:40Z","title":"Robust Federated Learning Over the Air: Combating Heavy-Tailed Noise\n with Median Anchored Clipping","summary":" Leveraging over-the-air computations for model aggregation is an effective\napproach to cope with the communication bottleneck in federated edge learning.\nBy exploiting the superposition properties of multi-access channels, this\napproach facilitates an integrated design of communication and computation,\nthereby enhancing system privacy while reducing implementation costs. However,\nthe inherent electromagnetic interference in radio channels often exhibits\nheavy-tailed distributions, giving rise to exceptionally strong noise in\nglobally aggregated gradients that can significantly deteriorate the training\nperformance. To address this issue, we propose a novel gradient clipping\nmethod, termed Median Anchored Clipping (MAC), to combat the detrimental\neffects of heavy-tailed noise. We also derive analytical expressions for the\nconvergence rate of model training with analog over-the-air federated learning\nunder MAC, which quantitatively demonstrates the effect of MAC on training\nperformance. Extensive experimental results show that the proposed MAC\nalgorithm effectively mitigates the impact of heavy-tailed noise, hence\nsubstantially enhancing system robustness.\n","authors":["Jiaxing Li","Zihan Chen","Kai Fong Ernest Chong","Bikramjit Das","Tony Q. S. Quek","Howard H. 
Yang"],"pdf_url":"https://arxiv.org/pdf/2409.15100v3.pdf","comment":"This is the full version of the paper, and the appendix contains a\n complete convergence analysis under non-convex conditions"},{"id":"http://arxiv.org/abs/2407.02292v2","updated":"2024-12-01T11:31:18Z","published":"2024-07-02T14:27:06Z","title":"Strategic Demand-Planning in Wireless Networks: Can Generative-AI Save\n Spectrum and Energy?","summary":" Generative-AI (GenAI), a novel technology capable of producing various types\nof outputs, including text, images, and videos, offers significant potential\nfor wireless communications. This article introduces the concept of strategic\ndemand-planning through demand-labeling, demand-shaping, and\ndemand-rescheduling. Accordingly, GenAI is proposed as a powerful tool to\nfacilitate demand-shaping in wireless networks. More specifically, GenAI is\nused to compress and convert the content of various types (e.g., from a higher\nbandwidth mode to a lower one, such as from a video to text), which\nsubsequently enhances performance of wireless networks in various usage\nscenarios, such as cell-switching, user association and load balancing,\ninterference management, as well as disasters and unusual gatherings.\nTherefore, GenAI can serve a function in saving energy and spectrum in wireless\nnetworks. With recent advancements in AI, including sophisticated algorithms\nlike large language models and the development of more powerful hardware built\nexclusively for AI tasks, such as AI accelerators, the concept of\ndemand-planning, particularly demand-shaping through GenAI, becomes\nincreasingly relevant. 
Furthermore, recent efforts to make GenAI accessible on\ndevices, such as user terminals, make the implementation of this concept even\nmore straightforward and feasible.\n","authors":["Berk Çiloğlu","Görkem Berkay Koç","Afsoon Alidadi Shamsabadi","Metin Ozturk","Halim Yanikomeroglu"],"pdf_url":"https://arxiv.org/pdf/2407.02292v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.18207v2","updated":"2024-12-01T10:23:18Z","published":"2024-11-27T10:33:51Z","title":"From Open Vocabulary to Open World: Teaching Vision Language Models to\n Detect Novel Objects","summary":" Traditional object detection methods operate under the closed-set assumption,\nwhere models can only detect a fixed number of objects predefined in the\ntraining set. Recent works on open vocabulary object detection (OVD) enable the\ndetection of objects defined by an unbounded vocabulary, which reduces the cost\nof training models for specific tasks. However, OVD heavily relies on accurate\nprompts provided by an ''oracle'', which limits their use in critical\napplications such as driving scene perception. OVD models tend to misclassify\nnear-out-of-distribution (NOOD) objects that have similar semantics to known\nclasses, and ignore far-out-of-distribution (FOOD) objects. To address theses\nlimitations, we propose a framework that enables OVD models to operate in open\nworld settings, by identifying and incrementally learning novel objects. To\ndetect FOOD objects, we propose Open World Embedding Learning (OWEL) and\nintroduce the concept of Pseudo Unknown Embedding which infers the location of\nunknown classes in a continuous semantic space based on the information of\nknown classes. We also propose Multi-Scale Contrastive Anchor Learning (MSCAL),\nwhich enables the identification of misclassified unknown objects by promoting\nthe intra-class consistency of object embeddings at different scales. 
The\nproposed method achieves state-of-the-art performance in common open world\nobject detection and autonomous driving benchmarks.\n","authors":["Zizhao Li","Zhengkang Xiang","Joseph West","Kourosh Khoshelham"],"pdf_url":"https://arxiv.org/pdf/2411.18207v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.03768v4","updated":"2024-12-01T08:43:03Z","published":"2024-01-08T09:47:19Z","title":"Corn Yield Prediction Model with Deep Neural Networks for Smallholder\n Farmer Decision Support System","summary":" Crop yield prediction has been modeled on the assumption that there is no\ninteraction between weather and soil variables. However, this paper argues that\nan interaction exists, and it can be finely modelled using the Kendall\nCorrelation coefficient. Given the nonlinearity of the interaction between\nweather and soil variables, a deep neural network regressor (DNNR) is carefully\ndesigned with consideration to the depth, number of neurons of the hidden\nlayers, and the hyperparameters with their optimizations. Additionally, a new\nmetric, the average of absolute root squared error (ARSE) is proposed to\ncombine the strengths of root mean square error (RMSE) and mean absolute error\n(MAE). With the ARSE metric, the proposed DNNR(s), optimised random forest\nregressor (RFR) and the extreme gradient boosting regressor (XGBR) achieved\nimpressively small yield errors, 0.0172 t/ha, and 0.0243 t/ha, 0.0001 t/ha, and\n0.001 t/ha, respectively. However, the DNNR(s), with changes to the explanatory\nvariables to ensure generalizability to unforeseen data, DNNR(s) performed\nbest. Further analysis reveals that a strong interaction does exist between\nweather and soil variables. Precisely, yield is observed to increase when\nprecipitation is reduced and silt increased, and vice-versa. However, the\ndegree of decrease or increase is not quantified in this paper. 
Contrary to\nexisting yield models targeted towards agricultural policies and global food\nsecurity, the goal of the proposed corn yield model is to empower the\nsmallholder farmer to farm smartly and intelligently, thus the prediction model\nis integrated into a mobile application that includes education, and a\nfarmer-to-market access module.\n","authors":["Chollette C. Olisah","Lyndon Smith","Melvyn Smith","Morolake O. Lawrence","Osita Ojukwu"],"pdf_url":"https://arxiv.org/pdf/2401.03768v4.pdf","comment":"30 Pages, 11 Figures, 3 Tables"},{"id":"http://arxiv.org/abs/2411.14491v3","updated":"2024-12-01T08:37:51Z","published":"2024-11-20T12:34:44Z","title":"A Survey on Human-Centric LLMs","summary":" The rapid evolution of large language models (LLMs) and their capacity to\nsimulate human cognition and behavior has given rise to LLM-based frameworks\nand tools that are evaluated and applied based on their ability to perform\ntasks traditionally performed by humans, namely those involving cognition,\ndecision-making, and social interaction. This survey provides a comprehensive\nexamination of such human-centric LLM capabilities, focusing on their\nperformance in both individual tasks (where an LLM acts as a stand-in for a\nsingle human) and collective tasks (where multiple LLMs coordinate to mimic\ngroup dynamics). We first evaluate LLM competencies across key areas including\nreasoning, perception, and social cognition, comparing their abilities to\nhuman-like skills. Then, we explore real-world applications of LLMs in\nhuman-centric domains such as behavioral science, political science, and\nsociology, assessing their effectiveness in replicating human behaviors and\ninteractions. Finally, we identify challenges and future research directions,\nsuch as improving LLM adaptability, emotional intelligence, and cultural\nsensitivity, while addressing inherent biases and enhancing frameworks for\nhuman-AI collaboration. 
This survey aims to provide a foundational\nunderstanding of LLMs from a human-centric perspective, offering insights into\ntheir current capabilities and potential for future development.\n","authors":["Jing Yi Wang","Nicholas Sukiennik","Tong Li","Weikang Su","Qianyue Hao","Jingbo Xu","Zihan Huang","Fengli Xu","Yong Li"],"pdf_url":"https://arxiv.org/pdf/2411.14491v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.09427v2","updated":"2024-12-01T08:33:42Z","published":"2024-01-24T05:28:29Z","title":"DoorINet: Door Heading Prediction through Inertial Deep Learning","summary":" Inertial sensors are widely used in a variety of applications. A common task\nis orientation estimation. To tackle such a task, attitude and heading\nreference system algorithms are applied. Relying on the gyroscope readings, the\naccelerometer measurements are used to update the attitude angles, and\nmagnetometer measurements are utilized to update the heading angle. In indoor\nenvironments, magnetometers suffer from interference that degrades their\nperformance resulting in poor heading angle estimation. Therefore, applications\nthat estimate the heading angle of moving objects, such as walking pedestrians,\nclosets, and refrigerators, are prone to error. To circumvent such situations,\nwe propose DoorINet, an end-to-end deep-learning framework to calculate the\nheading angle from door-mounted, low-cost inertial sensors without using\nmagnetometers. To evaluate our approach, we record a unique dataset containing\n391 minutes of accelerometer and gyroscope measurements and corresponding\nground-truth heading angle. 
We show that our proposed approach outperforms\ncommonly used, model based approaches and data-driven methods.\n","authors":["Aleksei Zakharchenko","Sharon Farber","Itzik Klein"],"pdf_url":"https://arxiv.org/pdf/2402.09427v2.pdf","comment":"10 pages, 14 figures, 4 tables"},{"id":"http://arxiv.org/abs/2411.17788v2","updated":"2024-12-01T08:00:56Z","published":"2024-11-26T15:29:38Z","title":"Geometric Point Attention Transformer for 3D Shape Reassembly","summary":" Shape assembly, which aims to reassemble separate parts into a complete\nobject, has gained significant interest in recent years. Existing methods\nprimarily rely on networks to predict the poses of individual parts, but often\nfail to effectively capture the geometric interactions between the parts and\ntheir poses. In this paper, we present the Geometric Point Attention\nTransformer (GPAT), a network specifically designed to address the challenges\nof reasoning about geometric relationships. In the geometric point attention\nmodule, we integrate both global shape information and local pairwise geometric\nfeatures, along with poses represented as rotation and translation vectors for\neach part. To enable iterative updates and dynamic reasoning, we introduce a\ngeometric recycling scheme, where each prediction is fed into the next\niteration for refinement. 
We evaluate our model on both the semantic and\ngeometric assembly tasks, showing that it outperforms previous methods in\nabsolute pose estimation, achieving accurate pose predictions and high\nalignment accuracy.\n","authors":["Jiahan Li","Chaoran Cheng","Jianzhu Ma","Ge Liu"],"pdf_url":"https://arxiv.org/pdf/2411.17788v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.18003v2","updated":"2024-12-01T07:20:36Z","published":"2024-11-27T02:47:17Z","title":"HAAT: Hybrid Attention Aggregation Transformer for Image\n Super-Resolution","summary":" In the research area of image super-resolution, Swin-transformer-based models\nare favored for their global spatial modeling and shifting window attention\nmechanism. However, existing methods often limit self-attention to non\noverlapping windows to cut costs and ignore the useful information that exists\nacross channels. To address this issue, this paper introduces a novel model,\nthe Hybrid Attention Aggregation Transformer (HAAT), designed to better\nleverage feature information. HAAT is constructed by integrating\nSwin-Dense-Residual-Connected Blocks (SDRCB) with Hybrid Grid Attention Blocks\n(HGAB). SDRCB expands the receptive field while maintaining a streamlined\narchitecture, resulting in enhanced performance. HGAB incorporates channel\nattention, sparse attention, and window attention to improve nonlocal feature\nfusion and achieve more visually compelling results. 
Experimental evaluations\ndemonstrate that HAAT surpasses state-of-the-art methods on benchmark datasets.\n Keywords: Image super-resolution, Computer vision, Attention mechanism,\nTransformer\n","authors":["Song-Jiang Lai","Tsun-Hin Cheung","Ka-Chun Fung","Kai-wen Xue","Kin-Man Lam"],"pdf_url":"https://arxiv.org/pdf/2411.18003v2.pdf","comment":"6 pages, 2 figures, 1 table"},{"id":"http://arxiv.org/abs/2406.19048v2","updated":"2024-12-01T07:07:02Z","published":"2024-06-27T09:56:38Z","title":"BiCo-Fusion: Bidirectional Complementary LiDAR-Camera Fusion for\n Semantic- and Spatial-Aware 3D Object Detection","summary":" 3D object detection is an important task that has been widely applied in\nautonomous driving. To perform this task, a new trend is to fuse multi-modal\ninputs, i.e., LiDAR and camera. Under such a trend, recent methods fuse these\ntwo modalities by unifying them in the same 3D space. However, during direct\nfusion in a unified space, the drawbacks of both modalities (LiDAR features\nstruggle with detailed semantic information and the camera lacks accurate 3D\nspatial information) are also preserved, diluting semantic and spatial\nawareness of the final unified representation. To address the issue, this\nletter proposes a novel bidirectional complementary LiDAR-camera fusion\nframework, called BiCo-Fusion that can achieve robust semantic- and\nspatial-aware 3D object detection. The key insight is to fuse LiDAR and camera\nfeatures in a bidirectional complementary way to enhance the semantic awareness\nof the LiDAR and the 3D spatial awareness of the camera. The enhanced features\nfrom both modalities are then adaptively fused to build a semantic- and\nspatial-aware unified representation. 
Specifically, we introduce Pre-Fusion\nconsisting of a Voxel Enhancement Module (VEM) to enhance the semantic\nawareness of voxel features from 2D camera features and Image Enhancement\nModule (IEM) to enhance the 3D spatial awareness of camera features from 3D\nvoxel features. We then introduce Unified Fusion (U-Fusion) to adaptively fuse\nthe enhanced features from the last stage to build a unified representation.\nExtensive experiments demonstrate the superiority of our BiCo-Fusion against\nthe prior arts. Project page: https://t-ys.github.io/BiCo-Fusion/.\n","authors":["Yang Song","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2406.19048v2.pdf","comment":"8 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.08958v2","updated":"2024-12-01T06:47:45Z","published":"2024-09-13T16:23:17Z","title":"PINNfluence: Influence Functions for Physics-Informed Neural Networks","summary":" Recently, physics-informed neural networks (PINNs) have emerged as a flexible\nand promising application of deep learning to partial differential equations in\nthe physical sciences. While offering strong performance and competitive\ninference speeds on forward and inverse problems, their black-box nature limits\ninterpretability, particularly regarding alignment with expected physical\nbehavior. In the present work, we explore the application of influence\nfunctions (IFs) to validate and debug PINNs post-hoc. Specifically, we apply\nvariations of IF-based indicators to gauge the influence of different types of\ncollocation points on the prediction of PINNs applied to a 2D Navier-Stokes\nfluid flow problem. Our results demonstrate how IFs can be adapted to PINNs to\nreveal the potential for further studies. The code is publicly available at\nhttps://github.com/aleks-krasowski/PINNfluence.\n","authors":["Jonas R. Naujoks","Aleksander Krasowski","Moritz Weckbecker","Thomas Wiegand","Sebastian Lapuschkin","Wojciech Samek","René P. 
Klausen"],"pdf_url":"https://arxiv.org/pdf/2409.08958v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07602v2","updated":"2024-12-01T06:39:41Z","published":"2024-11-12T07:24:41Z","title":"Circuit Complexity Bounds for RoPE-based Transformer Architecture","summary":" Characterizing the express power of the Transformer architecture is critical\nto understanding its capacity limits and scaling law. Recent works provide the\ncircuit complexity bounds to Transformer-like architecture. On the other hand,\nRotary Position Embedding ($\\mathsf{RoPE}$) has emerged as a crucial technique\nin modern large language models, offering superior performance in capturing\npositional information compared to traditional position embeddings, which shows\ngreat potential in application prospects, particularly for the long context\nscenario. Empirical evidence also suggests that $\\mathsf{RoPE}$-based\nTransformer architectures demonstrate greater generalization capabilities\ncompared to conventional Transformer models. In this work, we establish a\ncircuit complexity bound for Transformers with $\\mathsf{RoPE}$ attention. Our\nkey contribution is that we show that unless $\\mathsf{TC}^0 = \\mathsf{NC}^1$, a\n$\\mathsf{RoPE}$-based Transformer with $\\mathrm{poly}(n)$-precision, $O(1)$\nlayers, hidden dimension $d \\leq O(n)$ cannot solve the Arithmetic formula\nevaluation problem or the Boolean formula value problem. This result\nsignificantly demonstrates the fundamental limitation of the expressivity of\nthe $\\mathsf{RoPE}$-based Transformer architecture, although it achieves giant\nempirical success. 
Our theoretical result not only establishes the complexity\nbound but also may instruct further work on the $\\mathsf{RoPE}$-based\nTransformer.\n","authors":["Bo Chen","Xiaoyu Li","Yingyu Liang","Jiangxuan Long","Zhenmei Shi","Zhao Song"],"pdf_url":"https://arxiv.org/pdf/2411.07602v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.00765v2","updated":"2024-12-01T06:08:00Z","published":"2024-08-01T17:59:54Z","title":"MM-Vet v2: A Challenging Benchmark to Evaluate Large Multimodal Models\n for Integrated Capabilities","summary":" MM-Vet, with open-ended vision-language questions targeting at evaluating\nintegrated capabilities, has become one of the most popular benchmarks for\nlarge multimodal model evaluation. MM-Vet assesses six core vision-language\n(VL) capabilities: recognition, knowledge, spatial awareness, language\ngeneration, OCR, and math. However, its question format is restricted to single\nimage-text pairs, lacking the interleaved image and text sequences prevalent in\nreal-world scenarios. To address this limitation, we introduce MM-Vet v2, which\nincludes a new VL capability called \"image-text sequence understanding\",\nevaluating models' ability to process VL sequences. Furthermore, we maintain\nthe high quality of evaluation samples while further expanding the evaluation\nset size. Using MM-Vet v2 to benchmark large multimodal models, we found that\nClaude 3.5 Sonnet is the best model with a score of 71.8, slightly\noutperforming GPT-4o which scored 71.0. Among open-weight models,\nInternVL2-Llama3-76B leads with a score of 68.4. 
The code, data, and\nleaderboard are accessible at https://github.com/yuweihao/MM-Vet.\n","authors":["Weihao Yu","Zhengyuan Yang","Lingfeng Ren","Linjie Li","Jianfeng Wang","Kevin Lin","Chung-Ching Lin","Zicheng Liu","Lijuan Wang","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2408.00765v2.pdf","comment":"Code, data and leaderboard: https://github.com/yuweihao/MM-Vet"},{"id":"http://arxiv.org/abs/2308.02490v4","updated":"2024-12-01T05:46:03Z","published":"2023-08-04T17:59:47Z","title":"MM-Vet: Evaluating Large Multimodal Models for Integrated Capabilities","summary":" We propose MM-Vet, an evaluation benchmark that examines large multimodal\nmodels (LMMs) on complicated multimodal tasks. Recent LMMs have shown various\nintriguing abilities, such as solving math problems written on the blackboard,\nreasoning about events and celebrities in news images, and explaining visual\njokes. Rapid model advancements pose challenges to evaluation benchmark\ndevelopment. Problems include: (1) How to systematically structure and evaluate\nthe complicated multimodal tasks; (2) How to design evaluation metrics that\nwork well across question and answer types; and (3) How to give model insights\nbeyond a simple performance ranking. To this end, we present MM-Vet, designed\nbased on the insight that the intriguing ability to solve complicated tasks is\noften achieved by a generalist model being able to integrate different core\nvision-language (VL) capabilities. MM-Vet defines 6 core VL capabilities and\nexamines the 16 integrations of interest derived from the capability\ncombination. For evaluation metrics, we propose an LLM-based evaluator for\nopen-ended outputs. The evaluator enables the evaluation across different\nquestion types and answer styles, resulting in a unified scoring metric. 
We\nevaluate representative LMMs on MM-Vet, providing insights into the\ncapabilities of different LMM system paradigms and models.\n","authors":["Weihao Yu","Zhengyuan Yang","Linjie Li","Jianfeng Wang","Kevin Lin","Zicheng Liu","Xinchao Wang","Lijuan Wang"],"pdf_url":"https://arxiv.org/pdf/2308.02490v4.pdf","comment":"ICML 2024. Code, data and leaderboard:\n https://github.com/yuweihao/MM-Vet"},{"id":"http://arxiv.org/abs/2405.18560v2","updated":"2024-12-01T05:22:22Z","published":"2024-05-28T20:10:06Z","title":"Potential Field Based Deep Metric Learning","summary":" Deep metric learning (DML) involves training a network to learn a\nsemantically meaningful representation space. Many current approaches mine\nn-tuples of examples and model interactions within each tuplets. We present a\nnovel, compositional DML model, inspired by electrostatic fields in physics\nthat, instead of in tuples, represents the influence of each example\n(embedding) by a continuous potential field, and superposes the fields to\nobtain their combined global potential field. We use attractive/repulsive\npotential fields to represent interactions among embeddings from images of the\nsame/different classes. Contrary to typical learning methods, where mutual\ninfluence of samples is proportional to their distance, we enforce reduction in\nsuch influence with distance, leading to a decaying field. We show that such\ndecay helps improve performance on real world datasets with large intra-class\nvariations and label noise. Like other proxy-based methods, we also use proxies\nto succinctly represent sub-populations of examples. 
We evaluate our method on\nthree standard DML benchmarks- Cars-196, CUB-200-2011, and SOP datasets where\nit outperforms state-of-the-art baselines.\n","authors":["Shubhang Bhatnagar","Narendra Ahuja"],"pdf_url":"https://arxiv.org/pdf/2405.18560v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07712v2","updated":"2024-12-01T04:34:53Z","published":"2024-08-13T23:08:06Z","title":"Introduction to Reinforcement Learning","summary":" Reinforcement Learning (RL), a subfield of Artificial Intelligence (AI),\nfocuses on training agents to make decisions by interacting with their\nenvironment to maximize cumulative rewards. This paper provides an overview of\nRL, covering its core concepts, methodologies, and resources for further\nlearning. It offers a thorough explanation of fundamental components such as\nstates, actions, policies, and reward signals, ensuring readers develop a solid\nfoundational understanding. Additionally, the paper presents a variety of RL\nalgorithms, categorized based on the key factors such as model-free,\nmodel-based, value-based, policy-based, and other key factors. Resources for\nlearning and implementing RL, such as books, courses, and online communities\nare also provided. 
By offering a clear, structured introduction, this paper\naims to simplify the complexities of RL for beginners, providing a\nstraightforward pathway to understanding and applying real-time techniques.\n","authors":["Majid Ghasemi","Dariush Ebrahimi"],"pdf_url":"https://arxiv.org/pdf/2408.07712v2.pdf","comment":"19 pages"},{"id":"http://arxiv.org/abs/2401.17133v2","updated":"2024-12-01T04:06:27Z","published":"2024-01-30T16:07:44Z","title":"SongBsAb: A Dual Prevention Approach against Singing Voice Conversion\n based Illegal Song Covers","summary":" Singing voice conversion (SVC) automates song covers by converting a source\nsinging voice from a source singer into a new singing voice with the same\nlyrics and melody as the source, but sounds like being covered by the target\nsinger of some given target singing voices. However, it raises serious concerns\nabout copyright and civil right infringements. We propose SongBsAb, the first\nproactive approach to tackle SVC-based illegal song covers. SongBsAb adds\nperturbations to singing voices before releasing them, so that when they are\nused, the process of SVC will be interfered, leading to unexpected singing\nvoices. Perturbations are carefully crafted to (1) provide a dual prevention,\ni.e., preventing the singing voice from being used as the source and target\nsinging voice in SVC, by proposing a gender-transformation loss and a high/low\nhierarchy multi-target loss, respectively; and (2) be harmless, i.e., no\nside-effect on the enjoyment of protected songs, by refining a psychoacoustic\nmodel-based loss with the backing track as an additional masker, a unique\naccompanying element for singing voices compared to ordinary speech voices. We\nalso adopt a frame-level interaction reduction-based loss and encoder ensemble\nto enhance the transferability of SongBsAb to unknown SVC models. 
We\ndemonstrate the prevention effectiveness, harmlessness, and robustness of\nSongBsAb on five diverse and promising SVC models, using both English and\nChinese datasets, and both objective and human study-based subjective metrics.\nOur work fosters an emerging research direction for mitigating illegal\nautomated song covers.\n","authors":["Guangke Chen","Yedi Zhang","Fu Song","Ting Wang","Xiaoning Du","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2401.17133v2.pdf","comment":"In Proceedings of the 32nd Network and Distributed System Security\n (NDSS) Symposium 2025"},{"id":"http://arxiv.org/abs/2406.11920v3","updated":"2024-12-01T03:49:57Z","published":"2024-06-17T07:22:51Z","title":"Job-SDF: A Multi-Granularity Dataset for Job Skill Demand Forecasting\n and Benchmarking","summary":" In a rapidly evolving job market, skill demand forecasting is crucial as it\nenables policymakers and businesses to anticipate and adapt to changes,\nensuring that workforce skills align with market needs, thereby enhancing\nproductivity and competitiveness. Additionally, by identifying emerging skill\nrequirements, it directs individuals towards relevant training and education\nopportunities, promoting continuous self-learning and development. However, the\nabsence of comprehensive datasets presents a significant challenge, impeding\nresearch and the advancement of this field. To bridge this gap, we present\nJob-SDF, a dataset designed to train and benchmark job-skill demand forecasting\nmodels. Based on 10.35 million public job advertisements collected from major\nonline recruitment platforms in China between 2021 and 2023, this dataset\nencompasses monthly recruitment demand for 2,324 types of skills across 521\ncompanies. Our dataset uniquely enables evaluating skill demand forecasting\nmodels at various granularities, including occupation, company, and regional\nlevels. 
We benchmark a range of models on this dataset, evaluating their\nperformance in standard scenarios, in predictions focused on lower value\nranges, and in the presence of structural breaks, providing new insights for\nfurther research. Our code and dataset are publicly accessible via the\nhttps://github.com/Job-SDF/benchmark.\n","authors":["Xi Chen","Chuan Qin","Chuyu Fang","Chao Wang","Chen Zhu","Fuzhen Zhuang","Hengshu Zhu","Hui Xiong"],"pdf_url":"https://arxiv.org/pdf/2406.11920v3.pdf","comment":"NeurIPS 2024 Accepted"},{"id":"http://arxiv.org/abs/2404.08857v2","updated":"2024-12-01T03:49:30Z","published":"2024-04-13T00:07:40Z","title":"Voice Attribute Editing with Text Prompt","summary":" Despite recent advancements in speech generation with text prompt providing\ncontrol over speech style, voice attributes in synthesized speech remain\nelusive and challenging to control. This paper introduces a novel task: voice\nattribute editing with text prompt, with the goal of making relative\nmodifications to voice attributes according to the actions described in the\ntext prompt. To solve this task, VoxEditor, an end-to-end generative model, is\nproposed. In VoxEditor, addressing the insufficiency of text prompt, a Residual\nMemory (ResMem) block is designed, that efficiently maps voice attributes and\nthese descriptors into the shared feature space. Additionally, the ResMem block\nis enhanced with a voice attribute degree prediction (VADP) block to align\nvoice attributes with corresponding descriptors, addressing the imprecision of\ntext prompt caused by non-quantitative descriptions of voice attributes. We\nalso establish the open-source VCTK-RVA dataset, which leads the way in manual\nannotations detailing voice characteristic differences among different\nspeakers. Extensive experiments demonstrate the effectiveness and\ngeneralizability of our proposed method in terms of both objective and\nsubjective metrics. 
The dataset and audio samples are available on the website.\n","authors":["Zhengyan Sheng","Yang Ai","Li-Juan Liu","Jia Pan","Zhen-Hua Ling"],"pdf_url":"https://arxiv.org/pdf/2404.08857v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.15876v2","updated":"2024-12-01T02:38:17Z","published":"2024-10-21T10:57:45Z","title":"FlickerFusion: Intra-trajectory Domain Generalizing Multi-Agent RL","summary":" Multi-agent reinforcement learning has demonstrated significant potential in\naddressing complex cooperative tasks across various real-world applications.\nHowever, existing MARL approaches often rely on the restrictive assumption that\nthe number of entities (e.g., agents, obstacles) remains constant between\ntraining and inference. This overlooks scenarios where entities are dynamically\nremoved or added during the inference trajectory -- a common occurrence in\nreal-world environments like search and rescue missions and dynamic combat\nsituations. In this paper, we tackle the challenge of intra-trajectory dynamic\nentity composition under zero-shot out-of-domain (OOD) generalization, where\nsuch dynamic changes cannot be anticipated beforehand. Our empirical studies\nreveal that existing MARL methods suffer significant performance degradation\nand increased uncertainty in these scenarios. In response, we propose\nFlickerFusion, a novel OOD generalization method that acts as a universally\napplicable augmentation technique for MARL backbone methods. FlickerFusion\nstochastically drops out parts of the observation space, emulating being\nin-domain when inferenced OOD. The results show that FlickerFusion not only\nachieves superior inference rewards but also uniquely reduces uncertainty\nvis-\\`a-vis the backbone, compared to existing methods. 
Benchmarks,\nimplementations, and model weights are organized and open-sourced at\nflickerfusion305.github.io, accompanied by ample demo video renderings.\n","authors":["Woosung Koh","Wonbeen Oh","Siyeol Kim","Suhin Shin","Hyeongjin Kim","Jaein Jang","Junghyun Lee","Se-Young Yun"],"pdf_url":"https://arxiv.org/pdf/2410.15876v2.pdf","comment":"NeurIPS '24 Open-World Agents Workshop (v2: minor revision)"},{"id":"http://arxiv.org/abs/2410.14803v4","updated":"2024-12-01T02:09:21Z","published":"2024-10-18T18:19:56Z","title":"DistRL: An Asynchronous Distributed Reinforcement Learning Framework for\n On-Device Control Agents","summary":" On-device control agents, especially on mobile devices, are responsible for\noperating mobile devices to fulfill users' requests, enabling seamless and\nintuitive interactions. Integrating Multimodal Large Language Models (MLLMs)\ninto these agents enhances their ability to understand and execute complex\ncommands, thereby improving user experience. However, fine-tuning MLLMs for\non-device control presents significant challenges due to limited data\navailability and inefficient online training processes. This paper introduces\nDistRL, a novel framework designed to enhance the efficiency of online RL\nfine-tuning for mobile device control agents. DistRL employs centralized\ntraining and decentralized data acquisition to ensure efficient fine-tuning in\nthe context of dynamic online interactions. Additionally, the framework is\nbacked by our tailor-made RL algorithm, which effectively balances exploration\nwith the prioritized utilization of collected data to ensure stable and robust\ntraining. Our experiments show that, on average, DistRL delivers a 3X\nimprovement in training efficiency and enables training data collection 2.4X\nfaster than the leading synchronous multi-machine methods. 
Notably, after\ntraining, DistRL achieves a 20% relative improvement in success rate compared\nto state-of-the-art methods on general Android tasks from an open benchmark,\nsignificantly outperforming existing approaches while maintaining the same\ntraining time. These results validate DistRL as a scalable and efficient\nsolution, offering substantial improvements in both training efficiency and\nagent performance for real-world, in-the-wild device control tasks.\n","authors":["Taiyi Wang","Zhihao Wu","Jianheng Liu","Jianye Hao","Jun Wang","Kun Shao"],"pdf_url":"https://arxiv.org/pdf/2410.14803v4.pdf","comment":"Paper and Appendix, 26 pages"},{"id":"http://arxiv.org/abs/2309.17249v3","updated":"2024-12-01T01:36:50Z","published":"2023-09-29T13:55:45Z","title":"Batch Calibration: Rethinking Calibration for In-Context Learning and\n Prompt Engineering","summary":" Prompting and in-context learning (ICL) have become efficient learning\nparadigms for large language models (LLMs). However, LLMs suffer from prompt\nbrittleness and various bias factors in the prompt, including but not limited\nto the formatting, the choice verbalizers, and the ICL examples. To address\nthis problem that results in unexpected performance degradation, calibration\nmethods have been developed to mitigate the effects of these biases while\nrecovering LLM performance. In this work, we first conduct a systematic\nanalysis of the existing calibration methods, where we both provide a unified\nview and reveal the failure cases. Inspired by these analyses, we propose Batch\nCalibration (BC), a simple yet intuitive method that controls the contextual\nbias from the batched input, unifies various prior approaches, and effectively\naddresses the aforementioned issues. BC is zero-shot, inference-only, and\nincurs negligible additional costs. In the few-shot setup, we further extend BC\nto allow it to learn the contextual bias from labeled data. 
We validate the\neffectiveness of BC with PaLM 2-(S, M, L) and CLIP models and demonstrate\nstate-of-the-art performance over previous calibration baselines across more\nthan 10 natural language understanding and image classification tasks.\n","authors":["Han Zhou","Xingchen Wan","Lev Proleev","Diana Mincu","Jilin Chen","Katherine Heller","Subhrajit Roy"],"pdf_url":"https://arxiv.org/pdf/2309.17249v3.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2411.14592v2","updated":"2024-12-01T01:21:24Z","published":"2024-11-21T21:22:58Z","title":"G-RAG: Knowledge Expansion in Material Science","summary":" In the field of Material Science, effective information retrieval systems are\nessential for facilitating research. Traditional Retrieval-Augmented Generation\n(RAG) approaches in Large Language Models (LLMs) often encounter challenges\nsuch as outdated information, hallucinations, limited interpretability due to\ncontext constraints, and inaccurate retrieval. To address these issues, Graph\nRAG integrates graph databases to enhance the retrieval process. Our proposed\nmethod processes Material Science documents by extracting key entities\n(referred to as MatIDs) from sentences, which are then utilized to query\nexternal Wikipedia knowledge bases (KBs) for additional relevant information.\nWe implement an agent-based parsing technique to achieve a more detailed\nrepresentation of the documents. Our improved version of Graph RAG called G-RAG\nfurther leverages a graph database to capture relationships between these\nentities, improving both retrieval accuracy and contextual understanding. 
This\nenhanced approach demonstrates significant improvements in performance for\ndomains that require precise information retrieval, such as Material Science.\n","authors":["Radeen Mostafa","Mirza Nihal Baig","Mashaekh Tausif Ehsan","Jakir Hasan"],"pdf_url":"https://arxiv.org/pdf/2411.14592v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.11848v2","updated":"2024-12-01T00:49:32Z","published":"2024-05-20T07:47:06Z","title":"Alternators For Sequence Modeling","summary":" This paper introduces alternators, a novel family of non-Markovian dynamical\nmodels for sequences. An alternator features two neural networks: the\nobservation trajectory network (OTN) and the feature trajectory network (FTN).\nThe OTN and the FTN work in conjunction, alternating between outputting samples\nin the observation space and some feature space, respectively, over a cycle.\nThe parameters of the OTN and the FTN are not time-dependent and are learned\nvia a minimum cross-entropy criterion over the trajectories. Alternators are\nversatile. They can be used as dynamical latent-variable generative models or\nas sequence-to-sequence predictors. Alternators can uncover the latent dynamics\nunderlying complex sequential data, accurately forecast and impute missing\ndata, and sample new trajectories. We showcase the capabilities of alternators\nin three applications. We first used alternators to model the Lorenz equations,\noften used to describe chaotic behavior. We then applied alternators to\nNeuroscience, to map brain activity to physical activity. Finally, we applied\nalternators to Climate Science, focusing on sea-surface temperature\nforecasting. 
In all our experiments, we found alternators are stable to train,\nfast to sample from, yield high-quality generated samples and latent variables,\nand often outperform strong baselines such as Mambas, neural ODEs, and\ndiffusion models in the domains we studied.\n","authors":["Mohammad Reza Rezaei","Adji Bousso Dieng"],"pdf_url":"https://arxiv.org/pdf/2405.11848v2.pdf","comment":"A new versatile family of sequence models that can be used for both\n generative modeling and supervised learning. The codebase will be made\n available upon publication. This paper is dedicated to Thomas Sankara"},{"id":"http://arxiv.org/abs/2309.03468v2","updated":"2024-12-01T00:09:44Z","published":"2023-09-07T03:33:49Z","title":"Support-Set Context Matters for Bongard Problems","summary":" Current machine learning methods struggle to solve Bongard problems, which\nare a type of IQ test that requires deriving an abstract \"concept\" from a set\nof positive and negative \"support\" images, and then classifying whether or not\na new query image depicts the key concept. On Bongard-HOI, a benchmark for\nnatural-image Bongard problems, most existing methods have reached at best 69%\naccuracy (where chance is 50%). Low accuracy is often attributed to neural\nnets' lack of ability to find human-like symbolic rules. In this work, we point\nout that many existing methods are forfeiting accuracy due to a much simpler\nproblem: they do not adapt image features given information contained in the\nsupport set as a whole, and rely instead on information extracted from\nindividual supports. This is a critical issue, because the \"key concept\" in a\ntypical Bongard problem can often only be distinguished using multiple\npositives and multiple negatives. 
We explore simple methods to incorporate this\ncontext and show substantial gains over prior works, leading to new\nstate-of-the-art accuracy on Bongard-LOGO (75.3%) and Bongard-HOI (76.4%)\ncompared to methods with equivalent vision backbone architectures and strong\nperformance on the original Bongard problem set (60.8%).\n","authors":["Nikhil Raghuraman","Adam W. Harley","Leonidas Guibas"],"pdf_url":"https://arxiv.org/pdf/2309.03468v2.pdf","comment":"TMLR October 2024. Code:\n https://github.com/nraghuraman/bongard-context"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2407.12269v2","updated":"2024-12-01T23:07:27Z","published":"2024-07-17T02:35:24Z","title":"UTG: Towards a Unified View of Snapshot and Event Based Models for\n Temporal Graphs","summary":" Many real world graphs are inherently dynamic, constantly evolving with node\nand edge additions. These graphs can be represented by temporal graphs, either\nthrough a stream of edge events or a sequence of graph snapshots. Until now,\nthe development of machine learning methods for both types has occurred largely\nin isolation, resulting in limited experimental comparison and theoretical\ncrosspollination between the two. In this paper, we introduce Unified Temporal\nGraph (UTG), a framework that unifies snapshot-based and event-based machine\nlearning models under a single umbrella, enabling models developed for one\nrepresentation to be applied effectively to datasets of the other. We also\npropose a novel UTG training procedure to boost the performance of\nsnapshot-based models in the streaming setting. We comprehensively evaluate\nboth snapshot and event-based models across both types of temporal graphs on\nthe temporal link prediction task. 
Our main findings are threefold: first, when\ncombined with UTG training, snapshot-based models can perform competitively\nwith event-based models such as TGN and GraphMixer even on event datasets.\nSecond, snapshot-based models are at least an order of magnitude faster than\nmost event-based models during inference. Third, while event-based methods such\nas NAT and DyGFormer outperforms snapshot-based methods on both types of\ntemporal graphs, this is because they leverage joint neighborhood structural\nfeatures thus emphasizing the potential to incorporate these features into\nsnapshotbased models as well. These findings highlight the importance of\ncomparing model architectures independent of the data format and suggest the\npotential of combining the efficiency of snapshot-based models with the\nperformance of event-based models in the future.\n","authors":["Shenyang Huang","Farimah Poursafaei","Reihaneh Rabbany","Guillaume Rabusseau","Emanuele Rossi"],"pdf_url":"https://arxiv.org/pdf/2407.12269v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.17367v2","updated":"2024-12-01T22:48:24Z","published":"2024-11-26T12:20:18Z","title":"Efficient Deployment of Transformer Models in Analog In-Memory Computing\n Hardware","summary":" Analog in-memory computing (AIMC) has emerged as a promising solution to\novercome the von Neumann bottleneck, accelerating neural network computations\nand improving computational efficiency. While AIMC has demonstrated success\nwith architectures such as CNNs, MLPs, and RNNs, deploying transformer-based\nmodels using AIMC presents unique challenges. Transformers are expected to\nhandle diverse downstream tasks and adapt to new user data or instructions\nafter deployment, which requires more flexible approaches to suit AIMC\nconstraints.\n In this paper, we propose a novel method for deploying pre-trained\ntransformer models onto AIMC hardware. 
Unlike traditional approaches requiring\nhardware-aware training, our technique allows direct deployment without the\nneed for retraining the original model. Instead, we utilize lightweight,\nlow-rank adapters -- compact modules stored in digital cores -- to adapt the\nmodel to hardware constraints. We validate our approach on MobileBERT,\ndemonstrating accuracy on par with, or even exceeding, a traditional\nhardware-aware training approach. Our method is particularly appealing in\nmulti-task scenarios, as it enables a single analog model to be reused across\nmultiple tasks. Moreover, it supports on-chip adaptation to new hardware\nconstraints and tasks without updating analog weights, providing a flexible and\nversatile solution for real-world AI applications. Code is available.\n","authors":["Chen Li","Corey Lammie","Manuel Le Gallo","Bipin Rajendran"],"pdf_url":"https://arxiv.org/pdf/2411.17367v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.13166v2","updated":"2024-12-01T22:34:05Z","published":"2024-06-19T02:45:32Z","title":"Enhancing supply chain security with automated machine learning","summary":" The increasing scale and complexity of global supply chains have led to new\nchallenges spanning various fields, such as supply chain disruptions due to\nlong waiting lines at the ports, material shortages, and inflation. Coupled\nwith the size of supply chains and the availability of vast amounts of data,\nefforts towards tackling such challenges have led to an increasing interest in\napplying machine learning methods in many aspects of supply chains. Unlike\nother solutions, ML techniques, including Random Forest, XGBoost, LightGBM, and\nNeural Networks, make predictions and approximate optimal solutions faster.\nThis paper presents an automated ML framework to enhance supply chain security\nby detecting fraudulent activities, predicting maintenance needs, and\nforecasting material backorders. 
Using datasets of varying sizes, results show\nthat fraud detection achieves an 88% accuracy rate using sampling methods,\nmachine failure prediction reaches 93.4% accuracy, and material backorder\nprediction achieves 89.3% accuracy. Hyperparameter tuning significantly\nimproved the performance of these models, with certain supervised techniques\nlike XGBoost and LightGBM reaching up to 100% precision. This research\ncontributes to supply chain security by streamlining data preprocessing,\nfeature selection, model optimization, and inference deployment, addressing\ncritical challenges and boosting operational efficiency.\n","authors":["Haibo Wang","Lutfu S. Sua","Bahram Alidaee"],"pdf_url":"https://arxiv.org/pdf/2406.13166v2.pdf","comment":"26 pages"},{"id":"http://arxiv.org/abs/2311.09614v3","updated":"2024-12-01T22:01:58Z","published":"2023-11-16T06:58:46Z","title":"Comprehensive framework for evaluation of deep neural networks in\n detection and quantification of lymphoma from PET/CT images: clinical\n insights, pitfalls, and observer agreement analyses","summary":" This study addresses critical gaps in automated lymphoma segmentation from\nPET/CT images, focusing on issues often overlooked in existing literature.\nWhile deep learning has been applied for lymphoma lesion segmentation, few\nstudies incorporate out-of-distribution testing, raising concerns about model\ngeneralizability across diverse imaging conditions and patient populations. We\nhighlight the need to compare model performance with expert human annotators,\nincluding intra- and inter-observer variability, to understand task difficulty\nbetter. Most approaches focus on overall segmentation accuracy but overlook\nlesion-specific metrics important for precise lesion detection and disease\nquantification.To address these gaps, we propose a clinically-relevant\nframework for evaluating deep neural networks. 
Using this lesion-specific\nevaluation, we assess the performance of four deep segmentation networks\n(ResUNet, SegResNet, DynUNet, and SwinUNETR) across 611 cases from\nmulti-institutional datasets, covering various lymphoma subtypes and lesion\ncharacteristics. Beyond standard metrics like the Dice similarity coefficient\n(DSC), we evaluate clinical lesion measures and their prediction errors. We\nalso introduce detection criteria for lesion localization and propose a new\ndetection Criterion 3 based on metabolic characteristics. We show that networks\nperform better on large, intense lesions with higher metabolic\nactivity.Finally, we compare network performance to expert human observers via\nintra- and inter-observer variability analyses, demonstrating that network\nerrors closely resemble those made by experts. Some small, faint lesions remain\nchallenging for both humans and networks. This study aims to improve automated\nlesion segmentation's clinical relevance, supporting better treatment decisions\nfor lymphoma patients. The code is available at:\nhttps://github.com/microsoft/lymphoma-segmentation-dnn\n","authors":["Shadab Ahamed","Yixi Xu","Sara Kurkowska","Claire Gowdy","Joo H. O","Ingrid Bloise","Don Wilson","Patrick Martineau","François Bénard","Fereshteh Yousefirizi","Rahul Dodhia","Juan M. Lavista","William B. Weeks","Carlos F. 
Uribe","Arman Rahmim"],"pdf_url":"https://arxiv.org/pdf/2311.09614v3.pdf","comment":"32 pages, 15 figures, 5 tables"},{"id":"http://arxiv.org/abs/2308.10792v8","updated":"2024-12-01T22:01:51Z","published":"2023-08-21T15:35:16Z","title":"Instruction Tuning for Large Language Models: A Survey","summary":" This paper surveys research works in the quickly advancing field of\ninstruction tuning (IT), which can also be referred to as supervised\nfine-tuning (SFT)\\footnote{In this paper, unless specified otherwise,\nsupervised fine-tuning (SFT) and instruction tuning (IT) are used\ninterchangeably.}, a crucial technique to enhance the capabilities and\ncontrollability of large language models (LLMs). Instruction tuning refers to\nthe process of further training LLMs on a dataset consisting of\n\\textsc{(instruction, output)} pairs in a supervised fashion, which bridges the\ngap between the next-word prediction objective of LLMs and the users' objective\nof having LLMs adhere to human instructions. In this work, we make a systematic\nreview of the literature, including the general methodology of SFT, the\nconstruction of SFT datasets, the training of SFT models, and applications to\ndifferent modalities, domains and application, along with analysis on aspects\nthat influence the outcome of SFT (e.g., generation of instruction outputs,\nsize of the instruction dataset, etc). We also review the potential pitfalls of\nSFT along with criticism against it, along with efforts pointing out current\ndeficiencies of existing strategies and suggest some avenues for fruitful\nresearch. Project Page: github.com/xiaoya-li/Instruction-Tuning-Survey\n","authors":["Shengyu Zhang","Linfeng Dong","Xiaoya Li","Sen Zhang","Xiaofei Sun","Shuhe Wang","Jiwei Li","Runyi Hu","Tianwei Zhang","Fei Wu","Guoyin Wang"],"pdf_url":"https://arxiv.org/pdf/2308.10792v8.pdf","comment":"V5; Last update: Dec. 
1, 2024"},{"id":"http://arxiv.org/abs/2405.15722v3","updated":"2024-12-01T21:18:32Z","published":"2024-05-24T17:10:08Z","title":"Models That Prove Their Own Correctness","summary":" How can we trust the correctness of a learned model on a particular input of\ninterest? Model accuracy is typically measured *on average* over a distribution\nof inputs, giving no guarantee for any fixed input. This paper proposes a\ntheoretically-founded solution to this problem: to train *Self-Proving models*\nthat prove the correctness of their output to a verification algorithm $V$ via\nan Interactive Proof. Self-Proving models satisfy that, with high probability\nover a random input, the model generates a correct output *and* successfully\nproves its correctness to $V\\!$. The *soundness* property of $V$ guarantees\nthat, for *every* input, no model can convince $V$ of the correctness of an\nincorrect output. Thus, a Self-Proving model proves correctness of most of its\noutputs, while *all* incorrect outputs (of any model) are detected by $V$. We\ndevise a generic method for learning Self-Proving models, and we prove\nconvergence bounds under certain assumptions. The theoretical framework and\nresults are complemented by experiments on an arithmetic capability: computing\nthe greatest common divisor (GCD) of two integers. Our learning method is used\nto train a Self-Proving transformer that computes the GCD *and* proves the\ncorrectness of its answer.\n","authors":["Noga Amit","Shafi Goldwasser","Orr Paradise","Guy Rothblum"],"pdf_url":"https://arxiv.org/pdf/2405.15722v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.08562v3","updated":"2024-12-01T20:58:49Z","published":"2024-10-11T06:35:48Z","title":"Adaptive Constraint Integration for Simultaneously Optimizing Crystal\n Structures with Multiple Targeted Properties","summary":" In materials science, finding crystal structures that have targeted\nproperties is crucial. 
While recent methodologies such as Bayesian optimization\nand deep generative models have made some advances on this issue, these methods\noften face difficulties in adaptively incorporating various constraints, such\nas electrical neutrality and targeted properties optimization, while keeping\nthe desired specific crystal structure. To address these challenges, we have\ndeveloped the Simultaneous Multi-property Optimization using Adaptive Crystal\nSynthesizer (SMOACS), which utilizes state-of-the-art property prediction\nmodels and their gradients to directly optimize input crystal structures for\ntargeted properties simultaneously. SMOACS enables the integration of adaptive\nconstraints into the optimization process without necessitating model\nretraining. Thanks to this feature, SMOACS has succeeded in simultaneously\noptimizing targeted properties while maintaining perovskite structures, even\nwith models trained on diverse crystal types. We have demonstrated the band gap\noptimization while meeting a challenging constraint, that is, maintaining\nelectrical neutrality in large atomic configurations up to 135 atom sites,\nwhere the verification of the electrical neutrality is challenging. The\nproperties of the most promising materials have been confirmed by density\nfunctional theory calculations.\n","authors":["Akihiro Fujii","Yoshitaka Ushiku","Koji Shimizu","Anh Khoa Augustin Lu","Satoshi Watanabe"],"pdf_url":"https://arxiv.org/pdf/2410.08562v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.04393v2","updated":"2024-12-01T20:48:11Z","published":"2024-04-05T20:36:30Z","title":"Counting Like Transformers: Compiling Temporal Counting Logic Into\n Softmax Transformers","summary":" Deriving formal bounds on the expressivity of transformers, as well as\nstudying transformers that are constructed to implement known algorithms, are\nboth effective methods for better understanding the computational power of\ntransformers. 
Towards both ends, we introduce the temporal counting logic\n$\\textsf{K}_\\text{t}$[#] alongside the RASP variant $\\textsf{C-RASP}$. We show\nthey are equivalent to each other, and that together they are the best-known\nlower bound on the formal expressivity of future-masked soft attention\ntransformers with unbounded input size. We prove this by showing all\n$\\textsf{K}_\\text{t}$[#] formulas can be compiled into these transformers.\n","authors":["Andy Yang","David Chiang"],"pdf_url":"https://arxiv.org/pdf/2404.04393v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.10883v2","updated":"2024-12-01T20:17:28Z","published":"2023-02-21T18:58:32Z","title":"Combining Blockchain and Biometrics: A Survey on Technical Aspects and a\n First Legal Analysis","summary":" Biometric recognition as a unique, hard-to-forge, and efficient way of\nidentification and verification has become an indispensable part of the current\ndigital world. The fast evolution of this technology has been a strong\nincentive for integrating it into many applications. Meanwhile, blockchain, the\nvery attractive decentralized ledger technology, has been widely received both\nby the research and industry in the past years and it is being increasingly\ndeployed nowadays in many different applications, such as money transfer, IoT,\nhealthcare, or logistics. Recently, researchers have started to speculate what\nwould be the pros and cons and what would be the best applications when these\ntwo technologies cross paths. This paper provides a survey of technical\nliterature research on the combination of blockchain and biometrics and\nincludes a first legal analysis of this integration to shed light on challenges\nand potentials. While this combination is still in its infancy and a growing\nbody of literature discusses specific blockchain applications and solutions in\nan advanced technological set-up, this paper presents a holistic understanding\nof blockchains applicability in the biometric sector. 
This study demonstrates\nthat combining blockchain and biometrics would be beneficial for novel\napplications in biometrics such as the PKI mechanism, distributed trusted\nservice, and identity management. However, blockchain networks at their current\nstage are not efficient and economical for real-time applications. From a legal\npoint of view, the allocation of accountability remains a main issue, while\nother difficulties remain, such as conducting a proper Data Protection Impact\nAssessment. Finally, it supplies technical and legal recommendations to reap\nthe benefits and mitigate the risks of the combination.\n","authors":["Mahdi Ghafourian","Bilgesu Sumer","Ruben Vera-Rodriguez","Julian Fierrez","Ruben Tolosana","Aythami Moralez","Els Kindt"],"pdf_url":"https://arxiv.org/pdf/2302.10883v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.04346v2","updated":"2024-12-01T20:10:57Z","published":"2023-11-07T21:06:06Z","title":"SaFL: Sybil-aware Federated Learning with Application to Face\n Recognition","summary":" Federated Learning (FL) is a machine learning paradigm to conduct\ncollaborative learning among clients on a joint model. The primary goal is to\nshare clients' local training parameters with an integrating server while\npreserving their privacy. This method permits to exploit the potential of\nmassive mobile users' data for the benefit of machine learning models'\nperformance while keeping sensitive data on local devices. On the downside, FL\nraises security and privacy concerns that have just started to be studied. To\naddress some of the key threats in FL, researchers have proposed to use secure\naggregation methods (e.g. homomorphic encryption, secure multiparty\ncomputation, etc.). These solutions improve some security and privacy metrics,\nbut at the same time bring about other serious threats such as poisoning\nattacks, backdoor attacks, and free running attacks. 
This paper proposes a new\ndefense method against poisoning attacks in FL called SaFL (Sybil-aware\nFederated Learning) that minimizes the effect of sybils with a novel\ntime-variant aggregation scheme.\n","authors":["Mahdi Ghafourian","Julian Fierrez","Ruben Vera-Rodriguez","Ruben Tolosana","Aythami Morales"],"pdf_url":"https://arxiv.org/pdf/2311.04346v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.03106v3","updated":"2024-12-01T19:15:57Z","published":"2023-03-03T10:53:30Z","title":"Rotation Invariant Quantization for Model Compression","summary":" Post-training Neural Network (NN) model compression is an attractive approach\nfor deploying large, memory-consuming models on devices with limited memory\nresources. In this study, we investigate the rate-distortion tradeoff for NN\nmodel compression. First, we suggest a Rotation-Invariant Quantization (RIQ)\ntechnique that utilizes a single parameter to quantize the entire NN model,\nyielding a different rate at each layer, i.e., mixed-precision quantization.\nThen, we prove that our rotation-invariant approach is optimal in terms of\ncompression. We rigorously evaluate RIQ and demonstrate its capabilities on\nvarious models and tasks. For example, RIQ facilitates $\\times 19.4$ and\n$\\times 52.9$ compression ratios on pre-trained VGG dense and pruned models,\nrespectively, with $<0.4\\%$ accuracy degradation. 
Code is available in\n\\href{https://github.com/ehaleva/RIQ}{github.com/ehaleva/RIQ}.\n","authors":["Joseph Kampeas","Yury Nahshan","Hanoch Kremer","Gil Lederman","Shira Zaloshinski","Zheng Li","Emir Haleva"],"pdf_url":"https://arxiv.org/pdf/2303.03106v3.pdf","comment":"20 pages, 5 figures"},{"id":"http://arxiv.org/abs/2404.07410v2","updated":"2024-12-01T18:48:37Z","published":"2024-04-11T00:49:38Z","title":"Improving Shift Invariance in Convolutional Neural Networks with\n Translation Invariant Polyphase Sampling","summary":" Downsampling operators break the shift invariance of convolutional neural\nnetworks (CNNs) and this affects the robustness of features learned by CNNs\nwhen dealing with even small pixel-level shift. Through a large-scale\ncorrelation analysis framework, we study shift invariance of CNNs by inspecting\nexisting downsampling operators in terms of their maximum-sampling bias (MSB),\nand find that MSB is negatively correlated with shift invariance. Based on this\ncrucial insight, we propose a learnable pooling operator called Translation\nInvariant Polyphase Sampling (TIPS) and two regularizations on the intermediate\nfeature maps of TIPS to reduce MSB and learn translation-invariant\nrepresentations. TIPS can be integrated into any CNN and can be trained\nend-to-end with marginal computational overhead. Our experiments demonstrate\nthat TIPS results in consistent performance gains in terms of accuracy, shift\nconsistency, and shift fidelity on multiple benchmarks for image classification\nand semantic segmentation compared to previous methods and also leads to\nimprovements in adversarial and distributional robustness. 
TIPS results in the\nlowest MSB compared to all previous methods, thus explaining our strong\nempirical results.\n","authors":["Sourajit Saha","Tejas Gokhale"],"pdf_url":"https://arxiv.org/pdf/2404.07410v2.pdf","comment":"Accepted to WACV 2025"},{"id":"http://arxiv.org/abs/2403.03871v2","updated":"2024-12-01T18:13:30Z","published":"2024-03-06T17:23:28Z","title":"Decoupled Vertical Federated Learning for Practical Training on\n Vertically Partitioned Data","summary":" Vertical Federated Learning (VFL) is an emergent distributed machine learning\nparadigm for collaborative learning between clients who have disjoint features\nof common entities. However, standard VFL lacks fault tolerance, with each\nparticipant and connection being a single point of failure. Prior attempts to\ninduce fault tolerance in VFL focus on the scenario of \"straggling clients\",\nusually entailing that all messages eventually arrive or that there is an upper\nbound on the number of late messages. To handle the more general problem of\narbitrary crashes, we propose Decoupled VFL (DVFL). To handle training with\nfaults, DVFL decouples training between communication rounds using local\nunsupervised objectives. By further decoupling label supervision from\naggregation, DVFL also enables redundant aggregators. As secondary benefits,\nDVFL can enhance data efficiency and provides immunity against gradient-based\nattacks. In this work, we implement DVFL for split neural networks with a\nself-supervised autoencoder loss. When there are faults, DVFL outperforms the\nbest VFL-based alternative (97.58% vs 96.95% on an MNIST task). Even under\nperfect conditions, performance is comparable.\n","authors":["Avi Amalanshu","Yash Sirvi","David I. Inouye"],"pdf_url":"https://arxiv.org/pdf/2403.03871v2.pdf","comment":"Revised manuscript. 
Nothing removed, additional baseline results\n added"},{"id":"http://arxiv.org/abs/2411.17661v2","updated":"2024-12-01T17:10:16Z","published":"2024-11-26T18:25:57Z","title":"BERT or FastText? A Comparative Analysis of Contextual as well as\n Non-Contextual Embeddings","summary":" Natural Language Processing (NLP) for low-resource languages presents\nsignificant challenges, particularly due to the scarcity of high-quality\nannotated data and linguistic resources. The choice of embeddings plays a\ncritical role in enhancing the performance of NLP tasks, such as news\nclassification, sentiment analysis, and hate speech detection, especially for\nlow-resource languages like Marathi. In this study, we investigate the impact\nof various embedding techniques- Contextual BERT-based, Non-Contextual\nBERT-based, and FastText-based on NLP classification tasks specific to the\nMarathi language. Our research includes a thorough evaluation of both\ncompressed and uncompressed embeddings, providing a comprehensive overview of\nhow these embeddings perform across different scenarios. Specifically, we\ncompare two BERT model embeddings, Muril and MahaBERT, as well as two FastText\nmodel embeddings, IndicFT and MahaFT. Our evaluation includes applying\nembeddings to a Multiple Logistic Regression (MLR) classifier for task\nperformance assessment, as well as TSNE visualizations to observe the spatial\ndistribution of these embeddings. The results demonstrate that contextual\nembeddings outperform non-contextual embeddings. 
Furthermore, BERT-based\nnon-contextual embeddings extracted from the first BERT embedding layer yield\nbetter results than FastText-based embeddings, suggesting a potential\nalternative to FastText embeddings.\n","authors":["Abhay Shanbhag","Suramya Jadhav","Amogh Thakurdesai","Ridhima Sinare","Raviraj Joshi"],"pdf_url":"https://arxiv.org/pdf/2411.17661v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.13682v3","updated":"2024-12-01T16:49:16Z","published":"2024-05-22T14:25:02Z","title":"Unified Universality Theorem for Deep and Shallow\n Joint-Group-Equivariant Machines","summary":" We present a constructive universal approximation theorem for learning\nmachines equipped with joint-group-equivariant feature maps, called the\njoint-equivariant machines, based on the group representation theory.\n\"Constructive\" here indicates that the distribution of parameters is given in a\nclosed-form expression known as the ridgelet transform.\nJoint-group-equivariance encompasses a broad class of feature maps that\ngeneralize classical group-equivariance. Particularly, fully-connected networks\nare not group-equivariant but are joint-group-equivariant. Our main theorem\nalso unifies the universal approximation theorems for both shallow and deep\nnetworks. Until this study, the universality of deep networks has been shown in\na different manner from the universality of shallow networks, but our results\ndiscuss them on common ground. Now we can understand the approximation schemes\nof various learning machines in a unified manner. 
As applications, we show the\nconstructive universal approximation properties of four examples: depth-$n$\njoint-equivariant machine, depth-$n$ fully-connected network, depth-$n$\ngroup-convolutional network, and a new depth-$2$ network with quadratic forms\nwhose universality has not been known.\n","authors":["Sho Sonoda","Yuka Hashimoto","Isao Ishikawa","Masahiro Ikeda"],"pdf_url":"https://arxiv.org/pdf/2405.13682v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01357v2","updated":"2024-12-01T16:18:23Z","published":"2024-11-02T20:27:51Z","title":"WaKA: Data Attribution using K-Nearest Neighbors and Membership Privacy\n Principles","summary":" In this paper, we introduce WaKA (Wasserstein K-nearest-neighbors\nAttribution), a novel attribution method that leverages principles from the\nLiRA (Likelihood Ratio Attack) framework and k-nearest neighbors classifiers\n(k-NN). WaKA efficiently measures the contribution of individual data points to\nthe model's loss distribution, analyzing every possible k-NN that can be\nconstructed using the training set, without requiring to sample subsets of the\ntraining set. WaKA is versatile and can be used a posteriori as a membership\ninference attack (MIA) to assess privacy risks or a priori for privacy\ninfluence measurement and data valuation. Thus, WaKA can be seen as bridging\nthe gap between data attribution and membership inference attack (MIA) by\nproviding a unified framework to distinguish between a data point's value and\nits privacy risk. For instance, we have shown that self-attribution values are\nmore strongly correlated with the attack success rate than the contribution of\na point to the model generalization. WaKA's different usage were also evaluated\nacross diverse real-world datasets, demonstrating performance very close to\nLiRA when used as an MIA on k-NN classifiers, but with greater computational\nefficiency. 
Additionally, WaKA shows greater robustness than Shapley Values for\ndata minimization tasks (removal or addition) on imbalanced datasets.\n","authors":["Patrick Mesana","Clément Bénesse","Hadrien Lautraite","Gilles Caporossi","Sébastien Gambs"],"pdf_url":"https://arxiv.org/pdf/2411.01357v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02511v3","updated":"2024-12-01T15:55:50Z","published":"2024-02-04T14:51:49Z","title":"PoCo: Policy Composition from and for Heterogeneous Robot Learning","summary":" Training general robotic policies from heterogeneous data for different tasks\nis a significant challenge. Existing robotic datasets vary in different\nmodalities such as color, depth, tactile, and proprioceptive information, and\ncollected in different domains such as simulation, real robots, and human\nvideos. Current methods usually collect and pool all data from one domain to\ntrain a single policy to handle such heterogeneity in tasks and domains, which\nis prohibitively expensive and difficult. In this work, we present a flexible\napproach, dubbed Policy Composition, to combine information across such diverse\nmodalities and domains for learning scene-level and task-level generalized\nmanipulation skills, by composing different data distributions represented with\ndiffusion models. Our method can use task-level composition for multi-task\nmanipulation and be composed with analytic cost functions to adapt policy\nbehaviors at inference time. We train our method on simulation, human, and real\nrobot data and evaluate in tool-use tasks. The composed policy achieves robust\nand dexterous performance under varying scenes and tasks and outperforms\nbaselines from a single data source in both simulation and real-world\nexperiments. See https://liruiw.github.io/policycomp for more details .\n","authors":["Lirui Wang","Jialiang Zhao","Yilun Du","Edward H. 
Adelson","Russ Tedrake"],"pdf_url":"https://arxiv.org/pdf/2402.02511v3.pdf","comment":"R:SS 2024"},{"id":"http://arxiv.org/abs/2410.10578v4","updated":"2024-12-01T15:49:16Z","published":"2024-10-14T14:52:23Z","title":"Burning RED: Unlocking Subtask-Driven Reinforcement Learning and\n Risk-Awareness in Average-Reward Markov Decision Processes","summary":" Average-reward Markov decision processes (MDPs) provide a foundational\nframework for sequential decision-making under uncertainty. However,\naverage-reward MDPs have remained largely unexplored in reinforcement learning\n(RL) settings, with the majority of RL-based efforts having been allocated to\nepisodic and discounted MDPs. In this work, we study a unique structural\nproperty of average-reward MDPs and utilize it to introduce Reward-Extended\nDifferential (or RED) reinforcement learning: a novel RL framework that can be\nused to effectively and efficiently solve various subtasks simultaneously in\nthe average-reward setting. We introduce a family of RED learning algorithms\nfor prediction and control, including proven-convergent algorithms for the\ntabular case. We then showcase the power of these algorithms by demonstrating\nhow they can be used to learn a policy that optimizes, for the first time, the\nwell-known conditional value-at-risk (CVaR) risk measure in a fully-online\nmanner, without the use of an explicit bi-level optimization scheme or an\naugmented state-space.\n","authors":["Juan Sebastian Rojas","Chi-Guhn Lee"],"pdf_url":"https://arxiv.org/pdf/2410.10578v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07154v2","updated":"2024-12-01T15:07:43Z","published":"2024-09-11T09:59:56Z","title":"Recurrent Aggregators in Neural Algorithmic Reasoning","summary":" Neural algorithmic reasoning (NAR) is an emerging field that seeks to design\nneural networks that mimic classical algorithmic computations. 
Today, graph\nneural networks (GNNs) are widely used in neural algorithmic reasoners due to\ntheir message passing framework and permutation equivariance. In this extended\nabstract, we challenge this design choice, and replace the equivariant\naggregation function with a recurrent neural network. While seemingly\ncounter-intuitive, this approach has appropriate grounding when nodes have a\nnatural ordering -- and this is the case frequently in established reasoning\nbenchmarks like CLRS-30. Indeed, our recurrent NAR (RNAR) model performs very\nstrongly on such tasks, while handling many others gracefully. A notable\nachievement of RNAR is its decisive state-of-the-art result on the Heapsort and\nQuickselect tasks, both deemed as a significant challenge for contemporary\nneural algorithmic reasoners -- especially the latter, where RNAR achieves a\nmean micro-F1 score of 87%.\n","authors":["Kaijia Xu","Petar Veličković"],"pdf_url":"https://arxiv.org/pdf/2409.07154v2.pdf","comment":"Presented at the Third Learning on Graphs Conference (LoG 2024). 10\n pages, 1 figure"},{"id":"http://arxiv.org/abs/2312.00640v2","updated":"2024-12-01T15:02:17Z","published":"2023-12-01T15:00:59Z","title":"One to beat them all: \"RYU\" -- a unifying framework for the construction\n of safe balls","summary":" In this paper, we present a new framework, called \"RYU\" for constructing\n\"safe\" regions -- specifically, bounded sets that are guaranteed to contain the\ndual solution of a target optimization problem. Our framework applies to the\nstandard case where the objective function is composed of two components: a\nclosed, proper, convex function with Lipschitz-smooth gradient and another\nclosed, proper, convex function. 
We show that the RYU framework not only\nencompasses but also improves upon the state-of-the-art methods proposed over\nthe past decade for this class of optimization problems.\n","authors":["Thu-Le Tran","Clément Elvira","Hong-Phuong Dang","Cédric Herzet"],"pdf_url":"https://arxiv.org/pdf/2312.00640v2.pdf","comment":"19 pages, 1 table"},{"id":"http://arxiv.org/abs/2411.06237v2","updated":"2024-12-01T13:31:14Z","published":"2024-11-09T17:38:01Z","title":"Leveraging Retrieval-Augmented Generation for Persian University\n Knowledge Retrieval","summary":" This paper introduces an innovative approach using Retrieval-Augmented\nGeneration (RAG) pipelines with Large Language Models (LLMs) to enhance\ninformation retrieval and query response systems for university-related\nquestion answering. By systematically extracting data from the university\nofficial webpage and employing advanced prompt engineering techniques, we\ngenerate accurate, contextually relevant responses to user queries.\n We developed a comprehensive university benchmark, UniversityQuestionBench\n(UQB), to rigorously evaluate our system performance, based on common key\nmetrics in the filed of RAG pipelines, assessing accuracy and reliability\nthrough various metrics and real-world scenarios. Our experimental results\ndemonstrate significant improvements in the precision and relevance of\ngenerated responses, enhancing user experience and reducing the time required\nto obtain relevant answers. 
In summary, this paper presents a novel application\nof RAG pipelines and LLMs, supported by a meticulously prepared university\nbenchmark, offering valuable insights into advanced AI techniques for academic\ndata retrieval and setting the stage for future research in this domain.\n","authors":["Arshia Hemmat","Kianoosh Vadaei","Mohammad Hassan Heydari","Afsaneh Fatemi"],"pdf_url":"https://arxiv.org/pdf/2411.06237v2.pdf","comment":"6 pages, 2 figures, 1 table, Submitted to 15th IKT conference"},{"id":"http://arxiv.org/abs/2411.09111v3","updated":"2024-12-01T13:08:57Z","published":"2024-11-14T00:59:13Z","title":"Reducing Reasoning Costs -- The Path of Optimization for Chain of\n Thought via Sparse Attention Mechanism","summary":" In order to address the chain of thought in the large language model\ninference cost surge, this research proposes to use a sparse attention\nmechanism that only focuses on a few relevant tokens. The researcher\nconstructed a new attention mechanism and used GiantRabbit trained with custom\nGPTs as an experimental tool. The experiment tested and compared the reasoning\ntime, correctness score and chain of thought length of this model and o1\nPreview in solving the linear algebra test questions of MIT OpenCourseWare. The\nresults show that GiantRabbit's reasoning time and chain of thought length are\nsignificantly lower than o1 Preview. It verifies the feasibility of sparse\nattention mechanism for optimizing chain of thought reasoning. Detailed\narchitectural details and experimental process have been uploaded to Github,\nthe link is:https://github.com/brucewang123456789/GeniusTrail.git.\n","authors":["Libo Wang"],"pdf_url":"https://arxiv.org/pdf/2411.09111v3.pdf","comment":"The main text is 5 pages, totaling 9 pages; 4 figures, 1 table. 
It\n have been submitted to NeurIPS 2024 Workshop MusIML and OpenReview"},{"id":"http://arxiv.org/abs/2409.15100v3","updated":"2024-12-01T13:04:28Z","published":"2024-09-23T15:11:40Z","title":"Robust Federated Learning Over the Air: Combating Heavy-Tailed Noise\n with Median Anchored Clipping","summary":" Leveraging over-the-air computations for model aggregation is an effective\napproach to cope with the communication bottleneck in federated edge learning.\nBy exploiting the superposition properties of multi-access channels, this\napproach facilitates an integrated design of communication and computation,\nthereby enhancing system privacy while reducing implementation costs. However,\nthe inherent electromagnetic interference in radio channels often exhibits\nheavy-tailed distributions, giving rise to exceptionally strong noise in\nglobally aggregated gradients that can significantly deteriorate the training\nperformance. To address this issue, we propose a novel gradient clipping\nmethod, termed Median Anchored Clipping (MAC), to combat the detrimental\neffects of heavy-tailed noise. We also derive analytical expressions for the\nconvergence rate of model training with analog over-the-air federated learning\nunder MAC, which quantitatively demonstrates the effect of MAC on training\nperformance. Extensive experimental results show that the proposed MAC\nalgorithm effectively mitigates the impact of heavy-tailed noise, hence\nsubstantially enhancing system robustness.\n","authors":["Jiaxing Li","Zihan Chen","Kai Fong Ernest Chong","Bikramjit Das","Tony Q. S. Quek","Howard H. 
Yang"],"pdf_url":"https://arxiv.org/pdf/2409.15100v3.pdf","comment":"This is the full version of the paper, and the appendix contains a\n complete convergence analysis under non-convex conditions"},{"id":"http://arxiv.org/abs/2312.05878v2","updated":"2024-12-01T11:36:53Z","published":"2023-12-10T13:12:55Z","title":"Skew-Probabilistic Neural Networks for Learning from Imbalanced Data","summary":" Real-world datasets often exhibit imbalanced data distribution, where certain\nclass levels are severely underrepresented. In such cases, traditional pattern\nclassifiers have shown a bias towards the majority class, impeding accurate\npredictions for the minority class. This paper introduces an imbalanced\ndata-oriented classifier using probabilistic neural networks (PNN) with a\nskew-normal kernel function to address this major challenge. PNN is known for\nproviding probabilistic outputs, enabling quantification of prediction\nconfidence, interpretability, and the ability to handle limited data. By\nleveraging the skew-normal distribution, which offers increased flexibility,\nparticularly for imbalanced and non-symmetric data, our proposed\nSkew-Probabilistic Neural Networks (SkewPNN) can better represent underlying\nclass densities. Hyperparameter fine-tuning is imperative to optimize the\nperformance of the proposed approach on imbalanced datasets. To this end, we\nemploy a population-based heuristic algorithm, the Bat optimization algorithm,\nto explore the hyperparameter space effectively. We also prove the statistical\nconsistency of the density estimates, suggesting that the true distribution\nwill be approached smoothly as the sample size increases. Theoretical analysis\nof the computational complexity of the proposed SkewPNN and BA-SkewPNN is also\nprovided. Numerical simulations have been conducted on different synthetic\ndatasets, comparing various benchmark-imbalanced learners. 
Real-data analysis\non several datasets shows that SkewPNN and BA-SkewPNN substantially outperform\nmost state-of-the-art machine-learning methods for both balanced and imbalanced\ndatasets (binary and multi-class categories) in most experimental settings.\n","authors":["Shraddha M. Naik","Tanujit Chakraborty","Madhurima Panja","Abdenour Hadid","Bibhas Chakraborty"],"pdf_url":"https://arxiv.org/pdf/2312.05878v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.19456v2","updated":"2024-12-01T11:27:09Z","published":"2024-10-25T10:30:21Z","title":"Computational Bottlenecks of Training Small-scale Large Language Models","summary":" While large language models (LLMs) dominate the AI landscape, Small-scale\nlarge Language Models (SLMs) are gaining attention due to cost and efficiency\ndemands from consumers. However, there is limited research on the training\nbehavior and computational requirements of SLMs. In this study, we explore the\ncomputational bottlenecks of training SLMs (up to 2B parameters) by examining\nthe effects of various hyperparameters and configurations, including GPU type,\nbatch size, model size, communication protocol, attention type, and the number\nof GPUs. We assess these factors on popular cloud services using metrics such\nas loss per dollar and tokens per second. 
Our findings aim to support the\nbroader adoption and optimization of language model training for low-resource\nAI research institutes.\n","authors":["Saleh Ashkboos","Iman Mirzadeh","Keivan Alizadeh","Mohammad Hossein Sekhavat","Moin Nabi","Mehrdad Farajtabar","Fartash Faghri"],"pdf_url":"https://arxiv.org/pdf/2410.19456v2.pdf","comment":"8 pages, 4 figures"},{"id":"http://arxiv.org/abs/2411.17180v2","updated":"2024-12-01T11:10:31Z","published":"2024-11-26T07:41:15Z","title":"Training a neural netwok for data reduction and better generalization","summary":" The motivation for sparse learners is to compress the inputs (features) by\nselecting only the ones needed for good generalization. Linear models with\nLASSO-type regularization achieve this by setting the weights of irrelevant\nfeatures to zero, effectively identifying and ignoring them. In artificial\nneural networks, this selective focus can be achieved by pruning the input\nlayer. Given a cost function enhanced with a sparsity-promoting penalty, our\nproposal selects a regularization term $\\lambda$ (without the use of\ncross-validation or a validation set) that creates a local minimum in the cost\nfunction at the origin where no features are selected. This local minimum acts\nas a baseline, meaning that if there is no strong enough signal to justify a\nfeature inclusion, the local minimum remains at zero with a high prescribed\nprobability. The method is flexible, applying to complex models ranging from\nshallow to deep artificial neural networks and supporting various cost\nfunctions and sparsity-promoting penalties. We empirically show a remarkable\nphase transition in the probability of retrieving the relevant features, as\nwell as good generalization thanks to the choice of $\\lambda$, the non-convex\npenalty and the optimization scheme developed. 
This approach can be seen as a\nform of compressed sensing for complex models, allowing us to distill\nhigh-dimensional data into a compact, interpretable subset of meaningful\nfeatures.\n","authors":["Sylvain Sardy","Maxime van Cutsem","Xiaoyu Ma"],"pdf_url":"https://arxiv.org/pdf/2411.17180v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12644v2","updated":"2024-12-01T09:09:53Z","published":"2024-01-23T10:54:13Z","title":"Binary Feature Mask Optimization for Feature Selection","summary":" We investigate feature selection problem for generic machine learning models.\nWe introduce a novel framework that selects features considering the outcomes\nof the model. Our framework introduces a novel feature masking approach to\neliminate the features during the selection process, instead of completely\nremoving them from the dataset. This allows us to use the same machine learning\nmodel during feature selection, unlike other feature selection methods where we\nneed to train the machine learning model again as the dataset has different\ndimensions on each iteration. We obtain the mask operator using the predictions\nof the machine learning model, which offers a comprehensive view on the subsets\nof the features essential for the predictive performance of the model. A\nvariety of approaches exist in the feature selection literature. However, to\nour knowledge, no study has introduced a training-free framework for a generic\nmachine learning model to select features while considering the importance of\nthe feature subsets as a whole, instead of focusing on the individual features.\nWe demonstrate significant performance improvements on the real-life datasets\nunder different settings using LightGBM and Multi-Layer Perceptron as our\nmachine learning models. The high performance of our General Binary Mask\nOptimization algorithm stems from its feature masking approach to select\nfeatures and its flexibility in the number of selected features. 
The algorithm\nselects features based on the validation performance of the machine learning\nmodel. Hence, the number of selected features is not predetermined and adjusts\ndynamically to the dataset. Additionally, we openly share the implementation or\nour code to encourage further research in this area.\n","authors":["Mehmet E. Lorasdagi","Mehmet Y. Turali","Suleyman S. Kozat"],"pdf_url":"https://arxiv.org/pdf/2401.12644v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.03768v4","updated":"2024-12-01T08:43:03Z","published":"2024-01-08T09:47:19Z","title":"Corn Yield Prediction Model with Deep Neural Networks for Smallholder\n Farmer Decision Support System","summary":" Crop yield prediction has been modeled on the assumption that there is no\ninteraction between weather and soil variables. However, this paper argues that\nan interaction exists, and it can be finely modelled using the Kendall\nCorrelation coefficient. Given the nonlinearity of the interaction between\nweather and soil variables, a deep neural network regressor (DNNR) is carefully\ndesigned with consideration to the depth, number of neurons of the hidden\nlayers, and the hyperparameters with their optimizations. Additionally, a new\nmetric, the average of absolute root squared error (ARSE) is proposed to\ncombine the strengths of root mean square error (RMSE) and mean absolute error\n(MAE). With the ARSE metric, the proposed DNNR(s), optimised random forest\nregressor (RFR) and the extreme gradient boosting regressor (XGBR) achieved\nimpressively small yield errors, 0.0172 t/ha, and 0.0243 t/ha, 0.0001 t/ha, and\n0.001 t/ha, respectively. However, the DNNR(s), with changes to the explanatory\nvariables to ensure generalizability to unforeseen data, DNNR(s) performed\nbest. Further analysis reveals that a strong interaction does exist between\nweather and soil variables. Precisely, yield is observed to increase when\nprecipitation is reduced and silt increased, and vice-versa. 
However, the\ndegree of decrease or increase is not quantified in this paper. Contrary to\nexisting yield models targeted towards agricultural policies and global food\nsecurity, the goal of the proposed corn yield model is to empower the\nsmallholder farmer to farm smartly and intelligently, thus the prediction model\nis integrated into a mobile application that includes education, and a\nfarmer-to-market access module.\n","authors":["Chollette C. Olisah","Lyndon Smith","Melvyn Smith","Morolake O. Lawrence","Osita Ojukwu"],"pdf_url":"https://arxiv.org/pdf/2401.03768v4.pdf","comment":"30 Pages, 11 Figures, 3 Tables"},{"id":"http://arxiv.org/abs/2402.09427v2","updated":"2024-12-01T08:33:42Z","published":"2024-01-24T05:28:29Z","title":"DoorINet: Door Heading Prediction through Inertial Deep Learning","summary":" Inertial sensors are widely used in a variety of applications. A common task\nis orientation estimation. To tackle such a task, attitude and heading\nreference system algorithms are applied. Relying on the gyroscope readings, the\naccelerometer measurements are used to update the attitude angles, and\nmagnetometer measurements are utilized to update the heading angle. In indoor\nenvironments, magnetometers suffer from interference that degrades their\nperformance resulting in poor heading angle estimation. Therefore, applications\nthat estimate the heading angle of moving objects, such as walking pedestrians,\nclosets, and refrigerators, are prone to error. To circumvent such situations,\nwe propose DoorINet, an end-to-end deep-learning framework to calculate the\nheading angle from door-mounted, low-cost inertial sensors without using\nmagnetometers. To evaluate our approach, we record a unique dataset containing\n391 minutes of accelerometer and gyroscope measurements and corresponding\nground-truth heading angle. 
We show that our proposed approach outperforms\ncommonly used, model based approaches and data-driven methods.\n","authors":["Aleksei Zakharchenko","Sharon Farber","Itzik Klein"],"pdf_url":"https://arxiv.org/pdf/2402.09427v2.pdf","comment":"10 pages, 14 figures, 4 tables"},{"id":"http://arxiv.org/abs/2411.17788v2","updated":"2024-12-01T08:00:56Z","published":"2024-11-26T15:29:38Z","title":"Geometric Point Attention Transformer for 3D Shape Reassembly","summary":" Shape assembly, which aims to reassemble separate parts into a complete\nobject, has gained significant interest in recent years. Existing methods\nprimarily rely on networks to predict the poses of individual parts, but often\nfail to effectively capture the geometric interactions between the parts and\ntheir poses. In this paper, we present the Geometric Point Attention\nTransformer (GPAT), a network specifically designed to address the challenges\nof reasoning about geometric relationships. In the geometric point attention\nmodule, we integrate both global shape information and local pairwise geometric\nfeatures, along with poses represented as rotation and translation vectors for\neach part. To enable iterative updates and dynamic reasoning, we introduce a\ngeometric recycling scheme, where each prediction is fed into the next\niteration for refinement. 
We evaluate our model on both the semantic and\ngeometric assembly tasks, showing that it outperforms previous methods in\nabsolute pose estimation, achieving accurate pose predictions and high\nalignment accuracy.\n","authors":["Jiahan Li","Chaoran Cheng","Jianzhu Ma","Ge Liu"],"pdf_url":"https://arxiv.org/pdf/2411.17788v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08958v2","updated":"2024-12-01T06:47:45Z","published":"2024-09-13T16:23:17Z","title":"PINNfluence: Influence Functions for Physics-Informed Neural Networks","summary":" Recently, physics-informed neural networks (PINNs) have emerged as a flexible\nand promising application of deep learning to partial differential equations in\nthe physical sciences. While offering strong performance and competitive\ninference speeds on forward and inverse problems, their black-box nature limits\ninterpretability, particularly regarding alignment with expected physical\nbehavior. In the present work, we explore the application of influence\nfunctions (IFs) to validate and debug PINNs post-hoc. Specifically, we apply\nvariations of IF-based indicators to gauge the influence of different types of\ncollocation points on the prediction of PINNs applied to a 2D Navier-Stokes\nfluid flow problem. Our results demonstrate how IFs can be adapted to PINNs to\nreveal the potential for further studies. The code is publicly available at\nhttps://github.com/aleks-krasowski/PINNfluence.\n","authors":["Jonas R. Naujoks","Aleksander Krasowski","Moritz Weckbecker","Thomas Wiegand","Sebastian Lapuschkin","Wojciech Samek","René P. Klausen"],"pdf_url":"https://arxiv.org/pdf/2409.08958v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07602v2","updated":"2024-12-01T06:39:41Z","published":"2024-11-12T07:24:41Z","title":"Circuit Complexity Bounds for RoPE-based Transformer Architecture","summary":" Characterizing the express power of the Transformer architecture is critical\nto understanding its capacity limits and scaling law. 
Recent works provide the\ncircuit complexity bounds to Transformer-like architecture. On the other hand,\nRotary Position Embedding ($\\mathsf{RoPE}$) has emerged as a crucial technique\nin modern large language models, offering superior performance in capturing\npositional information compared to traditional position embeddings, which shows\ngreat potential in application prospects, particularly for the long context\nscenario. Empirical evidence also suggests that $\\mathsf{RoPE}$-based\nTransformer architectures demonstrate greater generalization capabilities\ncompared to conventional Transformer models. In this work, we establish a\ncircuit complexity bound for Transformers with $\\mathsf{RoPE}$ attention. Our\nkey contribution is that we show that unless $\\mathsf{TC}^0 = \\mathsf{NC}^1$, a\n$\\mathsf{RoPE}$-based Transformer with $\\mathrm{poly}(n)$-precision, $O(1)$\nlayers, hidden dimension $d \\leq O(n)$ cannot solve the Arithmetic formula\nevaluation problem or the Boolean formula value problem. This result\nsignificantly demonstrates the fundamental limitation of the expressivity of\nthe $\\mathsf{RoPE}$-based Transformer architecture, although it achieves giant\nempirical success. Our theoretical result not only establishes the complexity\nbound but also may instruct further work on the $\\mathsf{RoPE}$-based\nTransformer.\n","authors":["Bo Chen","Xiaoyu Li","Yingyu Liang","Jiangxuan Long","Zhenmei Shi","Zhao Song"],"pdf_url":"https://arxiv.org/pdf/2411.07602v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02490v4","updated":"2024-12-01T05:46:03Z","published":"2023-08-04T17:59:47Z","title":"MM-Vet: Evaluating Large Multimodal Models for Integrated Capabilities","summary":" We propose MM-Vet, an evaluation benchmark that examines large multimodal\nmodels (LMMs) on complicated multimodal tasks. 
Recent LMMs have shown various\nintriguing abilities, such as solving math problems written on the blackboard,\nreasoning about events and celebrities in news images, and explaining visual\njokes. Rapid model advancements pose challenges to evaluation benchmark\ndevelopment. Problems include: (1) How to systematically structure and evaluate\nthe complicated multimodal tasks; (2) How to design evaluation metrics that\nwork well across question and answer types; and (3) How to give model insights\nbeyond a simple performance ranking. To this end, we present MM-Vet, designed\nbased on the insight that the intriguing ability to solve complicated tasks is\noften achieved by a generalist model being able to integrate different core\nvision-language (VL) capabilities. MM-Vet defines 6 core VL capabilities and\nexamines the 16 integrations of interest derived from the capability\ncombination. For evaluation metrics, we propose an LLM-based evaluator for\nopen-ended outputs. The evaluator enables the evaluation across different\nquestion types and answer styles, resulting in a unified scoring metric. We\nevaluate representative LMMs on MM-Vet, providing insights into the\ncapabilities of different LMM system paradigms and models.\n","authors":["Weihao Yu","Zhengyuan Yang","Linjie Li","Jianfeng Wang","Kevin Lin","Zicheng Liu","Xinchao Wang","Lijuan Wang"],"pdf_url":"https://arxiv.org/pdf/2308.02490v4.pdf","comment":"ICML 2024. Code, data and leaderboard:\n https://github.com/yuweihao/MM-Vet"},{"id":"http://arxiv.org/abs/2405.18560v2","updated":"2024-12-01T05:22:22Z","published":"2024-05-28T20:10:06Z","title":"Potential Field Based Deep Metric Learning","summary":" Deep metric learning (DML) involves training a network to learn a\nsemantically meaningful representation space. Many current approaches mine\nn-tuples of examples and model interactions within each tuplets. 
We present a\nnovel, compositional DML model, inspired by electrostatic fields in physics\nthat, instead of in tuples, represents the influence of each example\n(embedding) by a continuous potential field, and superposes the fields to\nobtain their combined global potential field. We use attractive/repulsive\npotential fields to represent interactions among embeddings from images of the\nsame/different classes. Contrary to typical learning methods, where mutual\ninfluence of samples is proportional to their distance, we enforce reduction in\nsuch influence with distance, leading to a decaying field. We show that such\ndecay helps improve performance on real world datasets with large intra-class\nvariations and label noise. Like other proxy-based methods, we also use proxies\nto succinctly represent sub-populations of examples. We evaluate our method on\nthree standard DML benchmarks- Cars-196, CUB-200-2011, and SOP datasets where\nit outperforms state-of-the-art baselines.\n","authors":["Shubhang Bhatnagar","Narendra Ahuja"],"pdf_url":"https://arxiv.org/pdf/2405.18560v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.09856v2","updated":"2024-12-01T05:18:12Z","published":"2024-11-15T00:31:45Z","title":"InvestESG: A multi-agent reinforcement learning benchmark for studying\n climate investment as a social dilemma","summary":" InvestESG is a novel multi-agent reinforcement learning (MARL) benchmark\ndesigned to study the impact of Environmental, Social, and Governance (ESG)\ndisclosure mandates on corporate climate investments. Supported by both PyTorch\nand JAX implementation, the benchmark models an intertemporal social dilemma\nwhere companies balance short-term profit losses from climate mitigation\nefforts and long-term benefits from reducing climate risk, while ESG-conscious\ninvestors attempt to influence corporate behavior through their investment\ndecisions, in a scalable and hardware-accelerated manner. 
Companies allocate\ncapital across mitigation, greenwashing, and resilience, with varying\nstrategies influencing climate outcomes and investor preferences. Our\nexperiments show that without ESG-conscious investors with sufficient capital,\ncorporate mitigation efforts remain limited under the disclosure mandate.\nHowever, when a critical mass of investors prioritizes ESG, corporate\ncooperation increases, which in turn reduces climate risks and enhances\nlong-term financial stability. Additionally, providing more information about\nglobal climate risks encourages companies to invest more in mitigation, even\nwithout investor involvement. Our findings align with empirical research using\nreal-world data, highlighting MARL's potential to inform policy by providing\ninsights into large-scale socio-economic challenges through efficient testing\nof alternative policy and market designs.\n","authors":["Xiaoxuan Hou","Jiayi Yuan","Joel Z. Leibo","Natasha Jaques"],"pdf_url":"https://arxiv.org/pdf/2411.09856v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.11858v4","updated":"2024-12-01T05:15:11Z","published":"2024-02-19T06:00:35Z","title":"Stochastic Hessian Fittings with Lie Groups","summary":" This report studies the fitting of Hessian or its inverse for stochastic\noptimizations using a Hessian fitting criterion from the preconditioned\nstochastic gradient descent (PSGD) method, which is intimately related to many\ncommonly used second-order and adaptive gradient optimizers, e.g., BFGS,\nGaussian-Newton algorithm, natural gradient descent, AdaGrad, etc. Our analyses\nreveal the efficiency and reliability differences among a wide range of\npreconditioner fitting methods, from closed-form to iterative solutions, using\nHessian-vector products or stochastic gradients only, with Hessian fittings in\nthe Euclidean space, the manifold of symmetric positive definite (SPL)\nmatrices, to a variety of Lie groups. 
The most intriguing discovery is that the\nHessian fitting itself as an optimization problem is strongly convex under mild\nconditions in certain general Lie groups. This discovery turns Hessian fitting\ninto a well-behaved Lie group optimization problem and facilitates the designs\nof highly efficient and elegant Lie group sparse preconditioner fitting methods\nfor large-scale stochastic optimizations.\n","authors":["Xi-Lin Li"],"pdf_url":"https://arxiv.org/pdf/2402.11858v4.pdf","comment":"14 pages; 6 figures; 3 tables; code\n https://github.com/lixilinx/psgd_torch"},{"id":"http://arxiv.org/abs/2407.05593v4","updated":"2024-12-01T04:36:57Z","published":"2024-07-08T04:15:43Z","title":"Unmasking Trees for Tabular Data","summary":" Despite much work on advanced deep learning and generative modeling\ntechniques for tabular data generation and imputation, traditional methods have\ncontinued to win on imputation benchmarks. We herein present UnmaskingTrees, a\nsimple method for tabular imputation (and generation) employing\ngradient-boosted decision trees which are used to incrementally unmask\nindividual features. This approach offers state-of-the-art performance on\nimputation, and on generation given training data with missingness; and it has\ncompetitive performance on vanilla generation. To solve the conditional\ngeneration subproblem, we propose a tabular probabilistic prediction method,\nBaltoBot, which fits a balanced tree of boosted tree classifiers. Unlike older\nmethods, it requires no parametric assumption on the conditional distribution,\naccommodating features with multimodal distributions; unlike newer diffusion\nmethods, it offers fast sampling, closed-form density estimation, and flexible\nhandling of discrete variables. 
We finally consider our two approaches as\nmeta-algorithms, demonstrating in-context learning-based generative modeling\nwith TabPFN.\n","authors":["Calvin McCarter"],"pdf_url":"https://arxiv.org/pdf/2407.05593v4.pdf","comment":"v0.3.0 of UnmaskingTrees software"},{"id":"http://arxiv.org/abs/2408.07712v2","updated":"2024-12-01T04:34:53Z","published":"2024-08-13T23:08:06Z","title":"Introduction to Reinforcement Learning","summary":" Reinforcement Learning (RL), a subfield of Artificial Intelligence (AI),\nfocuses on training agents to make decisions by interacting with their\nenvironment to maximize cumulative rewards. This paper provides an overview of\nRL, covering its core concepts, methodologies, and resources for further\nlearning. It offers a thorough explanation of fundamental components such as\nstates, actions, policies, and reward signals, ensuring readers develop a solid\nfoundational understanding. Additionally, the paper presents a variety of RL\nalgorithms, categorized based on the key factors such as model-free,\nmodel-based, value-based, policy-based, and other key factors. Resources for\nlearning and implementing RL, such as books, courses, and online communities\nare also provided. 
By offering a clear, structured introduction, this paper\naims to simplify the complexities of RL for beginners, providing a\nstraightforward pathway to understanding and applying real-time techniques.\n","authors":["Majid Ghasemi","Dariush Ebrahimi"],"pdf_url":"https://arxiv.org/pdf/2408.07712v2.pdf","comment":"19 pages"},{"id":"http://arxiv.org/abs/2401.17133v2","updated":"2024-12-01T04:06:27Z","published":"2024-01-30T16:07:44Z","title":"SongBsAb: A Dual Prevention Approach against Singing Voice Conversion\n based Illegal Song Covers","summary":" Singing voice conversion (SVC) automates song covers by converting a source\nsinging voice from a source singer into a new singing voice with the same\nlyrics and melody as the source, but sounds like being covered by the target\nsinger of some given target singing voices. However, it raises serious concerns\nabout copyright and civil right infringements. We propose SongBsAb, the first\nproactive approach to tackle SVC-based illegal song covers. SongBsAb adds\nperturbations to singing voices before releasing them, so that when they are\nused, the process of SVC will be interfered, leading to unexpected singing\nvoices. Perturbations are carefully crafted to (1) provide a dual prevention,\ni.e., preventing the singing voice from being used as the source and target\nsinging voice in SVC, by proposing a gender-transformation loss and a high/low\nhierarchy multi-target loss, respectively; and (2) be harmless, i.e., no\nside-effect on the enjoyment of protected songs, by refining a psychoacoustic\nmodel-based loss with the backing track as an additional masker, a unique\naccompanying element for singing voices compared to ordinary speech voices. We\nalso adopt a frame-level interaction reduction-based loss and encoder ensemble\nto enhance the transferability of SongBsAb to unknown SVC models. 
We\ndemonstrate the prevention effectiveness, harmlessness, and robustness of\nSongBsAb on five diverse and promising SVC models, using both English and\nChinese datasets, and both objective and human study-based subjective metrics.\nOur work fosters an emerging research direction for mitigating illegal\nautomated song covers.\n","authors":["Guangke Chen","Yedi Zhang","Fu Song","Ting Wang","Xiaoning Du","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2401.17133v2.pdf","comment":"In Proceedings of the 32nd Network and Distributed System Security\n (NDSS) Symposium 2025"},{"id":"http://arxiv.org/abs/2212.03853v6","updated":"2024-12-01T04:02:40Z","published":"2022-12-05T12:33:26Z","title":"Clustering with Neural Network and Index","summary":" A new model called Clustering with Neural Network and Index (CNNI) is\nintroduced. CNNI uses a Neural Network to cluster data points. Training of the\nNeural Network mimics supervised learning, with an internal clustering\nevaluation index acting as the loss function. An experiment is conducted to\ntest the feasibility of the new model, and compared with results of other\nclustering models like K-means and Gaussian Mixture Model (GMM). The result\nshows CNNI can work properly for clustering data; CNNI equipped with MMJ-SC,\nachieves the first parametric (inductive) clustering model that can deal with\nnon-convex shaped (non-flat geometry) data.\n","authors":["Gangli Liu"],"pdf_url":"https://arxiv.org/pdf/2212.03853v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.21712v2","updated":"2024-12-01T03:54:57Z","published":"2024-10-29T03:54:48Z","title":"Sliced-Wasserstein-based Anomaly Detection and Open Dataset for\n Localized Critical Peak Rebates","summary":" In this work, we present a new unsupervised anomaly (outlier) detection (AD)\nmethod using the sliced-Wasserstein metric. 
This filtering technique is\nconceptually interesting for MLOps pipelines deploying machine learning models\nin critical sectors, e.g., energy, as it offers a conservative data selection.\nAdditionally, we open the first dataset showcasing localized critical peak\nrebate demand response in a northern climate. We demonstrate the capabilities\nof our method on synthetic datasets as well as standard AD datasets and use it\nin the making of a first benchmark for our open-source localized critical peak\nrebate dataset.\n","authors":["Julien Pallage","Bertrand Scherrer","Salma Naccache","Christophe Bélanger","Antoine Lesage-Landry"],"pdf_url":"https://arxiv.org/pdf/2410.21712v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.11920v3","updated":"2024-12-01T03:49:57Z","published":"2024-06-17T07:22:51Z","title":"Job-SDF: A Multi-Granularity Dataset for Job Skill Demand Forecasting\n and Benchmarking","summary":" In a rapidly evolving job market, skill demand forecasting is crucial as it\nenables policymakers and businesses to anticipate and adapt to changes,\nensuring that workforce skills align with market needs, thereby enhancing\nproductivity and competitiveness. Additionally, by identifying emerging skill\nrequirements, it directs individuals towards relevant training and education\nopportunities, promoting continuous self-learning and development. However, the\nabsence of comprehensive datasets presents a significant challenge, impeding\nresearch and the advancement of this field. To bridge this gap, we present\nJob-SDF, a dataset designed to train and benchmark job-skill demand forecasting\nmodels. Based on 10.35 million public job advertisements collected from major\nonline recruitment platforms in China between 2021 and 2023, this dataset\nencompasses monthly recruitment demand for 2,324 types of skills across 521\ncompanies. 
Our dataset uniquely enables evaluating skill demand forecasting\nmodels at various granularities, including occupation, company, and regional\nlevels. We benchmark a range of models on this dataset, evaluating their\nperformance in standard scenarios, in predictions focused on lower value\nranges, and in the presence of structural breaks, providing new insights for\nfurther research. Our code and dataset are publicly accessible via the\nhttps://github.com/Job-SDF/benchmark.\n","authors":["Xi Chen","Chuan Qin","Chuyu Fang","Chao Wang","Chen Zhu","Fuzhen Zhuang","Hengshu Zhu","Hui Xiong"],"pdf_url":"https://arxiv.org/pdf/2406.11920v3.pdf","comment":"NeurIPS 2024 Accepted"},{"id":"http://arxiv.org/abs/2410.23686v2","updated":"2024-12-01T03:08:21Z","published":"2024-10-31T07:20:40Z","title":"Towards Dynamic Message Passing on Graphs","summary":" Message passing plays a vital role in graph neural networks (GNNs) for\neffective feature learning. However, the over-reliance on input topology\ndiminishes the efficacy of message passing and restricts the ability of GNNs.\nDespite efforts to mitigate the reliance, existing study encounters\nmessage-passing bottlenecks or high computational expense problems, which\ninvokes the demands for flexible message passing with low complexity. In this\npaper, we propose a novel dynamic message-passing mechanism for GNNs. It\nprojects graph nodes and learnable pseudo nodes into a common space with\nmeasurable spatial relations between them. With nodes moving in the space,\ntheir evolving relations facilitate flexible pathway construction for a dynamic\nmessage-passing process. Associating pseudo nodes to input graphs with their\nmeasured relations, graph nodes can communicate with each other intermediately\nthrough pseudo nodes under linear complexity. 
We further develop a GNN model\nnamed $\\mathtt{\\mathbf{N^2}}$ based on our dynamic message-passing mechanism.\n$\\mathtt{\\mathbf{N^2}}$ employs a single recurrent layer to recursively\ngenerate the displacements of nodes and construct optimal dynamic pathways.\nEvaluation on eighteen benchmarks demonstrates the superior performance of\n$\\mathtt{\\mathbf{N^2}}$ over popular GNNs. $\\mathtt{\\mathbf{N^2}}$ successfully\nscales to large-scale benchmarks and requires significantly fewer parameters\nfor graph classification with the shared recurrent layer.\n","authors":["Junshu Sun","Chenxue Yang","Xiangyang Ji","Qingming Huang","Shuhui Wang"],"pdf_url":"https://arxiv.org/pdf/2410.23686v2.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.15876v2","updated":"2024-12-01T02:38:17Z","published":"2024-10-21T10:57:45Z","title":"FlickerFusion: Intra-trajectory Domain Generalizing Multi-Agent RL","summary":" Multi-agent reinforcement learning has demonstrated significant potential in\naddressing complex cooperative tasks across various real-world applications.\nHowever, existing MARL approaches often rely on the restrictive assumption that\nthe number of entities (e.g., agents, obstacles) remains constant between\ntraining and inference. This overlooks scenarios where entities are dynamically\nremoved or added during the inference trajectory -- a common occurrence in\nreal-world environments like search and rescue missions and dynamic combat\nsituations. In this paper, we tackle the challenge of intra-trajectory dynamic\nentity composition under zero-shot out-of-domain (OOD) generalization, where\nsuch dynamic changes cannot be anticipated beforehand. Our empirical studies\nreveal that existing MARL methods suffer significant performance degradation\nand increased uncertainty in these scenarios. 
In response, we propose\nFlickerFusion, a novel OOD generalization method that acts as a universally\napplicable augmentation technique for MARL backbone methods. FlickerFusion\nstochastically drops out parts of the observation space, emulating being\nin-domain when inferenced OOD. The results show that FlickerFusion not only\nachieves superior inference rewards but also uniquely reduces uncertainty\nvis-\\`a-vis the backbone, compared to existing methods. Benchmarks,\nimplementations, and model weights are organized and open-sourced at\nflickerfusion305.github.io, accompanied by ample demo video renderings.\n","authors":["Woosung Koh","Wonbeen Oh","Siyeol Kim","Suhin Shin","Hyeongjin Kim","Jaein Jang","Junghyun Lee","Se-Young Yun"],"pdf_url":"https://arxiv.org/pdf/2410.15876v2.pdf","comment":"NeurIPS '24 Open-World Agents Workshop (v2: minor revision)"},{"id":"http://arxiv.org/abs/2410.21107v2","updated":"2024-12-01T02:36:26Z","published":"2024-10-28T15:11:23Z","title":"Tree-Wasserstein Distance for High Dimensional Data with a Latent\n Feature Hierarchy","summary":" Finding meaningful distances between high-dimensional data samples is an\nimportant scientific task. To this end, we propose a new tree-Wasserstein\ndistance (TWD) for high-dimensional data with two key aspects. First, our TWD\nis specifically designed for data with a latent feature hierarchy, i.e., the\nfeatures lie in a hierarchical space, in contrast to the usual focus on\nembedding samples in hyperbolic space. Second, while the conventional use of\nTWD is to speed up the computation of the Wasserstein distance, we use its\ninherent tree as a means to learn the latent feature hierarchy. The key idea of\nour method is to embed the features into a multi-scale hyperbolic space using\ndiffusion geometry and then present a new tree decoding method by establishing\nanalogies between the hyperbolic embedding and trees. 
We show that our TWD\ncomputed based on data observations provably recovers the TWD defined with the\nlatent feature hierarchy and that its computation is efficient and scalable. We\nshowcase the usefulness of the proposed TWD in applications to word-document\nand single-cell RNA-sequencing datasets, demonstrating its advantages over\nexisting TWDs and methods based on pre-trained models.\n","authors":["Ya-Wei Eileen Lin","Ronald R. Coifman","Gal Mishne","Ronen Talmon"],"pdf_url":"https://arxiv.org/pdf/2410.21107v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.14803v4","updated":"2024-12-01T02:09:21Z","published":"2024-10-18T18:19:56Z","title":"DistRL: An Asynchronous Distributed Reinforcement Learning Framework for\n On-Device Control Agents","summary":" On-device control agents, especially on mobile devices, are responsible for\noperating mobile devices to fulfill users' requests, enabling seamless and\nintuitive interactions. Integrating Multimodal Large Language Models (MLLMs)\ninto these agents enhances their ability to understand and execute complex\ncommands, thereby improving user experience. However, fine-tuning MLLMs for\non-device control presents significant challenges due to limited data\navailability and inefficient online training processes. This paper introduces\nDistRL, a novel framework designed to enhance the efficiency of online RL\nfine-tuning for mobile device control agents. DistRL employs centralized\ntraining and decentralized data acquisition to ensure efficient fine-tuning in\nthe context of dynamic online interactions. Additionally, the framework is\nbacked by our tailor-made RL algorithm, which effectively balances exploration\nwith the prioritized utilization of collected data to ensure stable and robust\ntraining. Our experiments show that, on average, DistRL delivers a 3X\nimprovement in training efficiency and enables training data collection 2.4X\nfaster than the leading synchronous multi-machine methods. 
Notably, after\ntraining, DistRL achieves a 20% relative improvement in success rate compared\nto state-of-the-art methods on general Android tasks from an open benchmark,\nsignificantly outperforming existing approaches while maintaining the same\ntraining time. These results validate DistRL as a scalable and efficient\nsolution, offering substantial improvements in both training efficiency and\nagent performance for real-world, in-the-wild device control tasks.\n","authors":["Taiyi Wang","Zhihao Wu","Jianheng Liu","Jianye Hao","Jun Wang","Kun Shao"],"pdf_url":"https://arxiv.org/pdf/2410.14803v4.pdf","comment":"Paper and Appendix, 26 pages"},{"id":"http://arxiv.org/abs/2309.17249v3","updated":"2024-12-01T01:36:50Z","published":"2023-09-29T13:55:45Z","title":"Batch Calibration: Rethinking Calibration for In-Context Learning and\n Prompt Engineering","summary":" Prompting and in-context learning (ICL) have become efficient learning\nparadigms for large language models (LLMs). However, LLMs suffer from prompt\nbrittleness and various bias factors in the prompt, including but not limited\nto the formatting, the choice verbalizers, and the ICL examples. To address\nthis problem that results in unexpected performance degradation, calibration\nmethods have been developed to mitigate the effects of these biases while\nrecovering LLM performance. In this work, we first conduct a systematic\nanalysis of the existing calibration methods, where we both provide a unified\nview and reveal the failure cases. Inspired by these analyses, we propose Batch\nCalibration (BC), a simple yet intuitive method that controls the contextual\nbias from the batched input, unifies various prior approaches, and effectively\naddresses the aforementioned issues. BC is zero-shot, inference-only, and\nincurs negligible additional costs. In the few-shot setup, we further extend BC\nto allow it to learn the contextual bias from labeled data. 
We validate the\neffectiveness of BC with PaLM 2-(S, M, L) and CLIP models and demonstrate\nstate-of-the-art performance over previous calibration baselines across more\nthan 10 natural language understanding and image classification tasks.\n","authors":["Han Zhou","Xingchen Wan","Lev Proleev","Diana Mincu","Jilin Chen","Katherine Heller","Subhrajit Roy"],"pdf_url":"https://arxiv.org/pdf/2309.17249v3.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2405.11848v2","updated":"2024-12-01T00:49:32Z","published":"2024-05-20T07:47:06Z","title":"Alternators For Sequence Modeling","summary":" This paper introduces alternators, a novel family of non-Markovian dynamical\nmodels for sequences. An alternator features two neural networks: the\nobservation trajectory network (OTN) and the feature trajectory network (FTN).\nThe OTN and the FTN work in conjunction, alternating between outputting samples\nin the observation space and some feature space, respectively, over a cycle.\nThe parameters of the OTN and the FTN are not time-dependent and are learned\nvia a minimum cross-entropy criterion over the trajectories. Alternators are\nversatile. They can be used as dynamical latent-variable generative models or\nas sequence-to-sequence predictors. Alternators can uncover the latent dynamics\nunderlying complex sequential data, accurately forecast and impute missing\ndata, and sample new trajectories. We showcase the capabilities of alternators\nin three applications. We first used alternators to model the Lorenz equations,\noften used to describe chaotic behavior. We then applied alternators to\nNeuroscience, to map brain activity to physical activity. Finally, we applied\nalternators to Climate Science, focusing on sea-surface temperature\nforecasting. 
In all our experiments, we found alternators are stable to train,\nfast to sample from, yield high-quality generated samples and latent variables,\nand often outperform strong baselines such as Mambas, neural ODEs, and\ndiffusion models in the domains we studied.\n","authors":["Mohammad Reza Rezaei","Adji Bousso Dieng"],"pdf_url":"https://arxiv.org/pdf/2405.11848v2.pdf","comment":"A new versatile family of sequence models that can be used for both\n generative modeling and supervised learning. The codebase will be made\n available upon publication. This paper is dedicated to Thomas Sankara"},{"id":"http://arxiv.org/abs/2401.10989v3","updated":"2024-12-01T00:44:02Z","published":"2024-01-19T19:04:23Z","title":"Provably Scalable Black-Box Variational Inference with Structured\n Variational Families","summary":" Variational families with full-rank covariance approximations are known not\nto work well in black-box variational inference (BBVI), both empirically and\ntheoretically. In fact, recent computational complexity results for BBVI have\nestablished that full-rank variational families scale poorly with the\ndimensionality of the problem compared to e.g. mean-field families. This is\nparticularly critical to hierarchical Bayesian models with local variables;\ntheir dimensionality increases with the size of the datasets. Consequently, one\ngets an iteration complexity with an explicit $\\mathcal{O}(N^2)$ dependence on\nthe dataset size $N$. In this paper, we explore a theoretical middle ground\nbetween mean-field variational families and full-rank families: structured\nvariational families. We rigorously prove that certain scale matrix structures\ncan achieve a better iteration complexity of $\\mathcal{O}\\left(N\\right)$,\nimplying better scaling with respect to $N$. We empirically verify our\ntheoretical results on large-scale hierarchical models.\n","authors":["Joohwan Ko","Kyurae Kim","Woo Chang Kim","Jacob R. 
Gardner"],"pdf_url":"https://arxiv.org/pdf/2401.10989v3.pdf","comment":"Accepted to ICML'24; v3: fixed typos"},{"id":"http://arxiv.org/abs/2309.03468v2","updated":"2024-12-01T00:09:44Z","published":"2023-09-07T03:33:49Z","title":"Support-Set Context Matters for Bongard Problems","summary":" Current machine learning methods struggle to solve Bongard problems, which\nare a type of IQ test that requires deriving an abstract \"concept\" from a set\nof positive and negative \"support\" images, and then classifying whether or not\na new query image depicts the key concept. On Bongard-HOI, a benchmark for\nnatural-image Bongard problems, most existing methods have reached at best 69%\naccuracy (where chance is 50%). Low accuracy is often attributed to neural\nnets' lack of ability to find human-like symbolic rules. In this work, we point\nout that many existing methods are forfeiting accuracy due to a much simpler\nproblem: they do not adapt image features given information contained in the\nsupport set as a whole, and rely instead on information extracted from\nindividual supports. This is a critical issue, because the \"key concept\" in a\ntypical Bongard problem can often only be distinguished using multiple\npositives and multiple negatives. We explore simple methods to incorporate this\ncontext and show substantial gains over prior works, leading to new\nstate-of-the-art accuracy on Bongard-LOGO (75.3%) and Bongard-HOI (76.4%)\ncompared to methods with equivalent vision backbone architectures and strong\nperformance on the original Bongard problem set (60.8%).\n","authors":["Nikhil Raghuraman","Adam W. Harley","Leonidas Guibas"],"pdf_url":"https://arxiv.org/pdf/2309.03468v2.pdf","comment":"TMLR October 2024. 
Code:\n https://github.com/nraghuraman/bongard-context"}],"Multimedia":[{"id":"http://arxiv.org/abs/2308.05037v3","updated":"2024-12-01T15:17:03Z","published":"2023-08-09T16:09:44Z","title":"Separate Anything You Describe","summary":" Language-queried audio source separation (LASS) is a new paradigm for\ncomputational auditory scene analysis (CASA). LASS aims to separate a target\nsound from an audio mixture given a natural language query, which provides a\nnatural and scalable interface for digital audio applications. Recent works on\nLASS, despite attaining promising separation performance on specific sources\n(e.g., musical instruments, limited classes of audio events), are unable to\nseparate audio concepts in the open domain. In this work, we introduce\nAudioSep, a foundation model for open-domain audio source separation with\nnatural language queries. We train AudioSep on large-scale multimodal datasets\nand extensively evaluate its capabilities on numerous tasks including audio\nevent separation, musical instrument separation, and speech enhancement.\nAudioSep demonstrates strong separation performance and impressive zero-shot\ngeneralization ability using audio captions or text labels as queries,\nsubstantially outperforming previous audio-queried and language-queried sound\nseparation models. For reproducibility of this work, we will release the source\ncode, evaluation benchmark and pre-trained model at:\nhttps://github.com/Audio-AGI/AudioSep.\n","authors":["Xubo Liu","Qiuqiang Kong","Yan Zhao","Haohe Liu","Yi Yuan","Yuzhuo Liu","Rui Xia","Yuxuan Wang","Mark D. 
Plumbley","Wenwu Wang"],"pdf_url":"https://arxiv.org/pdf/2308.05037v3.pdf","comment":"Code, benchmark and pre-trained models:\n https://github.com/Audio-AGI/AudioSep"},{"id":"http://arxiv.org/abs/2401.17133v2","updated":"2024-12-01T04:06:27Z","published":"2024-01-30T16:07:44Z","title":"SongBsAb: A Dual Prevention Approach against Singing Voice Conversion\n based Illegal Song Covers","summary":" Singing voice conversion (SVC) automates song covers by converting a source\nsinging voice from a source singer into a new singing voice with the same\nlyrics and melody as the source, but sounds like being covered by the target\nsinger of some given target singing voices. However, it raises serious concerns\nabout copyright and civil right infringements. We propose SongBsAb, the first\nproactive approach to tackle SVC-based illegal song covers. SongBsAb adds\nperturbations to singing voices before releasing them, so that when they are\nused, the process of SVC will be interfered, leading to unexpected singing\nvoices. Perturbations are carefully crafted to (1) provide a dual prevention,\ni.e., preventing the singing voice from being used as the source and target\nsinging voice in SVC, by proposing a gender-transformation loss and a high/low\nhierarchy multi-target loss, respectively; and (2) be harmless, i.e., no\nside-effect on the enjoyment of protected songs, by refining a psychoacoustic\nmodel-based loss with the backing track as an additional masker, a unique\naccompanying element for singing voices compared to ordinary speech voices. We\nalso adopt a frame-level interaction reduction-based loss and encoder ensemble\nto enhance the transferability of SongBsAb to unknown SVC models. 
We\ndemonstrate the prevention effectiveness, harmlessness, and robustness of\nSongBsAb on five diverse and promising SVC models, using both English and\nChinese datasets, and both objective and human study-based subjective metrics.\nOur work fosters an emerging research direction for mitigating illegal\nautomated song covers.\n","authors":["Guangke Chen","Yedi Zhang","Fu Song","Ting Wang","Xiaoning Du","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2401.17133v2.pdf","comment":"In Proceedings of the 32nd Network and Distributed System Security\n (NDSS) Symposium 2025"}],"Genomics":[{"id":"http://arxiv.org/abs/2412.00651v1","updated":"2024-12-01T03:09:52Z","published":"2024-12-01T03:09:52Z","title":"Towards Unified Molecule-Enhanced Pathology Image Representation\n Learning via Integrating Spatial Transcriptomics","summary":" Recent advancements in multimodal pre-training models have significantly\nadvanced computational pathology. However, current approaches predominantly\nrely on visual-language models, which may impose limitations from a molecular\nperspective and lead to performance bottlenecks. Here, we introduce a Unified\nMolecule-enhanced Pathology Image REpresentationn Learning framework (UMPIRE).\nUMPIRE aims to leverage complementary information from gene expression profiles\nto guide the multimodal pre-training, enhancing the molecular awareness of\npathology image representation learning. We demonstrate that this molecular\nperspective provides a robust, task-agnostic training signal for learning\npathology image embeddings. Due to the scarcity of paired data, approximately 4\nmillion entries of spatial transcriptomics gene expression were collected to\ntrain the gene encoder. By leveraging powerful pre-trained encoders, UMPIRE\naligns the encoders across over 697K pathology image-gene expression pairs. 
The\nperformance of UMPIRE is demonstrated across various molecular-related\ndownstream tasks, including gene expression prediction, spot classification,\nand mutation state prediction in whole slide images. Our findings highlight the\neffectiveness of multimodal data integration and open new avenues for exploring\ncomputational pathology enhanced by molecular perspectives. The code and\npre-trained weights are available at https://github.com/Hanminghao/UMPIRE.\n","authors":["Minghao Han","Dingkang Yang","Jiabei Cheng","Xukun Zhang","Linhao Qu","Zizhi Chen","Lihua Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.00651v1.pdf","comment":"21 pages, 11 figures, 7 tables"}]},"2024-11-30T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2411.17922v2","updated":"2024-11-30T23:10:40Z","published":"2024-11-26T22:31:09Z","title":"Exploring Superpixel Segmentation Methods in the Context of Citizen\n Science and Deforestation Detection","summary":" Tropical forests play an essential role in the planet's ecosystem, making the\nconservation of these biomes a worldwide priority. However, ongoing\ndeforestation and degradation pose a significant threat to their existence,\nnecessitating effective monitoring and the proposal of actions to mitigate the\ndamage caused by these processes. In this regard, initiatives range from\ngovernment and private sector monitoring programs to solutions based on citizen\nscience campaigns, for example. Particularly in the context of citizen science\ncampaigns, the segmentation of remote sensing images to identify deforested\nareas and subsequently submit them to analysis by non-specialized volunteers is\nnecessary. Thus, segmentation using superpixel-based techniques proves to be a\nviable solution for this important task. 
Therefore, this paper presents an\nanalysis of 22 superpixel-based segmentation methods applied to remote sensing\nimages, aiming to identify which of them are more suitable for generating\nsegments for citizen science campaigns. The results reveal that seven of the\nsegmentation methods outperformed the baseline method (SLIC) currently employed\nin the ForestEyes citizen science project, indicating an opportunity for\nimprovement in this important stage of campaign development.\n","authors":["Hugo Resende","Isabela Borlido","Victor Sundermann","Eduardo B. Neto","Silvio Jamil F. Guimarães","Fabio Faria","Alvaro Luiz Fazenda"],"pdf_url":"https://arxiv.org/pdf/2411.17922v2.pdf","comment":"Paper was accepted for presentation at SAC 2025"},{"id":"http://arxiv.org/abs/2411.11922v2","updated":"2024-11-30T22:32:34Z","published":"2024-11-18T05:59:03Z","title":"SAMURAI: Adapting Segment Anything Model for Zero-Shot Visual Tracking\n with Motion-Aware Memory","summary":" The Segment Anything Model 2 (SAM 2) has demonstrated strong performance in\nobject segmentation tasks but faces challenges in visual object tracking,\nparticularly when managing crowded scenes with fast-moving or self-occluding\nobjects. Furthermore, the fixed-window memory approach in the original model\ndoes not consider the quality of memories selected to condition the image\nfeatures for the next frame, leading to error propagation in videos. This paper\nintroduces SAMURAI, an enhanced adaptation of SAM 2 specifically designed for\nvisual object tracking. By incorporating temporal motion cues with the proposed\nmotion-aware memory selection mechanism, SAMURAI effectively predicts object\nmotion and refines mask selection, achieving robust, accurate tracking without\nthe need for retraining or fine-tuning. SAMURAI operates in real-time and\ndemonstrates strong zero-shot performance across diverse benchmark datasets,\nshowcasing its ability to generalize without fine-tuning. 
In evaluations,\nSAMURAI achieves significant improvements in success rate and precision over\nexisting trackers, with a 7.1% AUC gain on LaSOT$_{\\text{ext}}$ and a 3.5% AO\ngain on GOT-10k. Moreover, it achieves competitive results compared to fully\nsupervised methods on LaSOT, underscoring its robustness in complex tracking\nscenarios and its potential for real-world applications in dynamic\nenvironments.\n","authors":["Cheng-Yen Yang","Hsiang-Wei Huang","Wenhao Chai","Zhongyu Jiang","Jenq-Neng Hwang"],"pdf_url":"https://arxiv.org/pdf/2411.11922v2.pdf","comment":"Project page is available at https://yangchris11.github.io/samurai/"},{"id":"http://arxiv.org/abs/2309.00378v5","updated":"2024-11-30T19:08:48Z","published":"2023-09-01T10:27:04Z","title":"Long-Term Ad Memorability: Understanding & Generating Memorable Ads","summary":" Despite the importance of long-term memory in marketing and brand building,\nuntil now, there has been no large-scale study on the memorability of ads. All\nprevious memorability studies have been conducted on short-term recall on\nspecific content types like action videos. On the other hand, long-term\nmemorability is crucial for the advertising industry, and ads are almost always\nhighly multimodal. Therefore, we release the first memorability dataset,\nLAMBDA, consisting of 1749 participants and 2205 ads covering 276 brands.\nRunning statistical tests over different participant subpopulations and ad\ntypes, we find many interesting insights into what makes an ad memorable, e.g.,\nfast-moving ads are more memorable than those with slower scenes; people who\nuse ad-blockers remember a lower number of ads than those who don't. Next, we\npresent a model, Henry, to predict the memorability of a content. Henry\nachieves state-of-the-art performance across all prominent literature\nmemorability datasets. It shows strong generalization performance with better\nresults in 0-shot on unseen datasets. 
Finally, with the intent of memorable ad\ngeneration, we present a scalable method to build a high-quality memorable ad\ngeneration model by leveraging automatically annotated data. Our approach, SEED\n(Self rEwarding mEmorability Modeling), starts with a language model trained on\nLAMBDA as seed data and progressively trains an LLM to generate more memorable\nads. We show that the generated advertisements have 44% higher memorability\nscores than the original ads. We release this large-scale ad dataset,\nUltraLAMBDA, consisting of 5 million ads. Our code and the datasets, LAMBDA and\nUltraLAMBDA, are open-sourced at\nhttps://behavior-in-the-wild.github.io/memorability.\n","authors":["Harini SI","Somesh Singh","Yaman K Singla","Aanisha Bhattacharyya","Veeky Baths","Changyou Chen","Rajiv Ratn Shah","Balaji Krishnamurthy"],"pdf_url":"https://arxiv.org/pdf/2309.00378v5.pdf","comment":"Published in WACV-2025"},{"id":"http://arxiv.org/abs/2408.00672v2","updated":"2024-11-30T18:42:30Z","published":"2024-08-01T16:13:07Z","title":"ExpertAF: Expert Actionable Feedback from Video","summary":" Feedback is essential for learning a new skill or improving one's current\nskill-level. However, current methods for skill-assessment from video only\nprovide scores or compare demonstrations, leaving the burden of knowing what to\ndo differently on the user. We introduce a novel method to generate actionable\nfeedback from video of a person doing a physical activity, such as basketball\nor soccer. Our method takes a video demonstration and its accompanying 3D body\npose and generates (1) free-form expert commentary describing what the person\nis doing well and what they could improve, and (2) a visual expert\ndemonstration that incorporates the required corrections. 
We show how to\nleverage Ego-Exo4D's videos of skilled activity and expert commentary together\nwith a strong language model to create a weakly-supervised training dataset for\nthis task, and we devise a multimodal video-language model to infer coaching\nfeedback. Our method is able to reason across multi-modal input combinations to\noutput full-spectrum, actionable coaching -- expert commentary, expert video\nretrieval, and expert pose generation -- outperforming strong vision-language\nmodels on both established metrics and human preference studies. Code and data\nwill be publicly released.\n","authors":["Kumar Ashutosh","Tushar Nagarajan","Georgios Pavlakos","Kris Kitani","Kristen Grauman"],"pdf_url":"https://arxiv.org/pdf/2408.00672v2.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2408.01372v3","updated":"2024-11-30T13:24:19Z","published":"2024-08-02T16:28:51Z","title":"Spatial and Spatial-Spectral Morphological Mamba for Hyperspectral Image\n Classification","summary":" Recent advancements in transformers, specifically self-attention mechanisms,\nhave significantly improved hyperspectral image (HSI) classification. However,\nthese models often suffer from inefficiencies, as their computational\ncomplexity scales quadratically with sequence length. To address these\nchallenges, we propose the morphological spatial mamba (SMM) and morphological\nspatial-spectral Mamba (SSMM) model (MorpMamba), which combines the strengths\nof morphological operations and the state space model framework, offering a\nmore computationally efficient alternative to transformers. In MorpMamba, a\nnovel token generation module first converts HSI patches into spatial-spectral\ntokens. These tokens are then processed through morphological operations such\nas erosion and dilation, utilizing depthwise separable convolutions to capture\nstructural and shape information. 
A token enhancement module refines these\nfeatures by dynamically adjusting the spatial and spectral tokens based on\ncentral HSI regions, ensuring effective feature fusion within each block.\nSubsequently, multi-head self-attention is applied to further enrich the\nfeature representations, allowing the model to capture complex relationships\nand dependencies within the data. Finally, the enhanced tokens are fed into a\nstate space module, which efficiently models the temporal evolution of the\nfeatures for classification. Experimental results on widely used HSI datasets\ndemonstrate that MorpMamba achieves superior parametric efficiency compared to\ntraditional CNN and transformer models while maintaining high accuracy. The\ncode will be made publicly available at\n\\url{https://github.com/mahmad000/MorpMamba}.\n","authors":["Muhammad Ahmad","Muhammad Hassaan Farooq Butt","Adil Mehmood Khan","Manuel Mazzara","Salvatore Distefano","Muhammad Usama","Swalpa Kumar Roy","Jocelyn Chanussot","Danfeng Hong"],"pdf_url":"https://arxiv.org/pdf/2408.01372v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2106.06769v5","updated":"2024-11-30T13:19:28Z","published":"2021-06-12T13:04:46Z","title":"Cross-Subject Domain Adaptation for Classifying Working Memory Load with\n Multi-Frame EEG Images","summary":" Working memory (WM), denoting the information temporally stored in the mind,\nis a fundamental research topic in the field of human cognition.\nElectroencephalograph (EEG), which can monitor the electrical activity of the\nbrain, has been widely used in measuring the level of WM. However, one of the\ncritical challenges is that individual differences may cause ineffective\nresults, especially when the established model meets an unfamiliar subject. 
In\nthis work, we propose a cross-subject deep adaptation model with spatial\nattention (CS-DASA) to generalize the workload classifications across subjects.\nFirst, we transform EEG time series into multi-frame EEG images incorporating\nspatial, spectral, and temporal information. First, the Subject-Shared module\nin CS-DASA receives multi-frame EEG image data from both source and target\nsubjects and learns the common feature representations. Then, in the\nsubject-specific module, the maximum mean discrepancy is implemented to measure\nthe domain distribution divergence in a reproducing kernel Hilbert space, which\ncan add an effective penalty loss for domain adaptation. Additionally, the\nsubject-to-subject spatial attention mechanism is employed to focus on the\ndiscriminative spatial features from the target image data. Experiments\nconducted on a public WM EEG dataset containing 13 subjects show that the\nproposed model is capable of achieving better performance than existing\nstate-of-the-art methods.\n","authors":["Junfu Chen","Sirui Li","Dechang Pi"],"pdf_url":"https://arxiv.org/pdf/2106.06769v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11512v2","updated":"2024-11-30T12:11:05Z","published":"2024-09-17T19:26:21Z","title":"Good Grasps Only: A data engine for self-supervised fine-tuning of pose\n estimation using grasp poses for verification","summary":" In this paper, we present a novel method for self-supervised fine-tuning of\npose estimation. Leveraging zero-shot pose estimation, our approach enables the\nrobot to automatically obtain training data without manual labeling. After pose\nestimation the object is grasped, and in-hand pose estimation is used for data\nvalidation. Our pipeline allows the system to fine-tune while the process is\nrunning, removing the need for a learning phase. The motivation behind our work\nlies in the need for rapid setup of pose estimation solutions. 
Specifically, we\naddress the challenging task of bin picking, which plays a pivotal role in\nflexible robotic setups. Our method is implemented on a robotics work-cell, and\ntested with four different objects. For all objects, our method increases the\nperformance and outperforms a state-of-the-art method trained on the CAD model\nof the objects.\n","authors":["Frederik Hagelskjær"],"pdf_url":"https://arxiv.org/pdf/2409.11512v2.pdf","comment":"8 pages, 7 figures, 3 tables"},{"id":"http://arxiv.org/abs/2411.17106v2","updated":"2024-11-30T11:59:54Z","published":"2024-11-26T04:49:42Z","title":"PassionSR: Post-Training Quantization with Adaptive Scale in One-Step\n Diffusion based Image Super-Resolution","summary":" Diffusion-based image super-resolution (SR) models have shown superior\nperformance at the cost of multiple denoising steps. However, even though the\ndenoising step has been reduced to one, they require high computational costs\nand storage requirements, making it difficult for deployment on hardware\ndevices. To address these issues, we propose a novel post-training quantization\napproach with adaptive scale in one-step diffusion (OSD) image SR, PassionSR.\nFirst, we simplify OSD model to two core components, UNet and Variational\nAutoencoder (VAE) by removing the CLIPEncoder. Secondly, we propose Learnable\nBoundary Quantizer (LBQ) and Learnable Equivalent Transformation (LET) to\noptimize the quantization process and manipulate activation distributions for\nbetter quantization. Finally, we design a Distributed Quantization Calibration\n(DQC) strategy that stabilizes the training of quantized parameters for rapid\nconvergence. Comprehensive experiments demonstrate that PassionSR with 8-bit\nand 6-bit obtains comparable visual results with full-precision model.\nMoreover, our PassionSR achieves significant advantages over recent leading\nlow-bit quantization methods for image SR. 
Our code will be at\nhttps://github.com/libozhu03/PassionSR.\n","authors":["Libo Zhu","Jianze Li","Haotong Qin","Wenbo Li","Yulun Zhang","Yong Guo","Xiaokang Yang"],"pdf_url":"https://arxiv.org/pdf/2411.17106v2.pdf","comment":"https://github.com/libozhu03/PassionSR"},{"id":"http://arxiv.org/abs/2404.05268v3","updated":"2024-11-30T11:55:19Z","published":"2024-04-08T07:59:04Z","title":"MC$^2$: Multi-concept Guidance for Customized Multi-concept Generation","summary":" Customized text-to-image generation, which synthesizes images based on\nuser-specified concepts, has made significant progress in handling individual\nconcepts. However, when extended to multiple concepts, existing methods often\nstruggle with properly integrating different models and avoiding the unintended\nblending of characteristics from distinct concepts. In this paper, we propose\nMC$^2$, a novel approach for multi-concept customization that enhances\nflexibility and fidelity through inference-time optimization. MC$^2$ enables\nthe integration of multiple single-concept models with heterogeneous\narchitectures. By adaptively refining attention weights between visual and\ntextual tokens, our method ensures that image regions accurately correspond to\ntheir associated concepts while minimizing interference between concepts.\nExtensive experiments demonstrate that MC$^2$ outperforms training-based\nmethods in terms of prompt-reference alignment. Furthermore, MC$^2$ can be\nseamlessly applied to text-to-image generation, providing robust compositional\ncapabilities. To facilitate the evaluation of multi-concept customization, we\nalso introduce a new benchmark, MC++. 
The code will be publicly available at\nhttps://github.com/JIANGJiaXiu/MC-2.\n","authors":["Jiaxiu Jiang","Yabo Zhang","Kailai Feng","Xiaohe Wu","Wenbo Li","Renjing Pei","Fan Li","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2404.05268v3.pdf","comment":"14 pages, 14 figures"},{"id":"http://arxiv.org/abs/2408.17065v2","updated":"2024-11-30T10:48:35Z","published":"2024-08-30T07:49:57Z","title":"Generalizing Deepfake Video Detection with Plug-and-Play: Video-Level\n Blending and Spatiotemporal Adapter Tuning","summary":" Three key challenges hinder the development of current deepfake video\ndetection: (1) Temporal features can be complex and diverse: how can we\nidentify general temporal artifacts to enhance model generalization? (2)\nSpatiotemporal models often lean heavily on one type of artifact and ignore the\nother: how can we ensure balanced learning from both? (3) Videos are naturally\nresource-intensive: how can we tackle efficiency without compromising accuracy?\nThis paper attempts to tackle the three challenges jointly. First, inspired by\nthe notable generality of using image-level blending data for image forgery\ndetection, we investigate whether and how video-level blending can be effective\nin video. We then perform a thorough analysis and identify a previously\nunderexplored temporal forgery artifact: Facial Feature Drift (FFD), which\ncommonly exists across different forgeries. To reproduce FFD, we then propose a\nnovel Video-level Blending data (VB), where VB is implemented by blending the\noriginal image and its warped version frame-by-frame, serving as a hard\nnegative sample to mine more general artifacts. Second, we carefully design a\nlightweight Spatiotemporal Adapter (StA) to equip a pretrained image model\n(both ViTs and CNNs) with the ability to capture both spatial and temporal\nfeatures jointly and efficiently. 
StA is designed with two-stream 3D-Conv with\nvarying kernel sizes, allowing it to process spatial and temporal features\nseparately. Extensive experiments validate the effectiveness of the proposed\nmethods; and show our approach can generalize well to previously unseen forgery\nvideos, even the latest generation methods.\n","authors":["Zhiyuan Yan","Yandan Zhao","Shen Chen","Mingyi Guo","Xinghe Fu","Taiping Yao","Shouhong Ding","Li Yuan"],"pdf_url":"https://arxiv.org/pdf/2408.17065v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.08074v3","updated":"2024-11-30T10:48:21Z","published":"2024-06-12T10:48:53Z","title":"A Concept-Based Explainability Framework for Large Multimodal Models","summary":" Large multimodal models (LMMs) combine unimodal encoders and large language\nmodels (LLMs) to perform multimodal tasks. Despite recent advancements towards\nthe interpretability of these models, understanding internal representations of\nLMMs remains largely a mystery. In this paper, we present a novel framework for\nthe interpretation of LMMs. We propose a dictionary learning based approach,\napplied to the representation of tokens. The elements of the learned dictionary\ncorrespond to our proposed concepts. We show that these concepts are well\nsemantically grounded in both vision and text. Thus we refer to these as\n``multi-modal concepts''. We qualitatively and quantitatively evaluate the\nresults of the learnt concepts. We show that the extracted multimodal concepts\nare useful to interpret representations of test samples. Finally, we evaluate\nthe disentanglement between different concepts and the quality of grounding\nconcepts visually and textually. 
Our code is publicly available at\nhttps://github.com/mshukor/xl-vlms\n","authors":["Jayneel Parekh","Pegah Khayatan","Mustafa Shukor","Alasdair Newson","Matthieu Cord"],"pdf_url":"https://arxiv.org/pdf/2406.08074v3.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2408.16767v2","updated":"2024-11-30T09:10:08Z","published":"2024-08-29T17:59:40Z","title":"ReconX: Reconstruct Any Scene from Sparse Views with Video Diffusion\n Model","summary":" Advancements in 3D scene reconstruction have transformed 2D images from the\nreal world into 3D models, producing realistic 3D results from hundreds of\ninput photos. Despite great success in dense-view reconstruction scenarios,\nrendering a detailed scene from insufficient captured views is still an\nill-posed optimization problem, often resulting in artifacts and distortions in\nunseen areas. In this paper, we propose ReconX, a novel 3D scene reconstruction\nparadigm that reframes the ambiguous reconstruction challenge as a temporal\ngeneration task. The key insight is to unleash the strong generative prior of\nlarge pre-trained video diffusion models for sparse-view reconstruction.\nHowever, 3D view consistency struggles to be accurately preserved in directly\ngenerated video frames from pre-trained models. To address this, given limited\ninput views, the proposed ReconX first constructs a global point cloud and\nencodes it into a contextual space as the 3D structure condition. Guided by the\ncondition, the video diffusion model then synthesizes video frames that are\nboth detail-preserved and exhibit a high degree of 3D consistency, ensuring the\ncoherence of the scene from various perspectives. Finally, we recover the 3D\nscene from the generated video through a confidence-aware 3D Gaussian Splatting\noptimization scheme. 
Extensive experiments on various real-world datasets show\nthe superiority of our ReconX over state-of-the-art methods in terms of quality\nand generalizability.\n","authors":["Fangfu Liu","Wenqiang Sun","Hanyang Wang","Yikai Wang","Haowen Sun","Junliang Ye","Jun Zhang","Yueqi Duan"],"pdf_url":"https://arxiv.org/pdf/2408.16767v2.pdf","comment":"Project page: https://liuff19.github.io/ReconX"},{"id":"http://arxiv.org/abs/2411.16727v2","updated":"2024-11-30T09:06:45Z","published":"2024-11-23T05:19:27Z","title":"An Information-Theoretic Regularizer for Lossy Neural Image Compression","summary":" Lossy image compression networks aim to minimize the latent entropy of images\nwhile adhering to specific distortion constraints. However, optimizing the\nneural network can be challenging due to its nature of learning quantized\nlatent representations. In this paper, our key finding is that minimizing the\nlatent entropy is, to some extent, equivalent to maximizing the conditional\nsource entropy, an insight that is deeply rooted in information-theoretic\nequalities. Building on this insight, we propose a novel structural\nregularization method for the neural image compression task by incorporating\nthe negative conditional source entropy into the training objective, such that\nboth the optimization efficacy and the model's generalization ability can be\npromoted. The proposed information-theoretic regularizer is interpretable,\nplug-and-play, and imposes no inference overheads. 
Extensive experiments\ndemonstrate its superiority in regularizing the models and further squeezing\nbits from the latent representation across various compression structures and\nunseen domains.\n","authors":["Yingwen Zhang","Meng Wang","Xihua Sheng","Peilin Chen","Junru Li","Li Zhang","Shiqi Wang"],"pdf_url":"https://arxiv.org/pdf/2411.16727v2.pdf","comment":"12 pages, 8 figures"},{"id":"http://arxiv.org/abs/2411.14811v2","updated":"2024-11-30T08:47:23Z","published":"2024-11-22T09:12:02Z","title":"Fine-Grained Alignment in Vision-and-Language Navigation through\n Bayesian Optimization","summary":" This paper addresses the challenge of fine-grained alignment in\nVision-and-Language Navigation (VLN) tasks, where robots navigate realistic 3D\nenvironments based on natural language instructions. Current approaches use\ncontrastive learning to align language with visual trajectory sequences.\nNevertheless, they encounter difficulties with fine-grained vision negatives.\nTo enhance cross-modal embeddings, we introduce a novel Bayesian\nOptimization-based adversarial optimization framework for creating fine-grained\ncontrastive vision samples. To validate the proposed methodology, we conduct a\nseries of experiments to assess the effectiveness of the enriched embeddings on\nfine-grained vision negatives. We conduct experiments on two common VLN\nbenchmarks R2R and REVERIE, experiments on the them demonstrate that these\nembeddings benefit navigation, and can lead to a promising performance\nenhancement. 
Our source code and trained models are available at:\nhttps://anonymous.4open.science/r/FGVLN.\n","authors":["Yuhang Song","Mario Gianni","Chenguang Yang","Kunyang Lin","Te-Chuan Chiu","Anh Nguyen","Chun-Yi Lee"],"pdf_url":"https://arxiv.org/pdf/2411.14811v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.18281v2","updated":"2024-11-30T07:34:24Z","published":"2024-11-27T12:15:52Z","title":"MotionCharacter: Identity-Preserving and Motion Controllable Human Video\n Generation","summary":" Recent advancements in personalized Text-to-Video (T2V) generation highlight\nthe importance of integrating character-specific identities and actions.\nHowever, previous T2V models struggle with identity consistency and\ncontrollable motion dynamics, mainly due to limited fine-grained facial and\naction-based textual prompts, and datasets that overlook key human attributes\nand actions. To address these challenges, we propose MotionCharacter, an\nefficient and high-fidelity human video generation framework designed for\nidentity preservation and fine-grained motion control. We introduce an\nID-preserving module to maintain identity fidelity while allowing flexible\nattribute modifications, and further integrate ID-consistency and region-aware\nloss mechanisms, significantly enhancing identity consistency and detail\nfidelity. Additionally, our approach incorporates a motion control module that\nprioritizes action-related text while maintaining subject consistency, along\nwith a dataset, Human-Motion, which utilizes large language models to generate\ndetailed motion descriptions. For simplify user control during inference, we\nparameterize motion intensity through a single coefficient, allowing for easy\nadjustments. 
Extensive experiments highlight the effectiveness of\nMotionCharacter, demonstrating significant improvements in ID-preserving,\nhigh-quality video generation.\n","authors":["Haopeng Fang","Di Qiu","Binjie Mao","Pengfei Yan","He Tang"],"pdf_url":"https://arxiv.org/pdf/2411.18281v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08857v2","updated":"2024-11-30T07:22:57Z","published":"2024-09-13T14:19:27Z","title":"InstantDrag: Improving Interactivity in Drag-based Image Editing","summary":" Drag-based image editing has recently gained popularity for its interactivity\nand precision. However, despite the ability of text-to-image models to generate\nsamples within a second, drag editing still lags behind due to the challenge of\naccurately reflecting user interaction while maintaining image content. Some\nexisting approaches rely on computationally intensive per-image optimization or\nintricate guidance-based methods, requiring additional inputs such as masks for\nmovable regions and text prompts, thereby compromising the interactivity of the\nediting process. We introduce InstantDrag, an optimization-free pipeline that\nenhances interactivity and speed, requiring only an image and a drag\ninstruction as input. InstantDrag consists of two carefully designed networks:\na drag-conditioned optical flow generator (FlowGen) and an optical\nflow-conditioned diffusion model (FlowDiffusion). InstantDrag learns motion\ndynamics for drag-based image editing in real-world video datasets by\ndecomposing the task into motion generation and motion-conditioned image\ngeneration. We demonstrate InstantDrag's capability to perform fast,\nphoto-realistic edits without masks or text prompts through experiments on\nfacial video datasets and general scenes. 
These results highlight the\nefficiency of our approach in handling drag-based image editing, making it a\npromising solution for interactive, real-time applications.\n","authors":["Joonghyuk Shin","Daehyeon Choi","Jaesik Park"],"pdf_url":"https://arxiv.org/pdf/2409.08857v2.pdf","comment":"SIGGRAPH Asia 2024. Project webpage:\n https://joonghyuk.com/instantdrag-web/"},{"id":"http://arxiv.org/abs/2405.16116v2","updated":"2024-11-30T07:19:56Z","published":"2024-05-25T08:06:12Z","title":"REACT: Real-time Efficiency and Accuracy Compromise for Tradeoffs in\n Scene Graph Generation","summary":" Scene Graph Generation (SGG) is a task that encodes visual relationships\nbetween objects in images as graph structures. SGG shows significant promise as\na foundational component for downstream tasks, such as reasoning for embodied\nagents. To enable real-time applications, SGG must address the trade-off\nbetween performance and inference speed. However, current methods tend to focus\non one of the following: (1) improving relation prediction accuracy, (2)\nenhancing object detection accuracy, or (3) reducing latency, without aiming to\nbalance all three objectives simultaneously. To address this limitation, we\npropose a novel architecture, inference method, and relation prediction model.\nOur proposed solution, the REACT model, achieves the highest inference speed\namong existing SGG models, improving object detection accuracy without\nsacrificing relation prediction performance. Compared to state-of-the-art\napproaches, REACT is 2.7 times faster (with a latency of 23 ms) and improves\nobject detection accuracy by 58.51%. Furthermore, our proposal significantly\nreduces model size, with an average of 5.5x fewer parameters. Code is available\nat https://github.com/Maelic/SGG-Benchmark\n","authors":["Maëlic Neau","Paulo E. 
Santos","Anne-Gwenn Bosser","Cédric Buche"],"pdf_url":"https://arxiv.org/pdf/2405.16116v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.14332v2","updated":"2024-11-30T05:56:52Z","published":"2024-10-18T09:44:25Z","title":"Croc: Pretraining Large Multimodal Models with Cross-Modal Comprehension","summary":" Recent advances in Large Language Models (LLMs) have catalyzed the\ndevelopment of Large Multimodal Models (LMMs). However, existing research\nprimarily focuses on tuning language and image instructions, ignoring the\ncritical pretraining phase where models learn to process textual and visual\nmodalities jointly. In this paper, we propose a new pretraining paradigm for\nLMMs to enhance the visual comprehension capabilities of LLMs by introducing a\nnovel cross-modal comprehension stage. Specifically, we design a dynamically\nlearnable prompt token pool and employ the Hungarian algorithm to replace part\nof the original visual tokens with the most relevant prompt tokens. Then, we\nconceptualize visual tokens as analogous to a \"foreign language\" for the LLMs\nand propose a mixed attention mechanism with bidirectional visual attention and\nunidirectional textual attention to comprehensively enhance the understanding\nof visual tokens. Meanwhile, we integrate a detailed caption generation task,\nleveraging rich descriptions to further facilitate LLMs in understanding visual\nsemantic information. After pretraining on 1.5 million publicly accessible\ndata, we present a new foundation model called Croc. Experimental results\ndemonstrate that Croc achieves new state-of-the-art performance on massive\nvision-language benchmarks. 
To support reproducibility and facilitate further\nresearch, we release the training code and pre-trained model weights at\nhttps://github.com/deepglint/Croc.\n","authors":["Yin Xie","Kaicheng Yang","Ninghua Yang","Weimo Deng","Xiangzi Dai","Tiancheng Gu","Yumeng Wang","Xiang An","Yongle Zhao","Ziyong Feng","Jiankang Deng"],"pdf_url":"https://arxiv.org/pdf/2410.14332v2.pdf","comment":"18 pages, 11 figures"},{"id":"http://arxiv.org/abs/2410.06169v3","updated":"2024-11-30T05:32:51Z","published":"2024-10-08T16:13:24Z","title":"Treat Visual Tokens as Text? But Your MLLM Only Needs Fewer Efforts to\n See","summary":" By treating visual tokens from visual encoders as text tokens, Multimodal\nLarge Language Models (MLLMs) have achieved remarkable progress across diverse\nvisual understanding tasks, leveraging the robust architectures of Large\nLanguage Models (LLMs). However, as token counts grow, the quadratic scaling of\ncomputation in LLMs introduces a significant efficiency bottleneck, impeding\nfurther scalability. Although recent approaches have explored pruning visual\ntokens or employing lighter LLM architectures, the computational overhead from\nan increasing number of visual tokens remains a substantial challenge.\n In this study, we investigate the redundancy in visual computation at both\nthe parameter and computational pattern levels within LLaVA, a representative\nMLLM, and introduce a suite of streamlined strategies to enhance efficiency.\nThese include neighbor-aware visual token attention, pruning of inactive visual\nattention heads, and selective layer dropping for visual computations. By\nimplementing these strategies in LLaVA, we achieve a reduction in computational\ndemands of 88% while maintaining model performance across key benchmarks.\nAdditionally, we validate the existence of visual computational redundancy in\nother MLLMs, such as Qwen2-VL-7B and InternVL-2.0-4B/8B/26B. 
These results\npresent a novel pathway for MLLMs to handle dense visual tokens with minimal\ncomputational costs. Code and model checkpoints will be released to support\nfurther research.\n","authors":["Zeliang Zhang","Phu Pham","Wentian Zhao","Kun Wan","Yu-Jhe Li","Jianing Zhou","Daniel Miranda","Ajinkya Kale","Chenliang Xu"],"pdf_url":"https://arxiv.org/pdf/2410.06169v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.17765v2","updated":"2024-11-30T04:50:36Z","published":"2024-11-26T04:21:22Z","title":"I2VControl: Disentangled and Unified Video Motion Synthesis Control","summary":" Video synthesis techniques are undergoing rapid progress, with\ncontrollability being a significant aspect of practical usability for\nend-users. Although text condition is an effective way to guide video\nsynthesis, capturing the correct joint distribution between text descriptions\nand video motion remains a substantial challenge. In this paper, we present a\ndisentangled and unified framework, namely I2VControl, that unifies multiple\nmotion control tasks in image-to-video synthesis. Our approach partitions the\nvideo into individual motion units and represents each unit with disentangled\ncontrol signals, which allows for various control types to be flexibly combined\nwithin our single system. Furthermore, our methodology seamlessly integrates as\na plug-in for pre-trained models and remains agnostic to specific model\narchitectures. We conduct extensive experiments, achieving excellent\nperformance on various control tasks, and our method further facilitates\nuser-driven creative combinations, enhancing innovation and creativity. 
The\nproject page is: https://wanquanf.github.io/I2VControl .\n","authors":["Wanquan Feng","Tianhao Qi","Jiawei Liu","Mingzhen Sun","Pengqi Tu","Tianxiang Ma","Fei Dai","Songtao Zhao","Siyu Zhou","Qian He"],"pdf_url":"https://arxiv.org/pdf/2411.17765v2.pdf","comment":"Project page: https://wanquanf.github.io/I2VControl"},{"id":"http://arxiv.org/abs/2402.00672v3","updated":"2024-11-30T04:47:10Z","published":"2024-02-01T15:33:17Z","title":"Exploring Homogeneous and Heterogeneous Consistent Label Associations\n for Unsupervised Visible-Infrared Person ReID","summary":" Unsupervised visible-infrared person re-identification (USL-VI-ReID)\nendeavors to retrieve pedestrian images of the same identity from different\nmodalities without annotations. While prior work focuses on establishing\ncross-modality pseudo-label associations to bridge the modality-gap, they\nignore maintaining the instance-level homogeneous and heterogeneous consistency\nbetween the feature space and the pseudo-label space, resulting in coarse\nassociations. In response, we introduce a Modality-Unified Label Transfer\n(MULT) module that simultaneously accounts for both homogeneous and\nheterogeneous fine-grained instance-level structures, yielding high-quality\ncross-modality label associations. It models both homogeneous and heterogeneous\naffinities, leveraging them to quantify the inconsistency between the\npseudo-label space and the feature space, subsequently minimizing it. The\nproposed MULT ensures that the generated pseudo-labels maintain alignment\nacross modalities while upholding structural consistency within intra-modality.\nAdditionally, a straightforward plug-and-play Online Cross-memory Label\nRefinement (OCLR) module is proposed to further mitigate the side effects of\nnoisy pseudo-labels while simultaneously aligning different modalities, coupled\nwith an Alternative Modality-Invariant Representation Learning (AMIRL)\nframework. 
Experiments demonstrate that our proposed method outperforms\nexisting state-of-the-art USL-VI-ReID methods, highlighting the superiority of\nour MULT in comparison to other cross-modality association methods. Code is\navailable at https://github.com/FranklinLingfeng/code_for_MULT.\n","authors":["Lingfeng He","De Cheng","Nannan Wang","Xinbo Gao"],"pdf_url":"https://arxiv.org/pdf/2402.00672v3.pdf","comment":"Accepted by IJCV2024"},{"id":"http://arxiv.org/abs/2411.13615v3","updated":"2024-11-30T03:12:00Z","published":"2024-11-20T08:09:35Z","title":"A Deep Learning Approach to Predict the Fall [of Price] of\n Cryptocurrency Long Before its Actual Fall","summary":" In modern times, the cryptocurrency market is one of the world's most rapidly\nrising financial markets. The cryptocurrency market is regarded to be more\nvolatile and illiquid than traditional markets such as equities, foreign\nexchange, and commodities. The risk of this market creates an uncertain\ncondition among the investors. The purpose of this research is to predict the\nmagnitude of the risk factor of the cryptocurrency market. Risk factor is also\ncalled volatility. Our approach will assist people who invest in the\ncryptocurrency market by overcoming the problems and difficulties they\nexperience. Our approach starts with calculating the risk factor of the\ncryptocurrency market from the existing parameters. In twenty elements of the\ncryptocurrency market, the risk factor has been predicted using different\nmachine learning algorithms such as CNN, LSTM, BiLSTM, and GRU. All of the\nmodels have been applied to the calculated risk factor parameter. A new model\nhas been developed to predict better than the existing models. Our proposed\nmodel gives the highest RMSE value of 1.3229 and the lowest RMSE value of\n0.0089. Following our model, it will be easier for investors to trade in\ncomplicated and challenging financial assets like bitcoin, Ethereum, dogecoin,\netc. 
Where the other existing models, the highest RMSE was 14.5092, and the\nlower was 0.02769. So, the proposed model performs much better than models with\nproper generalization. Using our approach, it will be easier for investors to\ntrade in complicated and challenging financial assets like Bitcoin, Ethereum,\nand Dogecoin.\n","authors":["Anika Tahsin Meem"],"pdf_url":"https://arxiv.org/pdf/2411.13615v3.pdf","comment":"I am writing to formally request the withdrawal, which is necessary\n due to issues with the author list and the need for improvements to the\n manuscript. We apologize for any inconvenience caused by this request and\n appreciate your understanding"},{"id":"http://arxiv.org/abs/2410.05651v2","updated":"2024-11-30T02:13:46Z","published":"2024-10-08T03:01:54Z","title":"ViBiDSampler: Enhancing Video Interpolation Using Bidirectional\n Diffusion Sampler","summary":" Recent progress in large-scale text-to-video (T2V) and image-to-video (I2V)\ndiffusion models has greatly enhanced video generation, especially in terms of\nkeyframe interpolation. However, current image-to-video diffusion models, while\npowerful in generating videos from a single conditioning frame, need adaptation\nfor two-frame (start & end) conditioned generation, which is essential for\neffective bounded interpolation. Unfortunately, existing approaches that fuse\ntemporally forward and backward paths in parallel often suffer from\noff-manifold issues, leading to artifacts or requiring multiple iterative\nre-noising steps. In this work, we introduce a novel, bidirectional sampling\nstrategy to address these off-manifold issues without requiring extensive\nre-noising or fine-tuning. Our method employs sequential sampling along both\nforward and backward paths, conditioned on the start and end frames,\nrespectively, ensuring more coherent and on-manifold generation of intermediate\nframes. 
Additionally, we incorporate advanced guidance techniques, CFG++ and\nDDS, to further enhance the interpolation process. By integrating these, our\nmethod achieves state-of-the-art performance, efficiently generating\nhigh-quality, smooth videos between keyframes. On a single 3090 GPU, our method\ncan interpolate 25 frames at 1024 x 576 resolution in just 195 seconds,\nestablishing it as a leading solution for keyframe interpolation.\n","authors":["Serin Yang","Taesung Kwon","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2410.05651v2.pdf","comment":"Project page: https://vibidsampler.github.io/"},{"id":"http://arxiv.org/abs/2409.02574v2","updated":"2024-11-30T01:42:25Z","published":"2024-09-04T09:48:27Z","title":"Solving Video Inverse Problems Using Image Diffusion Models","summary":" Recently, diffusion model-based inverse problem solvers (DIS) have emerged as\nstate-of-the-art approaches for addressing inverse problems, including image\nsuper-resolution, deblurring, inpainting, etc. However, their application to\nvideo inverse problems arising from spatio-temporal degradation remains largely\nunexplored due to the challenges in training video diffusion models. To address\nthis issue, here we introduce an innovative video inverse solver that leverages\nonly image diffusion models. Specifically, by drawing inspiration from the\nsuccess of the recent decomposed diffusion sampler (DDS), our method treats the\ntime dimension of a video as the batch dimension of image diffusion models and\nsolves spatio-temporal optimization problems within denoised spatio-temporal\nbatches derived from each image diffusion model. Moreover, we introduce a\nbatch-consistent diffusion sampling strategy that encourages consistency across\nbatches by synchronizing the stochastic noise components in image diffusion\nmodels. 
Our approach synergistically combines batch-consistent sampling with\nsimultaneous optimization of denoised spatio-temporal batches at each reverse\ndiffusion step, resulting in a novel and efficient diffusion sampling strategy\nfor video inverse problems. Experimental results demonstrate that our method\neffectively addresses various spatio-temporal degradations in video inverse\nproblems, achieving state-of-the-art reconstructions. Project page:\nhttps://svi-diffusion.github.io\n","authors":["Taesung Kwon","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2409.02574v2.pdf","comment":"22 pages, 16 figures"}],"Artificial Intelligence":[{"id":"http://arxiv.org/abs/2411.13323v2","updated":"2024-11-30T23:44:43Z","published":"2024-11-20T13:46:04Z","title":"Are Large Language Models Memorizing Bug Benchmarks?","summary":" Large Language Models (LLMs) have become integral to various software\nengineering tasks, including code generation, bug detection, and repair. To\nevaluate model performance in these domains, numerous bug benchmarks containing\nreal-world bugs from software projects have been developed. However, a growing\nconcern within the software engineering community is that these benchmarks may\nnot reliably reflect true LLM performance due to the risk of data leakage.\nDespite this concern, limited research has been conducted to quantify the\nimpact of potential leakage. In this paper, we systematically evaluate popular\nLLMs to assess their susceptibility to data leakage from widely used bug\nbenchmarks. To identify potential leakage, we use multiple metrics, including a\nstudy of benchmark membership within commonly used training datasets, as well\nas analyses of negative log-likelihood and n-gram accuracy. 
Our findings show\nthat certain models, in particular codegen-multi, exhibit significant evidence\nof memorization in widely used benchmarks like Defects4J, while newer models\ntrained on larger datasets like LLaMa 3.1 exhibit limited signs of leakage.\nThese results highlight the need for careful benchmark selection and the\nadoption of robust metrics to adequately assess models capabilities.\n","authors":["Daniel Ramos","Claudia Mamede","Kush Jain","Paulo Canelas","Catarina Gamboa","Claire Le Goues"],"pdf_url":"https://arxiv.org/pdf/2411.13323v2.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2406.05804v6","updated":"2024-11-30T22:38:57Z","published":"2024-06-09T14:42:55Z","title":"A Review of Prominent Paradigms for LLM-Based Agents: Tool Use\n (Including RAG), Planning, and Feedback Learning","summary":" Tool use, planning, and feedback learning are currently three prominent\nparadigms for developing Large Language Model (LLM)-based agents across various\ntasks. Although numerous frameworks have been devised for each paradigm, their\nintricate workflows and inconsistent taxonomy create challenges in\nunderstanding and reviewing the frameworks across different paradigms. This\nsurvey introduces a unified taxonomy to systematically review and discuss these\nframeworks. Specifically, 1) the taxonomy defines environments/tasks, common\nLLM-profiled roles or LMPRs (policy models, evaluators, and dynamic models),\nand universally applicable workflows found in prior work, and 2) it enables a\ncomparison of key perspectives on the implementations of LMPRs and workflow\ndesigns across different agent paradigms and frameworks. 3) Finally, we\nidentify three limitations in existing workflow designs and systematically\ndiscuss the future work. 
Resources have been made publicly available at in our\nGitHub repository https://github.com/xinzhel/LLM-Agent-Survey.\n","authors":["Xinzhe Li"],"pdf_url":"https://arxiv.org/pdf/2406.05804v6.pdf","comment":"CoLing 2025 Camera Ready (extended to 9 pages)"},{"id":"http://arxiv.org/abs/2409.14165v3","updated":"2024-11-30T22:21:30Z","published":"2024-09-21T15:07:37Z","title":"A Survey on Large Language Model-empowered Autonomous Driving","summary":" Artificial intelligence (AI) plays a crucial role in autonomous driving (AD)\nresearch, propelling its development towards intelligence and efficiency.\nCurrently, the development of AD technology follows two main technical paths:\nmodularization and end-to-end. Modularization decompose the driving task into\nmodules such as perception, prediction, planning, and control, and train them\nseparately. Due to the inconsistency of training objectives between modules,\nthe integrated effect suffers from bias. End-to-end attempts to address this\nissue by utilizing a single model that directly maps from sensor data to\ncontrol signals. This path has limited learning capabilities in a comprehensive\nset of features and struggles to handle unpredictable long-tail events and\ncomplex urban traffic scenarios. In the face of challenges encountered in both\npaths, many researchers believe that large language models (LLMs) with powerful\nreasoning capabilities and extensive knowledge understanding may be the\nsolution, expecting LLMs to provide AD systems with deeper levels of\nunderstanding and decision-making capabilities. In light of the challenges\nfaced by both paths, many researchers believe that LLMs, with their powerful\nreasoning abilities and extensive knowledge, could offer a solution. 
To\nunderstand if LLMs could enhance AD, this paper conducts a thorough analysis of\nthe potential applications of LLMs in AD systems, including exploring their\noptimization strategies in both modular and end-to-end approaches, with a\nparticular focus on how LLMs can tackle the problems and challenges present in\ncurrent solutions. Furthermore, we discuss an important question: Can LLM-based\nartificial general intelligence (AGI) be a key to achieve high-level AD? We\nfurther analyze the potential limitations and challenges that LLMs may\nencounter in promoting the development of AD technology.\n","authors":["Yuxuan Zhu","Shiyi Wang","Wenqing Zhong","Nianchen Shen","Yunqi Li","Siqi Wang","Zhiheng Li","Cathy Wu","Zhengbing He","Li Li"],"pdf_url":"https://arxiv.org/pdf/2409.14165v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11796v3","updated":"2024-11-30T22:01:07Z","published":"2024-08-21T17:38:48Z","title":"LLM Pruning and Distillation in Practice: The Minitron Approach","summary":" We present a comprehensive report on compressing the Llama 3.1 8B and Mistral\nNeMo 12B models to 4B and 8B parameters, respectively, using pruning and\ndistillation. We explore two distinct pruning strategies: (1) depth pruning and\n(2) joint hidden/attention/MLP (width) pruning, and evaluate the results on\ncommon benchmarks from the LM Evaluation Harness. The models are then aligned\nwith NeMo Aligner and tested in instruct-tuned versions. This approach produces\na compelling 4B model from Llama 3.1 8B and a state-of-the-art\nMistral-NeMo-Minitron-8B (MN-Minitron-8B for brevity) model from Mistral NeMo\n12B. We found that with no access to the original data, it is beneficial to\nslightly fine-tune teacher models on the distillation dataset. 
We open-source\nour base model weights on Hugging Face with a permissive license.\n","authors":["Sharath Turuvekere Sreenivas","Saurav Muralidharan","Raviraj Joshi","Marcin Chochowski","Mostofa Patwary","Pavlo Molchanov","Mohammad Shoeybi","Jan Kautz","Ameya Sunil Mahabaleshwarkar","Gerald Shen","Jiaqi Zeng","Oleksii Kuchaiev","Zijia Chen","Yoshi Suhara","Shizhe Diao","Chenhan Yu","Wei-Chun Chen","Hayley Ross","Daniel Korzekwa","Oluwatobi Olabiyi","Ashwath Aithal","Bryan Catanzaro"],"pdf_url":"https://arxiv.org/pdf/2408.11796v3.pdf","comment":"v3: Update author list, other changes"},{"id":"http://arxiv.org/abs/2406.14949v2","updated":"2024-11-30T20:40:20Z","published":"2024-06-21T08:02:25Z","title":"CEASEFIRE: An AI-powered system for combatting illicit firearms\n trafficking","summary":" Modern technologies have led illicit firearms trafficking to partially merge\nwith cybercrime, while simultaneously permitting its off-line aspects to become\nmore sophisticated. Law enforcement officers face difficult challenges that\nrequire hi-tech solutions. This article presents a real-world system, powered\nby advanced Artificial Intelligence, for facilitating them in their everyday\nwork.\n","authors":["Jorgen Cani","Ioannis Mademlis","Marina Mancuso","Caterina Paternoster","Emmanouil Adamakis","George Margetis","Sylvie Chambon","Alain Crouzil","Loubna Lechelek","Georgia Dede","Spyridon Evangelatos","George Lalas","Franck Mignet","Pantelis Linardatos","Konstantinos Kentrotis","Henryk Gierszal","Piotr Tyczka","Sophia Karagiorgou","George Pantelis","Georgios Stavropoulos","Konstantinos Votis","Georgios Th. 
Papadopoulos"],"pdf_url":"https://arxiv.org/pdf/2406.14949v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14012v3","updated":"2024-11-30T19:40:59Z","published":"2024-09-21T04:40:08Z","title":"Test Time Learning for Time Series Forecasting","summary":" Time-series forecasting has seen significant advancements with the\nintroduction of token prediction mechanisms such as multi-head attention.\nHowever, these methods often struggle to achieve the same performance as in\nlanguage modeling, primarily due to the quadratic computational cost and the\ncomplexity of capturing long-range dependencies in time-series data.\nState-space models (SSMs), such as Mamba, have shown promise in addressing\nthese challenges by offering efficient solutions with linear RNNs capable of\nmodeling long sequences with larger context windows. However, there remains\nroom for improvement in accuracy and scalability.\n We propose the use of Test-Time Training (TTT) modules in a parallel\narchitecture to enhance performance in long-term time series forecasting.\nThrough extensive experiments on standard benchmark datasets, we demonstrate\nthat TTT modules consistently outperform state-of-the-art models, including the\nMamba-based TimeMachine, particularly in scenarios involving extended sequence\nand prediction lengths. Our results show significant improvements in Mean\nSquared Error (MSE) and Mean Absolute Error (MAE), especially on larger\ndatasets such as Electricity, Traffic, and Weather, underscoring the\neffectiveness of TTT in capturing long-range dependencies. Additionally, we\nexplore various convolutional architectures within the TTT framework, showing\nthat even simple configurations like 1D convolution with small filters can\nachieve competitive results. 
This work sets a new benchmark for time-series\nforecasting and lays the groundwork for future research in scalable,\nhigh-performance forecasting models.\n","authors":["Panayiotis Christou","Shichu Chen","Xupeng Chen","Parijat Dube"],"pdf_url":"https://arxiv.org/pdf/2409.14012v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15007v2","updated":"2024-11-30T18:07:50Z","published":"2024-07-20T23:31:56Z","title":"Is Behavior Cloning All You Need? Understanding Horizon in Imitation\n Learning","summary":" Imitation learning (IL) aims to mimic the behavior of an expert in a\nsequential decision making task by learning from demonstrations, and has been\nwidely applied to robotics, autonomous driving, and autoregressive text\ngeneration. The simplest approach to IL, behavior cloning (BC), is thought to\nincur sample complexity with unfavorable quadratic dependence on the problem\nhorizon, motivating a variety of different online algorithms that attain\nimproved linear horizon dependence under stronger assumptions on the data and\nthe learner's access to the expert.\n We revisit the apparent gap between offline and online IL from a\nlearning-theoretic perspective, with a focus on the realizable/well-specified\nsetting with general policy classes up to and including deep neural networks.\nThrough a new analysis of behavior cloning with the logarithmic loss, we show\nthat it is possible to achieve horizon-independent sample complexity in offline\nIL whenever (i) the range of the cumulative payoffs is controlled, and (ii) an\nappropriate notion of supervised learning complexity for the policy class is\ncontrolled. 
Specializing our results to deterministic, stationary policies, we\nshow that the gap between offline and online IL is smaller than previously\nthought: (i) it is possible to achieve linear dependence on horizon in offline\nIL under dense rewards (matching what was previously only known to be\nachievable in online IL); and (ii) without further assumptions on the policy\nclass, online IL cannot improve over offline IL with the logarithmic loss, even\nin benign MDPs. We complement our theoretical results with experiments on\nstandard RL tasks and autoregressive language generation to validate the\npractical relevance of our findings.\n","authors":["Dylan J. Foster","Adam Block","Dipendra Misra"],"pdf_url":"https://arxiv.org/pdf/2407.15007v2.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2406.11740v2","updated":"2024-11-30T17:40:26Z","published":"2024-06-17T17:00:41Z","title":"Imagination Policy: Using Generative Point Cloud Models for Learning\n Manipulation Policies","summary":" Humans can imagine goal states during planning and perform actions to match\nthose goals. In this work, we propose Imagination Policy, a novel multi-task\nkey-frame policy network for solving high-precision pick and place tasks.\nInstead of learning actions directly, Imagination Policy generates point clouds\nto imagine desired states which are then translated to actions using rigid\naction estimation. This transforms action inference into a local generative\ntask. We leverage pick and place symmetries underlying the tasks in the\ngeneration process and achieve extremely high sample efficiency and\ngeneralizability to unseen configurations. 
Finally, we demonstrate\nstate-of-the-art performance across various tasks on the RLbench benchmark\ncompared with several strong baselines and validate our approach on a real\nrobot.\n","authors":["Haojie Huang","Karl Schmeckpeper","Dian Wang","Ondrej Biza","Yaoyao Qian","Haotian Liu","Mingxi Jia","Robert Platt","Robin Walters"],"pdf_url":"https://arxiv.org/pdf/2406.11740v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.09324v2","updated":"2024-11-30T14:35:52Z","published":"2024-07-12T15:01:09Z","title":"Provable Privacy Advantages of Decentralized Federated Learning via\n Distributed Optimization","summary":" Federated learning (FL) emerged as a paradigm designed to improve data\nprivacy by enabling data to reside at its source, thus embedding privacy as a\ncore consideration in FL architectures, whether centralized or decentralized.\nContrasting with recent findings by Pasquini et al., which suggest that\ndecentralized FL does not empirically offer any additional privacy or security\nbenefits over centralized models, our study provides compelling evidence to the\ncontrary. We demonstrate that decentralized FL, when deploying distributed\noptimization, provides enhanced privacy protection - both theoretically and\nempirically - compared to centralized approaches. The challenge of quantifying\nprivacy loss through iterative processes has traditionally constrained the\ntheoretical exploration of FL protocols. We overcome this by conducting a\npioneering in-depth information-theoretical privacy analysis for both\nframeworks. Our analysis, considering both eavesdropping and passive adversary\nmodels, successfully establishes bounds on privacy leakage. We show information\ntheoretically that the privacy loss in decentralized FL is upper bounded by the\nloss in centralized FL. 
Compared to the centralized case where local gradients\nof individual participants are directly revealed, a key distinction of\noptimization-based decentralized FL is that the relevant information includes\ndifferences of local gradients over successive iterations and the aggregated\nsum of different nodes' gradients over the network. This information\ncomplicates the adversary's attempt to infer private data. To bridge our\ntheoretical insights with practical applications, we present detailed case\nstudies involving logistic regression and deep neural networks. These examples\ndemonstrate that while privacy leakage remains comparable in simpler models,\ncomplex models like deep neural networks exhibit lower privacy risks under\ndecentralized FL.\n","authors":["Wenrui Yu","Qiongxiu Li","Milan Lopuhaä-Zwakenberg","Mads Græsbøll Christensen","Richard Heusdens"],"pdf_url":"https://arxiv.org/pdf/2407.09324v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02224v3","updated":"2024-11-30T14:27:59Z","published":"2024-06-04T11:36:09Z","title":"FedMKT: Federated Mutual Knowledge Transfer for Large and Small Language\n Models","summary":" Recent research in federated large language models (LLMs) has primarily\nfocused on enabling clients to fine-tune their locally deployed homogeneous\nLLMs collaboratively or on transferring knowledge from server-based LLMs to\nsmall language models (SLMs) at downstream clients. However, a significant gap\nremains in the simultaneous mutual enhancement of both the server's LLM and\nclients' SLMs. To bridge this gap, we propose FedMKT, a parameter-efficient\nfederated mutual knowledge transfer framework for large and small language\nmodels. This framework is designed to adaptively transfer knowledge from the\nserver's LLM to clients' SLMs while concurrently enriching the LLM with\nclients' unique domain insights. 
We facilitate token alignment using minimum\nedit distance (MinED) and then selective mutual knowledge transfer between\nclient-side SLMs and a server-side LLM, aiming to collectively enhance their\nperformance. Through extensive experiments across three distinct scenarios, we\nevaluate the effectiveness of FedMKT using various public LLMs and SLMs on a\nrange of NLP text generation tasks. Empirical results demonstrate that FedMKT\nsimultaneously boosts the performance of both LLMs and SLMs.\n","authors":["Tao Fan","Guoqiang Ma","Yan Kang","Hanlin Gu","Yuanfeng Song","Lixin Fan","Kai Chen","Qiang Yang"],"pdf_url":"https://arxiv.org/pdf/2406.02224v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.08074v3","updated":"2024-11-30T10:48:21Z","published":"2024-06-12T10:48:53Z","title":"A Concept-Based Explainability Framework for Large Multimodal Models","summary":" Large multimodal models (LMMs) combine unimodal encoders and large language\nmodels (LLMs) to perform multimodal tasks. Despite recent advancements towards\nthe interpretability of these models, understanding internal representations of\nLMMs remains largely a mystery. In this paper, we present a novel framework for\nthe interpretation of LMMs. We propose a dictionary learning based approach,\napplied to the representation of tokens. The elements of the learned dictionary\ncorrespond to our proposed concepts. We show that these concepts are well\nsemantically grounded in both vision and text. Thus we refer to these as\n``multi-modal concepts''. We qualitatively and quantitatively evaluate the\nresults of the learnt concepts. We show that the extracted multimodal concepts\nare useful to interpret representations of test samples. Finally, we evaluate\nthe disentanglement between different concepts and the quality of grounding\nconcepts visually and textually. 
Our code is publicly available at\nhttps://github.com/mshukor/xl-vlms\n","authors":["Jayneel Parekh","Pegah Khayatan","Mustafa Shukor","Alasdair Newson","Matthieu Cord"],"pdf_url":"https://arxiv.org/pdf/2406.08074v3.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2409.15380v2","updated":"2024-11-30T09:57:09Z","published":"2024-09-20T15:01:21Z","title":"Kalahi: A handcrafted, grassroots cultural LLM evaluation suite for\n Filipino","summary":" Multilingual large language models (LLMs) today may not necessarily provide\nculturally appropriate and relevant responses to its Filipino users. We\nintroduce Kalahi, a cultural LLM evaluation suite collaboratively created by\nnative Filipino speakers. It is composed of 150 high-quality, handcrafted and\nnuanced prompts that test LLMs for generations that are relevant to shared\nFilipino cultural knowledge and values. Strong LLM performance in Kalahi\nindicates a model's ability to generate responses similar to what an average\nFilipino would say or do in a given situation. We conducted experiments on LLMs\nwith multilingual and Filipino language support. Results show that Kalahi,\nwhile trivial for Filipinos, is challenging for LLMs, with the best model\nanswering only 46.0% of the questions correctly compared to native Filipino\nperformance of 89.10%. 
Thus, Kalahi can be used to accurately and reliably\nevaluate Filipino cultural representation in LLMs.\n","authors":["Jann Railey Montalan","Jian Gang Ngui","Wei Qi Leong","Yosephine Susanto","Hamsawardhini Rengarajan","Alham Fikri Aji","William Chandra Tjhi"],"pdf_url":"https://arxiv.org/pdf/2409.15380v2.pdf","comment":"Accepted for presentation at Paclic 38, 2024"},{"id":"http://arxiv.org/abs/2411.17931v2","updated":"2024-11-30T09:48:12Z","published":"2024-11-26T23:00:51Z","title":"Combining Threat Intelligence with IoT Scanning to Predict Cyber Attack","summary":" While the Web has become a worldwide platform for communication, hackers and\nhacktivists share their ideology and communicate with members on the \"Dark\nWeb\"-the reverse of the Web. Currently, the problems of information overload\nand difficulty to obtain a comprehensive picture of hackers and cyber-attackers\nhinder the effective analysis of predicting their activities on the Web. Also,\nthere are currently more objects connected to the internet than there are\npeople in the world and this gap will continue to grow as more and more objects\ngain ability to directly interface with the Internet. Many technical\ncommunities are vigorously pursuing research topics that contribute to the\nInternet of Things (IoT). In this paper I have proposed a novel methodology for\ncollecting and analyzing the Dark Web information to identify websites of\nhackers from the Web sea, and how this information can help us in predicting\nIoT vulnerabilities. This methodology incorporates information collection,\nanalysis, visualization techniques, and exploits some of the IoT devices.\nThrough this research I want to contribute to the existing literature on\ncyber-security that could potentially guide in both policy-making and\nintelligence research.\n","authors":["Jubin Abhishek Soni"],"pdf_url":"https://arxiv.org/pdf/2411.17931v2.pdf","comment":"8 pages, 6 figures, 2 tables. 
This manuscript has been submitted to\n Springer for review (Manuscript ID: PDSE-D-24-00163) and is under\n consideration. It has not yet been peer-reviewed or published. Researchers\n are welcome to read and build upon this work; please cite it appropriately.\n For questions or clarifications, feel free to contact me"},{"id":"http://arxiv.org/abs/2408.16767v2","updated":"2024-11-30T09:10:08Z","published":"2024-08-29T17:59:40Z","title":"ReconX: Reconstruct Any Scene from Sparse Views with Video Diffusion\n Model","summary":" Advancements in 3D scene reconstruction have transformed 2D images from the\nreal world into 3D models, producing realistic 3D results from hundreds of\ninput photos. Despite great success in dense-view reconstruction scenarios,\nrendering a detailed scene from insufficient captured views is still an\nill-posed optimization problem, often resulting in artifacts and distortions in\nunseen areas. In this paper, we propose ReconX, a novel 3D scene reconstruction\nparadigm that reframes the ambiguous reconstruction challenge as a temporal\ngeneration task. The key insight is to unleash the strong generative prior of\nlarge pre-trained video diffusion models for sparse-view reconstruction.\nHowever, 3D view consistency struggles to be accurately preserved in directly\ngenerated video frames from pre-trained models. To address this, given limited\ninput views, the proposed ReconX first constructs a global point cloud and\nencodes it into a contextual space as the 3D structure condition. Guided by the\ncondition, the video diffusion model then synthesizes video frames that are\nboth detail-preserved and exhibit a high degree of 3D consistency, ensuring the\ncoherence of the scene from various perspectives. Finally, we recover the 3D\nscene from the generated video through a confidence-aware 3D Gaussian Splatting\noptimization scheme. 
Extensive experiments on various real-world datasets show\nthe superiority of our ReconX over state-of-the-art methods in terms of quality\nand generalizability.\n","authors":["Fangfu Liu","Wenqiang Sun","Hanyang Wang","Yikai Wang","Haowen Sun","Junliang Ye","Jun Zhang","Yueqi Duan"],"pdf_url":"https://arxiv.org/pdf/2408.16767v2.pdf","comment":"Project page: https://liuff19.github.io/ReconX"},{"id":"http://arxiv.org/abs/2410.19764v2","updated":"2024-11-30T07:06:57Z","published":"2024-10-12T16:14:18Z","title":"Unraveling Movie Genres through Cross-Attention Fusion of Bi-Modal\n Synergy of Poster","summary":" Movie posters are not just decorative; they are meticulously designed to\ncapture the essence of a movie, such as its genre, storyline, and tone/vibe.\nFor decades, movie posters have graced cinema walls, billboards, and now our\ndigital screens as a form of digital posters. Movie genre classification plays\na pivotal role in film marketing, audience engagement, and recommendation\nsystems. Previous explorations into movie genre classification have been mostly\nexamined in plot summaries, subtitles, trailers and movie scenes. Movie posters\nprovide a pre-release tantalizing glimpse into a film's key aspects, which can\nignite public interest. In this paper, we presented the framework that exploits\nmovie posters from a visual and textual perspective to address the multilabel\nmovie genre classification problem. Firstly, we extracted text from movie\nposters using an OCR and retrieved the relevant embedding. Next, we introduce a\ncross-attention-based fusion module to allocate attention weights to visual and\ntextual embedding. In validating our framework, we utilized 13882 posters\nsourced from the Internet Movie Database (IMDb). 
The outcomes of the\nexperiments indicate that our model exhibited promising performance and\noutperformed even some prominent contemporary architectures.\n","authors":["Utsav Kumar Nareti","Chandranath Adak","Soumi Chattopadhyay","Pichao Wang"],"pdf_url":"https://arxiv.org/pdf/2410.19764v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08188v2","updated":"2024-11-30T06:09:33Z","published":"2024-08-15T14:46:13Z","title":"Nl2Hltl2Plan: Scaling Up Natural Language Understanding for Multi-Robots\n Through Hierarchical Temporal Logic Task Representation","summary":" To enable non-experts to specify long-horizon, multi-robot collaborative\ntasks, language models are increasingly used to translate natural language\ncommands into formal specifications. However, because translation can occur in\nmultiple ways, such translations may lack accuracy or lead to inefficient\nmulti-robot planning. Our key insight is that concise hierarchical\nspecifications can simplify planning while remaining straightforward to derive\nfrom human instructions. We propose~\\acronym{}, a framework that translates\nnatural language commands into hierarchical Linear Temporal Logic (LTL) and\nsolves the corresponding planning problem. The translation involves two steps\nleveraging Large Language Models (LLMs). First, an LLM transforms instructions\ninto a Hierarchical Task Tree, capturing logical and temporal relations. Next,\na fine-tuned LLM converts sub-tasks into flat LTL formulas, which are\naggregated into hierarchical specifications, with the lowest level\ncorresponding to ordered robot actions. These specifications are then used with\noff-the-shelf planners. Our~\\acronym{} demonstrates the potential of LLMs in\nhierarchical reasoning for multi-robot task planning. 
Evaluations in simulation\nand real-world experiments with human participants show that~\\acronym{}\noutperforms existing methods, handling more complex instructions while\nachieving higher success rates and lower costs in task allocation and planning.\nAdditional details are available at https://nl2hltl2plan.github.io .\n","authors":["Shaojun Xu","Xusheng Luo","Yutong Huang","Letian Leng","Ruixuan Liu","Changliu Liu"],"pdf_url":"https://arxiv.org/pdf/2408.08188v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.07223v3","updated":"2024-11-30T05:54:34Z","published":"2024-03-27T07:17:55Z","title":"Stock Recommendations for Individual Investors: A Temporal Graph Network\n Approach with Mean-Variance Efficient Sampling","summary":" Recommender systems can be helpful for individuals to make well-informed\ndecisions in complex financial markets. While many studies have focused on\npredicting stock prices, even advanced models fall short of accurately\nforecasting them. Additionally, previous studies indicate that individual\ninvestors often disregard established investment theories, favoring their\npersonal preferences instead. This presents a challenge for stock\nrecommendation systems, which must not only provide strong investment\nperformance but also respect these individual preferences. To create effective\nstock recommender systems, three critical elements must be incorporated: 1)\nindividual preferences, 2) portfolio diversification, and 3) the temporal\ndynamics of the first two. In response, we propose a new model, Portfolio\nTemporal Graph Network Recommender PfoTGNRec, which can handle time-varying\ncollaborative signals and incorporates diversification-enhancing sampling. On\nreal-world individual trading data, our approach demonstrates superior\nperformance compared to state-of-the-art baselines, including cutting-edge\ndynamic embedding models and existing stock recommendation models. 
Indeed, we\nshow that PfoTGNRec is an effective solution that can balance customer\npreferences with the need to suggest portfolios with high Return-on-Investment.\nThe source code and data are available at\nhttps://github.com/youngandbin/PfoTGNRec.\n","authors":["Youngbin Lee","Yejin Kim","Javier Sanz-Cruzado","Richard McCreadie","Yongjae Lee"],"pdf_url":"https://arxiv.org/pdf/2404.07223v3.pdf","comment":"ICAIF 2024 (https://dl.acm.org/doi/10.1145/3677052.3698662)"},{"id":"http://arxiv.org/abs/2403.06832v3","updated":"2024-11-30T04:53:04Z","published":"2024-03-11T15:48:43Z","title":"Noise-powered Multi-modal Knowledge Graph Representation Framework","summary":" The rise of Multi-modal Pre-training highlights the necessity for a unified\nMulti-Modal Knowledge Graph (MMKG) representation learning framework. Such a\nframework is essential for embedding structured knowledge into multi-modal\nLarge Language Models effectively, alleviating issues like knowledge\nmisconceptions and multi-modal hallucinations. In this work, we explore the\nefficacy of models in accurately embedding entities within MMKGs through two\npivotal tasks: Multi-modal Knowledge Graph Completion (MKGC) and Multi-modal\nEntity Alignment (MMEA). Building on this foundation, we propose a novel SNAG\nmethod that utilizes a Transformer-based architecture equipped with\nmodality-level noise masking to robustly integrate multi-modal entity features\nin KGs. By incorporating specific training objectives for both MKGC and MMEA,\nour approach achieves SOTA performance across a total of ten datasets,\ndemonstrating its versatility. Moreover, SNAG can not only function as a\nstandalone model but also enhance other existing methods, providing stable\nperformance improvements. Code and data are available at\nhttps://github.com/zjukg/SNAG.\n","authors":["Zhuo Chen","Yin Fang","Yichi Zhang","Lingbing Guo","Jiaoyan Che","Jeff Z. 
Pan","Huajun Chen","Wen Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.06832v3.pdf","comment":"COLING 2025 Accpeted, Repo is available at\n https://github.com/zjukg/SNAG"},{"id":"http://arxiv.org/abs/2402.00672v3","updated":"2024-11-30T04:47:10Z","published":"2024-02-01T15:33:17Z","title":"Exploring Homogeneous and Heterogeneous Consistent Label Associations\n for Unsupervised Visible-Infrared Person ReID","summary":" Unsupervised visible-infrared person re-identification (USL-VI-ReID)\nendeavors to retrieve pedestrian images of the same identity from different\nmodalities without annotations. While prior work focuses on establishing\ncross-modality pseudo-label associations to bridge the modality-gap, they\nignore maintaining the instance-level homogeneous and heterogeneous consistency\nbetween the feature space and the pseudo-label space, resulting in coarse\nassociations. In response, we introduce a Modality-Unified Label Transfer\n(MULT) module that simultaneously accounts for both homogeneous and\nheterogeneous fine-grained instance-level structures, yielding high-quality\ncross-modality label associations. It models both homogeneous and heterogeneous\naffinities, leveraging them to quantify the inconsistency between the\npseudo-label space and the feature space, subsequently minimizing it. The\nproposed MULT ensures that the generated pseudo-labels maintain alignment\nacross modalities while upholding structural consistency within intra-modality.\nAdditionally, a straightforward plug-and-play Online Cross-memory Label\nRefinement (OCLR) module is proposed to further mitigate the side effects of\nnoisy pseudo-labels while simultaneously aligning different modalities, coupled\nwith an Alternative Modality-Invariant Representation Learning (AMIRL)\nframework. 
Experiments demonstrate that our proposed method outperforms\nexisting state-of-the-art USL-VI-ReID methods, highlighting the superiority of\nour MULT in comparison to other cross-modality association methods. Code is\navailable at https://github.com/FranklinLingfeng/code_for_MULT.\n","authors":["Lingfeng He","De Cheng","Nannan Wang","Xinbo Gao"],"pdf_url":"https://arxiv.org/pdf/2402.00672v3.pdf","comment":"Accepted by IJCV2024"},{"id":"http://arxiv.org/abs/2411.05285v2","updated":"2024-11-30T02:55:48Z","published":"2024-11-08T02:31:03Z","title":"AgentOps: Enabling Observability of LLM Agents","summary":" Large language model (LLM) agents have demonstrated remarkable capabilities\nacross various domains, gaining extensive attention from academia and industry.\nHowever, these agents raise significant concerns on AI safety due to their\nautonomous and non-deterministic behavior, as well as continuous evolving\nnature . From a DevOps perspective, enabling observability in agents is\nnecessary to ensuring AI safety, as stakeholders can gain insights into the\nagents' inner workings, allowing them to proactively understand the agents,\ndetect anomalies, and prevent potential failures. Therefore, in this paper, we\npresent a comprehensive taxonomy of AgentOps, identifying the artifacts and\nassociated data that should be traced throughout the entire lifecycle of agents\nto achieve effective observability. The taxonomy is developed based on a\nsystematic mapping study of existing AgentOps tools. Our taxonomy serves as a\nreference template for developers to design and implement AgentOps\ninfrastructure that supports monitoring, logging, and analytics. 
thereby\nensuring AI safety.\n","authors":["Liming Dong","Qinghua Lu","Liming Zhu"],"pdf_url":"https://arxiv.org/pdf/2411.05285v2.pdf","comment":"12 pages, 3 figures"},{"id":"http://arxiv.org/abs/2410.05651v2","updated":"2024-11-30T02:13:46Z","published":"2024-10-08T03:01:54Z","title":"ViBiDSampler: Enhancing Video Interpolation Using Bidirectional\n Diffusion Sampler","summary":" Recent progress in large-scale text-to-video (T2V) and image-to-video (I2V)\ndiffusion models has greatly enhanced video generation, especially in terms of\nkeyframe interpolation. However, current image-to-video diffusion models, while\npowerful in generating videos from a single conditioning frame, need adaptation\nfor two-frame (start & end) conditioned generation, which is essential for\neffective bounded interpolation. Unfortunately, existing approaches that fuse\ntemporally forward and backward paths in parallel often suffer from\noff-manifold issues, leading to artifacts or requiring multiple iterative\nre-noising steps. In this work, we introduce a novel, bidirectional sampling\nstrategy to address these off-manifold issues without requiring extensive\nre-noising or fine-tuning. Our method employs sequential sampling along both\nforward and backward paths, conditioned on the start and end frames,\nrespectively, ensuring more coherent and on-manifold generation of intermediate\nframes. Additionally, we incorporate advanced guidance techniques, CFG++ and\nDDS, to further enhance the interpolation process. By integrating these, our\nmethod achieves state-of-the-art performance, efficiently generating\nhigh-quality, smooth videos between keyframes. 
On a single 3090 GPU, our method\ncan interpolate 25 frames at 1024 x 576 resolution in just 195 seconds,\nestablishing it as a leading solution for keyframe interpolation.\n","authors":["Serin Yang","Taesung Kwon","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2410.05651v2.pdf","comment":"Project page: https://vibidsampler.github.io/"},{"id":"http://arxiv.org/abs/2402.02054v3","updated":"2024-11-30T01:55:11Z","published":"2024-02-03T06:17:21Z","title":"Towards Neural Scaling Laws on Graphs","summary":" Deep graph models (e.g., graph neural networks and graph transformers) have\nbecome important techniques for leveraging knowledge across various types of\ngraphs. Yet, the neural scaling laws on graphs, i.e., how the performance of\ndeep graph models changes with model and dataset sizes, have not been\nsystematically investigated, casting doubts on the feasibility of achieving\nlarge graph models. To fill this gap, we benchmark many graph datasets from\ndifferent tasks and make an attempt to establish the neural scaling laws on\ngraphs from both model and data perspectives. The model size we investigated is\nup to 100 million parameters, and the dataset size investigated is up to 50\nmillion samples. We first verify the validity of such laws on graphs,\nestablishing proper formulations to describe the scaling behaviors. For model\nscaling, we identify that despite the parameter numbers, the model depth also\nplays an important role in affecting the model scaling behaviors, which differs\nfrom observations in other domains such as CV and NLP. For data scaling, we\nsuggest that the number of graphs can not effectively measure the graph data\nvolume in scaling law since the sizes of different graphs are highly irregular.\nInstead, we reform the data scaling law with the number of nodes or edges as\nthe metric to address the irregular graph sizes. 
We further demonstrate that\nthe reformed law offers a unified view of the data scaling behaviors for\nvarious fundamental graph tasks including node classification, link prediction,\nand graph classification. This work provides valuable insights into neural\nscaling laws on graphs, which can serve as an important tool for collecting new\ngraph data and developing large graph models.\n","authors":["Jingzhe Liu","Haitao Mao","Zhikai Chen","Tong Zhao","Neil Shah","Jiliang Tang"],"pdf_url":"https://arxiv.org/pdf/2402.02054v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.02574v2","updated":"2024-11-30T01:42:25Z","published":"2024-09-04T09:48:27Z","title":"Solving Video Inverse Problems Using Image Diffusion Models","summary":" Recently, diffusion model-based inverse problem solvers (DIS) have emerged as\nstate-of-the-art approaches for addressing inverse problems, including image\nsuper-resolution, deblurring, inpainting, etc. However, their application to\nvideo inverse problems arising from spatio-temporal degradation remains largely\nunexplored due to the challenges in training video diffusion models. To address\nthis issue, here we introduce an innovative video inverse solver that leverages\nonly image diffusion models. Specifically, by drawing inspiration from the\nsuccess of the recent decomposed diffusion sampler (DDS), our method treats the\ntime dimension of a video as the batch dimension of image diffusion models and\nsolves spatio-temporal optimization problems within denoised spatio-temporal\nbatches derived from each image diffusion model. Moreover, we introduce a\nbatch-consistent diffusion sampling strategy that encourages consistency across\nbatches by synchronizing the stochastic noise components in image diffusion\nmodels. 
Our approach synergistically combines batch-consistent sampling with\nsimultaneous optimization of denoised spatio-temporal batches at each reverse\ndiffusion step, resulting in a novel and efficient diffusion sampling strategy\nfor video inverse problems. Experimental results demonstrate that our method\neffectively addresses various spatio-temporal degradations in video inverse\nproblems, achieving state-of-the-art reconstructions. Project page:\nhttps://svi-diffusion.github.io\n","authors":["Taesung Kwon","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2409.02574v2.pdf","comment":"22 pages, 16 figures"},{"id":"http://arxiv.org/abs/2302.13186v6","updated":"2024-11-30T01:18:32Z","published":"2023-02-25T22:54:27Z","title":"Construction numbers: How to build a graph?","summary":" A construction sequence for a graph is a listing of the elements of the graph\n(the set of vertices and edges) such that each edge follows both its endpoints.\nThe construction number of the graph is the number of such sequences. We\ndetermine this number for various graph families.\n","authors":["Paul C. Kainen"],"pdf_url":"https://arxiv.org/pdf/2302.13186v6.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2411.00024v3","updated":"2024-11-30T00:17:01Z","published":"2024-10-28T22:30:06Z","title":"A Perspective for Adapting Generalist AI to Specialized Medical AI\n Applications and Their Challenges","summary":" The integration of Large Language Models (LLMs) into medical applications has\nsparked widespread interest across the healthcare industry, from drug discovery\nand development to clinical decision support, assisting telemedicine, medical\ndevices, and healthcare insurance applications. This perspective paper aims to\ndiscuss the inner workings of building LLM-powered medical AI applications and\nintroduces a comprehensive framework for their development. We review existing\nliterature and outline the unique challenges of applying LLMs in specialized\nmedical contexts. 
Additionally, we introduce a three-step framework to organize\nmedical LLM research activities: 1) Modeling: breaking down complex medical\nworkflows into manageable steps for developing medical-specific models; 2)\nOptimization: optimizing the model performance with crafted prompts and\nintegrating external knowledge and tools, and 3) System engineering:\ndecomposing complex tasks into subtasks and leveraging human expertise for\nbuilding medical AI applications. Furthermore, we offer a detailed use case\nplaybook that describes various LLM-powered medical AI applications, such as\noptimizing clinical trial design, enhancing clinical decision support, and\nadvancing medical imaging analysis. Finally, we discuss various challenges and\nconsiderations for building medical AI applications with LLMs, such as handling\nhallucination issues, data ownership and compliance, privacy, intellectual\nproperty considerations, compute cost, sustainability issues, and responsible\nAI requirements.\n","authors":["Zifeng Wang","Hanyin Wang","Benjamin Danek","Ying Li","Christina Mack","Hoifung Poon","Yajuan Wang","Pranav Rajpurkar","Jimeng Sun"],"pdf_url":"https://arxiv.org/pdf/2411.00024v3.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2411.13323v2","updated":"2024-11-30T23:44:43Z","published":"2024-11-20T13:46:04Z","title":"Are Large Language Models Memorizing Bug Benchmarks?","summary":" Large Language Models (LLMs) have become integral to various software\nengineering tasks, including code generation, bug detection, and repair. To\nevaluate model performance in these domains, numerous bug benchmarks containing\nreal-world bugs from software projects have been developed. However, a growing\nconcern within the software engineering community is that these benchmarks may\nnot reliably reflect true LLM performance due to the risk of data leakage.\nDespite this concern, limited research has been conducted to quantify the\nimpact of potential leakage. 
In this paper, we systematically evaluate popular\nLLMs to assess their susceptibility to data leakage from widely used bug\nbenchmarks. To identify potential leakage, we use multiple metrics, including a\nstudy of benchmark membership within commonly used training datasets, as well\nas analyses of negative log-likelihood and n-gram accuracy. Our findings show\nthat certain models, in particular codegen-multi, exhibit significant evidence\nof memorization in widely used benchmarks like Defects4J, while newer models\ntrained on larger datasets like LLaMa 3.1 exhibit limited signs of leakage.\nThese results highlight the need for careful benchmark selection and the\nadoption of robust metrics to adequately assess models capabilities.\n","authors":["Daniel Ramos","Claudia Mamede","Kush Jain","Paulo Canelas","Catarina Gamboa","Claire Le Goues"],"pdf_url":"https://arxiv.org/pdf/2411.13323v2.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2403.02871v3","updated":"2024-11-30T22:31:53Z","published":"2024-03-05T11:29:05Z","title":"Quantum Mixed-State Self-Attention Network","summary":" Attention mechanisms have revolutionized natural language processing.\nCombining them with quantum computing aims to further advance this technology.\nThis paper introduces a novel Quantum Mixed-State Self-Attention Network\n(QMSAN) for natural language processing tasks. Our model leverages quantum\ncomputing principles to enhance the effectiveness of self-attention mechanisms.\nQMSAN uses a quantum attention mechanism based on mixed state, allowing for\ndirect similarity estimation between queries and keys in the quantum domain.\nThis approach leads to more effective attention coefficient calculations. We\nalso propose an innovative quantum positional encoding scheme, implemented\nthrough fixed quantum gates within the circuit, improving the model's ability\nto capture sequence information without additional qubit resources. 
In\nnumerical experiments of text classification tasks on public datasets, QMSAN\noutperforms Quantum Self-Attention Neural Network (QSANN). Furthermore, we\ndemonstrate QMSAN's robustness in different quantum noise environments,\nhighlighting its potential for near-term quantum devices.\n","authors":["Fu Chen","Qinglin Zhao","Li Feng","Chuangtao Chen","Yangbin Lin","Jianhong Lin"],"pdf_url":"https://arxiv.org/pdf/2403.02871v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.14165v3","updated":"2024-11-30T22:21:30Z","published":"2024-09-21T15:07:37Z","title":"A Survey on Large Language Model-empowered Autonomous Driving","summary":" Artificial intelligence (AI) plays a crucial role in autonomous driving (AD)\nresearch, propelling its development towards intelligence and efficiency.\nCurrently, the development of AD technology follows two main technical paths:\nmodularization and end-to-end. Modularization decompose the driving task into\nmodules such as perception, prediction, planning, and control, and train them\nseparately. Due to the inconsistency of training objectives between modules,\nthe integrated effect suffers from bias. End-to-end attempts to address this\nissue by utilizing a single model that directly maps from sensor data to\ncontrol signals. This path has limited learning capabilities in a comprehensive\nset of features and struggles to handle unpredictable long-tail events and\ncomplex urban traffic scenarios. In the face of challenges encountered in both\npaths, many researchers believe that large language models (LLMs) with powerful\nreasoning capabilities and extensive knowledge understanding may be the\nsolution, expecting LLMs to provide AD systems with deeper levels of\nunderstanding and decision-making capabilities. In light of the challenges\nfaced by both paths, many researchers believe that LLMs, with their powerful\nreasoning abilities and extensive knowledge, could offer a solution. 
To\nunderstand if LLMs could enhance AD, this paper conducts a thorough analysis of\nthe potential applications of LLMs in AD systems, including exploring their\noptimization strategies in both modular and end-to-end approaches, with a\nparticular focus on how LLMs can tackle the problems and challenges present in\ncurrent solutions. Furthermore, we discuss an important question: Can LLM-based\nartificial general intelligence (AGI) be a key to achieve high-level AD? We\nfurther analyze the potential limitations and challenges that LLMs may\nencounter in promoting the development of AD technology.\n","authors":["Yuxuan Zhu","Shiyi Wang","Wenqing Zhong","Nianchen Shen","Yunqi Li","Siqi Wang","Zhiheng Li","Cathy Wu","Zhengbing He","Li Li"],"pdf_url":"https://arxiv.org/pdf/2409.14165v3.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2410.19764v2","updated":"2024-11-30T07:06:57Z","published":"2024-10-12T16:14:18Z","title":"Unraveling Movie Genres through Cross-Attention Fusion of Bi-Modal\n Synergy of Poster","summary":" Movie posters are not just decorative; they are meticulously designed to\ncapture the essence of a movie, such as its genre, storyline, and tone/vibe.\nFor decades, movie posters have graced cinema walls, billboards, and now our\ndigital screens as a form of digital posters. Movie genre classification plays\na pivotal role in film marketing, audience engagement, and recommendation\nsystems. Previous explorations into movie genre classification have been mostly\nexamined in plot summaries, subtitles, trailers and movie scenes. Movie posters\nprovide a pre-release tantalizing glimpse into a film's key aspects, which can\nignite public interest. In this paper, we presented the framework that exploits\nmovie posters from a visual and textual perspective to address the multilabel\nmovie genre classification problem. Firstly, we extracted text from movie\nposters using an OCR and retrieved the relevant embedding. 
Next, we introduce a\ncross-attention-based fusion module to allocate attention weights to visual and\ntextual embedding. In validating our framework, we utilized 13882 posters\nsourced from the Internet Movie Database (IMDb). The outcomes of the\nexperiments indicate that our model exhibited promising performance and\noutperformed even some prominent contemporary architectures.\n","authors":["Utsav Kumar Nareti","Chandranath Adak","Soumi Chattopadhyay","Pichao Wang"],"pdf_url":"https://arxiv.org/pdf/2410.19764v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.00446v1","updated":"2024-11-30T11:40:31Z","published":"2024-11-30T11:40:31Z","title":"Hybrid Local-Global Context Learning for Neural Video Compression","summary":" In neural video codecs, current state-of-the-art methods typically adopt\nmulti-scale motion compensation to handle diverse motions. These methods\nestimate and compress either optical flow or deformable offsets to reduce\ninter-frame redundancy. However, flow-based methods often suffer from\ninaccurate motion estimation in complicated scenes. Deformable\nconvolution-based methods are more robust but have a higher bit cost for motion\ncoding. In this paper, we propose a hybrid context generation module, which\ncombines the advantages of the above methods in an optimal way and achieves\naccurate compensation at a low bit cost. Specifically, considering the\ncharacteristics of features at different scales, we adopt flow-guided\ndeformable compensation at largest-scale to produce accurate alignment in\ndetailed regions. For smaller-scale features, we perform flow-based warping to\nsave the bit cost for motion coding. Furthermore, we design a local-global\ncontext enhancement module to fully explore the local-global information of\nprevious reconstructed signals. 
Experimental results demonstrate that our\nproposed Hybrid Local-Global Context learning (HLGC) method can significantly\nenhance the state-of-the-art methods on standard test datasets.\n","authors":["Yongqi Zhai","Jiayu Yang","Wei Jiang","Chunhui Yang","Luyang Tang","Ronggang Wang"],"pdf_url":"https://arxiv.org/pdf/2412.00446v1.pdf","comment":"Accepted to DCC 2024"}],"Genomics":[{"id":"http://arxiv.org/abs/2412.00471v1","updated":"2024-11-30T13:10:39Z","published":"2024-11-30T13:10:39Z","title":"LLaMA-Gene: A General-purpose Gene Task Large Language Model Based on\n Instruction Fine-tuning","summary":" Building a general-purpose task model similar to ChatGPT has been an\nimportant research direction for gene large language models. Instruction\nfine-tuning is a key component in building ChatGPT, but existing instructions\nare primarily based on natural language. Natural language and gene sequences\nhave significant differences in tokenization and encoding. Therefore,\nconstructing a multilingual model that can handle both natural language and\ngene sequences is crucial for solving this problem.In this paper, we expand the\ncapabilities of the LLaMA large language model to include gene language. This\ninvolves expanding the vocabulary using the Byte Pair Encoding (BPE) method,\nspecifically tailored for DNA and protein sequences, and conducting further\npre-training on these sequences. We then convert various downstream gene task\ndata into a unified format for instruction fine-tuning and further fine-tune\nthe model on this data.Our study demonstrates that a mixed model of gene and\nnatural language, fine-tuned with instructions, achieves results comparable to\nthe current state-of-the-art (SOTA) in tasks such as gene classification and\ngene sequence interaction. 
This provides a promising direction for building a\nunified large language model for gene tasks.\n","authors":["Wang Liang"],"pdf_url":"https://arxiv.org/pdf/2412.00471v1.pdf","comment":"15 pages, 2 figures"}]},"2024-12-03T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2412.02700v1","updated":"2024-12-03T18:59:56Z","published":"2024-12-03T18:59:56Z","title":"Motion Prompting: Controlling Video Generation with Motion Trajectories","summary":" Motion control is crucial for generating expressive and compelling video\ncontent; however, most existing video generation models rely mainly on text\nprompts for control, which struggle to capture the nuances of dynamic actions\nand temporal compositions. To this end, we train a video generation model\nconditioned on spatio-temporally sparse or dense motion trajectories. In\ncontrast to prior motion conditioning work, this flexible representation can\nencode any number of trajectories, object-specific or global scene motion, and\ntemporally sparse motion; due to its flexibility we refer to this conditioning\nas motion prompts. While users may directly specify sparse trajectories, we\nalso show how to translate high-level user requests into detailed, semi-dense\nmotion prompts, a process we term motion prompt expansion. We demonstrate the\nversatility of our approach through various applications, including camera and\nobject motion control, \"interacting\" with an image, motion transfer, and image\nediting. Our results showcase emergent behaviors, such as realistic physics,\nsuggesting the potential of motion prompts for probing video models and\ninteracting with future generative world models. 
Finally, we evaluate\nquantitatively, conduct a human study, and demonstrate strong performance.\nVideo results are available on our webpage: https://motion-prompting.github.io/\n","authors":["Daniel Geng","Charles Herrmann","Junhwa Hur","Forrester Cole","Serena Zhang","Tobias Pfaff","Tatiana Lopez-Guevara","Carl Doersch","Yusuf Aytar","Michael Rubinstein","Chen Sun","Oliver Wang","Andrew Owens","Deqing Sun"],"pdf_url":"https://arxiv.org/pdf/2412.02700v1.pdf","comment":"Project page: https://motion-prompting.github.io/"},{"id":"http://arxiv.org/abs/2412.02693v1","updated":"2024-12-03T18:59:28Z","published":"2024-12-03T18:59:28Z","title":"Diffusion-based Visual Anagram as Multi-task Learning","summary":" Visual anagrams are images that change appearance upon transformation, like\nflipping or rotation. With the advent of diffusion models, generating such\noptical illusions can be achieved by averaging noise across multiple views\nduring the reverse denoising process. However, we observe two critical failure\nmodes in this approach: (i) concept segregation, where concepts in different\nviews are independently generated, which can not be considered a true anagram,\nand (ii) concept domination, where certain concepts overpower others. In this\nwork, we cast the visual anagram generation problem in a multi-task learning\nsetting, where different viewpoint prompts are analogous to different tasks,and\nderive denoising trajectories that align well across tasks simultaneously. 
At\nthe core of our designed framework are two newly introduced techniques, where\n(i) an anti-segregation optimization strategy that promotes overlap in\ncross-attention maps between different concepts, and (ii) a noise vector\nbalancing method that adaptively adjusts the influence of different tasks.\nAdditionally, we observe that directly averaging noise predictions yields\nsuboptimal performance because statistical properties may not be preserved,\nprompting us to derive a noise variance rectification method. Extensive\nqualitative and quantitative experiments demonstrate our method's superior\nability to generate visual anagrams spanning diverse concepts.\n","authors":["Zhiyuan Xu","Yinhe Chen","Huan-ang Gao","Weiyan Zhao","Guiyu Zhang","Hao Zhao"],"pdf_url":"https://arxiv.org/pdf/2412.02693v1.pdf","comment":"WACV 2025. Code is publicly available at\n https://github.com/Pixtella/Anagram-MTL"},{"id":"http://arxiv.org/abs/2412.02692v1","updated":"2024-12-03T18:59:10Z","published":"2024-12-03T18:59:10Z","title":"Taming Scalable Visual Tokenizer for Autoregressive Image Generation","summary":" Existing vector quantization (VQ) methods struggle with scalability, largely\nattributed to the instability of the codebook that undergoes partial updates\nduring training. The codebook is prone to collapse as utilization decreases,\ndue to the progressively widening distribution gap between non-activated codes\nand visual features. To solve the problem, we propose Index Backpropagation\nQuantization (IBQ), a new VQ method for the joint optimization of all codebook\nembeddings and the visual encoder. Applying a straight-through estimator on the\none-hot categorical distribution between the encoded feature and codebook, all\ncodes are differentiable and maintain a consistent latent space with the visual\nencoder. 
IBQ enables scalable training of visual tokenizers and, for the first\ntime, achieves a large-scale codebook ($2^{18}$) with high dimension ($256$)\nand high utilization. Experiments on the standard ImageNet benchmark\ndemonstrate the scalability and superiority of IBQ, achieving competitive\nresults on both reconstruction ($1.00$ rFID) and autoregressive visual\ngeneration ($2.05$ gFID). The code and models are available at\nhttps://github.com/TencentARC/SEED-Voken.\n","authors":["Fengyuan Shi","Zhuoyan Luo","Yixiao Ge","Yujiu Yang","Ying Shan","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2412.02692v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11021v3","updated":"2024-12-03T18:58:22Z","published":"2024-03-16T21:40:27Z","title":"Towards Neuro-Symbolic Video Understanding","summary":" The unprecedented surge in video data production in recent years necessitates\nefficient tools to extract meaningful frames from videos for downstream tasks.\nLong-term temporal reasoning is a key desideratum for frame retrieval systems.\nWhile state-of-the-art foundation models, like VideoLLaMA and ViCLIP, are\nproficient in short-term semantic understanding, they surprisingly fail at\nlong-term reasoning across frames. A key reason for this failure is that they\nintertwine per-frame perception and temporal reasoning into a single deep\nnetwork. Hence, decoupling but co-designing semantic understanding and temporal\nreasoning is essential for efficient scene identification. 
We propose a system\nthat leverages vision-language models for semantic understanding of individual\nframes but effectively reasons about the long-term evolution of events using\nstate machines and temporal logic (TL) formulae that inherently capture memory.\nOur TL-based reasoning improves the F1 score of complex event identification by\n9-15% compared to benchmarks that use GPT4 for reasoning on state-of-the-art\nself-driving datasets such as Waymo and NuScenes.\n","authors":["Minkyu Choi","Harsh Goel","Mohammad Omama","Yunhao Yang","Sahil Shah","Sandeep Chinchali"],"pdf_url":"https://arxiv.org/pdf/2403.11021v3.pdf","comment":"Accepted by The European Conference on Computer Vision (ECCV) 2024"},{"id":"http://arxiv.org/abs/2412.02690v1","updated":"2024-12-03T18:58:19Z","published":"2024-12-03T18:58:19Z","title":"FoundHand: Large-Scale Domain-Specific Learning for Controllable Hand\n Image Generation","summary":" Despite remarkable progress in image generation models, generating realistic\nhands remains a persistent challenge due to their complex articulation, varying\nviewpoints, and frequent occlusions. We present FoundHand, a large-scale\ndomain-specific diffusion model for synthesizing single and dual hand images.\nTo train our model, we introduce FoundHand-10M, a large-scale hand dataset with\n2D keypoints and segmentation mask annotations. Our insight is to use 2D hand\nkeypoints as a universal representation that encodes both hand articulation and\ncamera viewpoint. FoundHand learns from image pairs to capture physically\nplausible hand articulations, natively enables precise control through 2D\nkeypoints, and supports appearance control. Our model exhibits core\ncapabilities that include the ability to repose hands, transfer hand\nappearance, and even synthesize novel views. This leads to zero-shot\ncapabilities for fixing malformed hands in previously generated images, or\nsynthesizing hand video sequences. 
We present extensive experiments and\nevaluations that demonstrate state-of-the-art performance of our method.\n","authors":["Kefan Chen","Chaerin Min","Linguang Zhang","Shreyas Hampali","Cem Keskin","Srinath Sridhar"],"pdf_url":"https://arxiv.org/pdf/2412.02690v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.16718v3","updated":"2024-12-03T18:56:47Z","published":"2024-11-22T23:59:12Z","title":"Neuro-Symbolic Evaluation of Text-to-Video Models using Formal\n Verification","summary":" Recent advancements in text-to-video models such as Sora, Gen-3, MovieGen,\nand CogVideoX are pushing the boundaries of synthetic video generation, with\nadoption seen in fields like robotics, autonomous driving, and entertainment.\nAs these models become prevalent, various metrics and benchmarks have emerged\nto evaluate the quality of the generated videos. However, these metrics\nemphasize visual quality and smoothness, neglecting temporal fidelity and\ntext-to-video alignment, which are crucial for safety-critical applications. To\naddress this gap, we introduce NeuS-V, a novel synthetic video evaluation\nmetric that rigorously assesses text-to-video alignment using neuro-symbolic\nformal verification techniques. Our approach first converts the prompt into a\nformally defined Temporal Logic (TL) specification and translates the generated\nvideo into an automaton representation. Then, it evaluates the text-to-video\nalignment by formally checking the video automaton against the TL\nspecification. Furthermore, we present a dataset of temporally extended prompts\nto evaluate state-of-the-art video generation models against our benchmark. We\nfind that NeuS-V demonstrates a higher correlation by over 5x with human\nevaluations when compared to existing metrics. 
Our evaluation further reveals\nthat current video generation models perform poorly on these temporally complex\nprompts, highlighting the need for future work in improving text-to-video\ngeneration capabilities.\n","authors":["S. P. Sharan","Minkyu Choi","Sahil Shah","Harsh Goel","Mohammad Omama","Sandeep Chinchali"],"pdf_url":"https://arxiv.org/pdf/2411.16718v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02687v1","updated":"2024-12-03T18:56:32Z","published":"2024-12-03T18:56:32Z","title":"SNOOPI: Supercharged One-step Diffusion Distillation with Proper\n Guidance","summary":" Recent approaches have yielded promising results in distilling multi-step\ntext-to-image diffusion models into one-step ones. The state-of-the-art\nefficient distillation technique, i.e., SwiftBrushv2 (SBv2), even surpasses the\nteacher model's performance with limited resources. However, our study reveals\nits instability when handling different diffusion model backbones due to using\na fixed guidance scale within the Variational Score Distillation (VSD) loss.\nAnother weakness of the existing one-step diffusion models is the missing\nsupport for negative prompt guidance, which is crucial in practical image\ngeneration. This paper presents SNOOPI, a novel framework designed to address\nthese limitations by enhancing the guidance in one-step diffusion models during\nboth training and inference. First, we effectively enhance training stability\nthrough Proper Guidance-SwiftBrush (PG-SB), which employs a random-scale\nclassifier-free guidance approach. By varying the guidance scale of both\nteacher models, we broaden their output distributions, resulting in a more\nrobust VSD loss that enables SB to perform effectively across diverse backbones\nwhile maintaining competitive performance. 
Second, we propose a training-free\nmethod called Negative-Away Steer Attention (NASA), which integrates negative\nprompts into one-step diffusion models via cross-attention to suppress\nundesired elements in generated images. Our experimental results show that our\nproposed methods significantly improve baseline models across various metrics.\nRemarkably, we achieve an HPSv2 score of 31.08, setting a new state-of-the-art\nbenchmark for one-step diffusion models.\n","authors":["Viet Nguyen","Anh Aengus Nguyen","Trung Dao","Khoi Nguyen","Cuong Pham","Toan Tran","Anh Tran"],"pdf_url":"https://arxiv.org/pdf/2412.02687v1.pdf","comment":"18 pages, 9 figures"},{"id":"http://arxiv.org/abs/2412.02684v1","updated":"2024-12-03T18:55:39Z","published":"2024-12-03T18:55:39Z","title":"AniGS: Animatable Gaussian Avatar from a Single Image with Inconsistent\n Gaussian Reconstruction","summary":" Generating animatable human avatars from a single image is essential for\nvarious digital human modeling applications. Existing 3D reconstruction methods\noften struggle to capture fine details in animatable models, while generative\napproaches for controllable animation, though avoiding explicit 3D modeling,\nsuffer from viewpoint inconsistencies in extreme poses and computational\ninefficiencies. In this paper, we address these challenges by leveraging the\npower of generative models to produce detailed multi-view canonical pose\nimages, which help resolve ambiguities in animatable human reconstruction. We\nthen propose a robust method for 3D reconstruction of inconsistent images,\nenabling real-time rendering during inference. Specifically, we adapt a\ntransformer-based video generation model to generate multi-view canonical pose\nimages and normal maps, pretraining on a large-scale video dataset to improve\ngeneralization. To handle view inconsistencies, we recast the reconstruction\nproblem as a 4D task and introduce an efficient 3D modeling approach using 4D\nGaussian Splatting. 
Experiments demonstrate that our method achieves\nphotorealistic, real-time animation of 3D human avatars from in-the-wild\nimages, showcasing its effectiveness and generalization capability.\n","authors":["Lingteng Qiu","Shenhao Zhu","Qi Zuo","Xiaodong Gu","Yuan Dong","Junfei Zhang","Chao Xu","Zhe Li","Weihao Yuan","Liefeng Bo","Guanying Chen","Zilong Dong"],"pdf_url":"https://arxiv.org/pdf/2412.02684v1.pdf","comment":"Project Page: https://lingtengqiu.github.io/2024/AniGS/"},{"id":"http://arxiv.org/abs/2412.02676v1","updated":"2024-12-03T18:51:39Z","published":"2024-12-03T18:51:39Z","title":"Planning-Guided Diffusion Policy Learning for Generalizable Contact-Rich\n Bimanual Manipulation","summary":" Contact-rich bimanual manipulation involves precise coordination of two arms\nto change object states through strategically selected contacts and motions.\nDue to the inherent complexity of these tasks, acquiring sufficient\ndemonstration data and training policies that generalize to unseen scenarios\nremain a largely unresolved challenge. Building on recent advances in planning\nthrough contacts, we introduce Generalizable Planning-Guided Diffusion Policy\nLearning (GLIDE), an approach that effectively learns to solve contact-rich\nbimanual manipulation tasks by leveraging model-based motion planners to\ngenerate demonstration data in high-fidelity physics simulation. Through\nefficient planning in randomized environments, our approach generates\nlarge-scale and high-quality synthetic motion trajectories for tasks involving\ndiverse objects and transformations. We then train a task-conditioned diffusion\npolicy via behavior cloning using these demonstrations. To tackle the\nsim-to-real gap, we propose a set of essential design options in feature\nextraction, task representation, action prediction, and data augmentation that\nenable learning robust prediction of smooth action sequences and generalization\nto unseen scenarios. 
Through experiments in both simulation and the real world,\nwe demonstrate that our approach can enable a bimanual robotic system to\neffectively manipulate objects of diverse geometries, dimensions, and physical\nproperties. Website: https://glide-manip.github.io/\n","authors":["Xuanlin Li","Tong Zhao","Xinghao Zhu","Jiuguang Wang","Tao Pang","Kuan Fang"],"pdf_url":"https://arxiv.org/pdf/2412.02676v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.01682v2","updated":"2024-12-03T18:44:43Z","published":"2024-12-02T16:29:06Z","title":"Diffusion Models with Anisotropic Gaussian Splatting for Image\n Inpainting","summary":" Image inpainting is a fundamental task in computer vision, aiming to restore\nmissing or corrupted regions in images realistically. While recent deep\nlearning approaches have significantly advanced the state-of-the-art,\nchallenges remain in maintaining structural continuity and generating coherent\ntextures, particularly in large missing areas. Diffusion models have shown\npromise in generating high-fidelity images but often lack the structural\nguidance necessary for realistic inpainting. We propose a novel inpainting\nmethod that combines diffusion models with anisotropic Gaussian splatting to\ncapture both local structures and global context effectively. By modeling\nmissing regions using anisotropic Gaussian functions that adapt to local image\ngradients, our approach provides structural guidance to the diffusion-based\ninpainting network. The Gaussian splat maps are integrated into the diffusion\nprocess, enhancing the model's ability to generate high-fidelity and\nstructurally coherent inpainting results. 
Extensive experiments demonstrate\nthat our method outperforms state-of-the-art techniques, producing visually\nplausible results with enhanced structural integrity and texture realism.\n","authors":["Jacob Fein-Ashley","Benjamin Fein-Ashley"],"pdf_url":"https://arxiv.org/pdf/2412.01682v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.01819v2","updated":"2024-12-03T18:44:06Z","published":"2024-12-02T18:57:41Z","title":"Switti: Designing Scale-Wise Transformers for Text-to-Image Synthesis","summary":" This work presents Switti, a scale-wise transformer for text-to-image\ngeneration. Starting from existing next-scale prediction AR models, we first\nexplore them for T2I generation and propose architectural modifications to\nimprove their convergence and overall performance. We then observe that\nself-attention maps of our pretrained scale-wise AR model exhibit weak\ndependence on preceding scales. Based on this insight, we propose a non-AR\ncounterpart facilitating ~11% faster sampling and lower memory usage while also\nachieving slightly better generation quality. Furthermore, we reveal that\nclassifier-free guidance at high-resolution scales is often unnecessary and can\neven degrade performance. By disabling guidance at these scales, we achieve an\nadditional sampling acceleration of ~20% and improve the generation of\nfine-grained details. 
Extensive human preference studies and automated\nevaluations show that Switti outperforms existing T2I AR models and competes\nwith state-of-the-art T2I diffusion models while being up to 7 times faster.\n","authors":["Anton Voronov","Denis Kuznedelev","Mikhail Khoroshikh","Valentin Khrulkov","Dmitry Baranchuk"],"pdf_url":"https://arxiv.org/pdf/2412.01819v2.pdf","comment":"19 pages, 21 figures"},{"id":"http://arxiv.org/abs/2405.14276v3","updated":"2024-12-03T18:42:56Z","published":"2024-05-23T07:53:01Z","title":"D-MiSo: Editing Dynamic 3D Scenes using Multi-Gaussians Soup","summary":" Over the past years, we have observed an abundance of approaches for modeling\ndynamic 3D scenes using Gaussian Splatting (GS). Such solutions use GS to\nrepresent the scene's structure and the neural network to model dynamics. Such\napproaches allow fast rendering and extracting each element of such a dynamic\nscene. However, modifying such objects over time is challenging. SC-GS (Sparse\nControlled Gaussian Splatting) enhanced with Deformed Control Points partially\nsolves this issue. However, this approach necessitates selecting elements that\nneed to be kept fixed, as well as centroids that should be adjusted throughout\nediting. Moreover, this task poses additional difficulties regarding the\nre-productivity of such editing. To address this, we propose Dynamic\nMulti-Gaussian Soup (D-MiSo), which allows us to model the mesh-inspired\nrepresentation of dynamic GS. Additionally, we propose a strategy of linking\nparameterized Gaussian splats, forming a Triangle Soup with the estimated mesh.\nConsequently, we can separately construct new trajectories for the 3D objects\ncomposing the scene. 
Thus, we can make the scene's dynamic editable over time\nor while maintaining partial dynamics.\n","authors":["Joanna Waczyńska","Piotr Borycki","Joanna Kaleta","Sławomir Tadeja","Przemysław Spurek"],"pdf_url":"https://arxiv.org/pdf/2405.14276v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16221v3","updated":"2024-12-03T18:39:15Z","published":"2023-12-24T11:05:10Z","title":"STRIDE: Single-video based Temporally Continuous Occlusion Robust 3D\n Pose Estimation","summary":" The capability to accurately estimate 3D human poses is crucial for diverse\nfields such as action recognition, gait recognition, and virtual/augmented\nreality. However, a persistent and significant challenge within this field is\nthe accurate prediction of human poses under conditions of severe occlusion.\nTraditional image-based estimators struggle with heavy occlusions due to a lack\nof temporal context, resulting in inconsistent predictions. While video-based\nmodels benefit from processing temporal data, they encounter limitations when\nfaced with prolonged occlusions that extend over multiple frames. This\nchallenge arises because these models struggle to generalize beyond their\ntraining datasets, and the variety of occlusions is hard to capture in the\ntraining data. Addressing these challenges, we propose STRIDE (Single-video\nbased TempoRally contInuous occlusion Robust 3D Pose Estimation), a novel\nTest-Time Training (TTT) approach to fit a human motion prior for each video.\nThis approach specifically handles occlusions that were not encountered during\nthe model's training. By employing STRIDE, we can refine a sequence of noisy\ninitial pose estimates into accurate, temporally coherent poses during test\ntime, effectively overcoming the limitations of prior methods. Our framework\ndemonstrates flexibility by being model-agnostic, allowing us to use any\noff-the-shelf 3D pose estimation method for improving robustness and temporal\nconsistency. 
We validate STRIDE's efficacy through comprehensive experiments on\nchallenging datasets like Occluded Human3.6M, Human3.6M, and OCMotion, where it\nnot only outperforms existing single-image and video-based pose estimation\nmodels but also showcases superior handling of substantial occlusions,\nachieving fast, robust, accurate, and temporally consistent 3D pose estimates.\nCode is made publicly available at https://github.com/take2rohit/stride\n","authors":["Rohit Lal","Saketh Bachu","Yash Garg","Arindam Dutta","Calvin-Khang Ta","Dripta S. Raychaudhuri","Hannah Dela Cruz","M. Salman Asif","Amit K. Roy-Chowdhury"],"pdf_url":"https://arxiv.org/pdf/2312.16221v3.pdf","comment":"Paper accepted at IEEE/CVF Winter Conference on Applications of\n Computer Vision (WACV)-2025"},{"id":"http://arxiv.org/abs/2312.07636v2","updated":"2024-12-03T18:35:27Z","published":"2023-12-12T10:25:31Z","title":"Go beyond End-to-End Training: Boosting Greedy Local Learning with\n Context Supply","summary":" Traditional end-to-end (E2E) training of deep networks necessitates storing\nintermediate activations for back-propagation, resulting in a large memory\nfootprint on GPUs and restricted model parallelization. As an alternative,\ngreedy local learning partitions the network into gradient-isolated modules and\ntrains supervisely based on local preliminary losses, thereby providing\nasynchronous and parallel training methods that substantially reduce memory\ncost. However, empirical experiments reveal that as the number of segmentations\nof the gradient-isolated module increases, the performance of the local\nlearning scheme degrades substantially, severely limiting its expansibility. To\navoid this issue, we theoretically analyze the greedy local learning from the\nstandpoint of information theory and propose a ContSup scheme, which\nincorporates context supply between isolated modules to compensate for\ninformation loss. Experiments on benchmark datasets (i.e. 
CIFAR, SVHN, STL-10)\nachieve SOTA results and indicate that our proposed method can significantly\nimprove the performance of greedy local learning with minimal memory and\ncomputational overhead, allowing for the boost of the number of isolated\nmodules. Our codes are available at https://github.com/Tab-ct/ContSup.\n","authors":["Chengting Yu","Fengzhao Zhang","Hanzhi Ma","Aili Wang","Erping Li"],"pdf_url":"https://arxiv.org/pdf/2312.07636v2.pdf","comment":"9 figures, 12 tables"},{"id":"http://arxiv.org/abs/2411.01547v2","updated":"2024-12-03T18:28:43Z","published":"2024-11-03T12:42:16Z","title":"Decoupling Dark Knowledge via Block-wise Logit Distillation for\n Feature-level Alignment","summary":" Knowledge Distillation (KD), a learning manner with a larger teacher network\nguiding a smaller student network, transfers dark knowledge from the teacher to\nthe student via logits or intermediate features, with the aim of producing a\nwell-performed lightweight model. Notably, many subsequent feature-based KD\nmethods outperformed the earliest logit-based KD method and iteratively\ngenerated numerous state-of-the-art distillation methods. Nevertheless, recent\nwork has uncovered the potential of the logit-based method, bringing the simple\nKD form based on logits back into the limelight. Features or logits? They\npartially implement the KD with entirely distinct perspectives; therefore,\nchoosing between logits and features is not straightforward. This paper\nprovides a unified perspective of feature alignment in order to obtain a better\ncomprehension of their fundamental distinction. 
Inheriting the design\nphilosophy and insights of feature-based and logit-based methods, we introduce\na block-wise logit distillation framework to apply implicit logit-based feature\nalignment by gradually replacing teacher's blocks as intermediate\nstepping-stone models to bridge the gap between the student and the teacher.\nOur method obtains comparable or superior results to state-of-the-art\ndistillation methods. This paper demonstrates the great potential of combining\nlogit and features, and we hope it will inspire future research to revisit KD\nfrom a higher vantage point.\n","authors":["Chengting Yu","Fengzhao Zhang","Ruizhe Chen","Aili Wang","Zuozhu Liu","Shurun Tan","Er-Ping Li"],"pdf_url":"https://arxiv.org/pdf/2411.01547v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02643v1","updated":"2024-12-03T18:15:34Z","published":"2024-12-03T18:15:34Z","title":"A Bidirectional Long Short Term Memory Approach for Infrastructure\n Health Monitoring Using On-board Vibration Response","summary":" The growing volume of available infrastructural monitoring data enables the\ndevelopment of powerful datadriven approaches to estimate infrastructure health\nconditions using direct measurements. This paper proposes a deep learning\nmethodology to estimate infrastructure physical parameters, such as railway\ntrack stiffness, using drive-by vibration response signals. The proposed method\nemploys a Long Short-term Memory (LSTM) feature extractor accounting for\ntemporal dependencies in the feature extraction phase, and a bidirectional Long\nShort-term Memory (BiLSTM) networks to leverage bidirectional temporal\ndependencies in both the forward and backward paths of the drive-by vibration\nresponse in condition estimation phase. 
Additionally, a framing approach is\nemployed to enhance the resolution of the monitoring task to the beam level by\nsegmenting the vibration signal into frames equal to the distance between\nindividual beams, centering the frames over the beam nodes. The proposed\nLSTM-BiLSTM model offers a versatile tool for various bridge and railway\ninfrastructure conditions monitoring using direct drive-by vibration response\nmeasurements. The results demonstrate the potential of incorporating temporal\nanalysis in the feature extraction phase and emphasize the pivotal role of\nbidirectional temporal information in infrastructure health condition\nestimation. The proposed methodology can accurately and automatically estimate\nrailway track stiffness and identify local stiffness reductions in the presence\nof noise using drive-by measurements. An illustrative case study of\nvehicle-track interaction simulation is used to demonstrate the performance of\nthe proposed model, achieving a maximum mean absolute percentage error of 1.7%\nand 0.7% in estimating railpad and ballast stiffness, respectively.\n","authors":["R. R. Samani","A. Nunez","B. De Schutter"],"pdf_url":"https://arxiv.org/pdf/2412.02643v1.pdf","comment":"17 pages; Accepted for the presentation at Transportation Research\n Board (TRB) Annual Meeting, and under review in the Journal of Transportation\n Research Record (TRR)"},{"id":"http://arxiv.org/abs/2412.02642v1","updated":"2024-12-03T18:13:51Z","published":"2024-12-03T18:13:51Z","title":"Robust soybean seed yield estimation using high-throughput ground robot\n videos","summary":" We present a novel method for soybean (Glycine max (L.) Merr.) yield\nestimation leveraging high throughput seed counting via computer vision and\ndeep learning techniques. 
Traditional methods for collecting yield data are\nlabor-intensive, costly, prone to equipment failures at critical data\ncollection times, and require transportation of equipment across field sites.\nComputer vision, the field of teaching computers to interpret visual data,\nallows us to extract detailed yield information directly from images. By\ntreating it as a computer vision task, we report a more efficient alternative,\nemploying a ground robot equipped with fisheye cameras to capture comprehensive\nvideos of soybean plots from which images are extracted in a variety of\ndevelopment programs. These images are processed through the P2PNet-Yield\nmodel, a deep learning framework where we combined a Feature Extraction Module\n(the backbone of the P2PNet-Soy) and a Yield Regression Module to estimate seed\nyields of soybean plots. Our results are built on three years of yield testing\nplot data - 8500 in 2021, 2275 in 2022, and 650 in 2023. With these datasets,\nour approach incorporates several innovations to further improve the accuracy\nand generalizability of the seed counting and yield estimation architecture,\nsuch as the fisheye image correction and data augmentation with random sensor\neffects. The P2PNet-Yield model achieved a genotype ranking accuracy score of\nup to 83%. It demonstrates up to a 32% reduction in time to collect yield data\nas well as costs associated with traditional yield estimation, offering a\nscalable solution for breeding programs and agricultural productivity\nenhancement.\n","authors":["Jiale Feng","Samuel W. 
Blair","Timilehin Ayanlade","Aditya Balu","Baskar Ganapathysubramanian","Arti Singh","Soumik Sarkar","Asheesh K Singh"],"pdf_url":"https://arxiv.org/pdf/2412.02642v1.pdf","comment":"23 pages, 12 figures, 2 tables"},{"id":"http://arxiv.org/abs/2412.02635v1","updated":"2024-12-03T18:04:42Z","published":"2024-12-03T18:04:42Z","title":"MetaShadow: Object-Centered Shadow Detection, Removal, and Synthesis","summary":" Shadows are often under-considered or even ignored in image editing\napplications, limiting the realism of the edited results. In this paper, we\nintroduce MetaShadow, a three-in-one versatile framework that enables\ndetection, removal, and controllable synthesis of shadows in natural images in\nan object-centered fashion. MetaShadow combines the strengths of two\ncooperative components: Shadow Analyzer, for object-centered shadow detection\nand removal, and Shadow Synthesizer, for reference-based controllable shadow\nsynthesis. Notably, we optimize the learning of the intermediate features from\nShadow Analyzer to guide Shadow Synthesizer to generate more realistic shadows\nthat blend seamlessly with the scene. Extensive evaluations on multiple shadow\nbenchmark datasets show significant improvements of MetaShadow over the\nexisting state-of-the-art methods on object-centered shadow detection, removal,\nand synthesis. 
MetaShadow excels in image-editing tasks such as object removal,\nrelocation, and insertion, pushing the boundaries of object-centered image\nediting.\n","authors":["Tianyu Wang","Jianming Zhang","Haitian Zheng","Zhihong Ding","Scott Cohen","Zhe Lin","Wei Xiong","Chi-Wing Fu","Luis Figueroa","Soo Ye Kim"],"pdf_url":"https://arxiv.org/pdf/2412.02635v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02632v1","updated":"2024-12-03T18:01:45Z","published":"2024-12-03T18:01:45Z","title":"Scaling Image Tokenizers with Grouped Spherical Quantization","summary":" Vision tokenizers have gained a lot of attraction due to their scalability\nand compactness; previous works depend on old-school GAN-based hyperparameters,\nbiased comparisons, and a lack of comprehensive analysis of the scaling\nbehaviours. To tackle those issues, we introduce Grouped Spherical Quantization\n(GSQ), featuring spherical codebook initialization and lookup regularization to\nconstrain codebook latent to a spherical surface. Our empirical analysis of\nimage tokenizer training strategies demonstrates that GSQ-GAN achieves superior\nreconstruction quality over state-of-the-art methods with fewer training\niterations, providing a solid foundation for scaling studies. Building on this,\nwe systematically examine the scaling behaviours of GSQ, specifically in latent\ndimensionality, codebook size, and compression ratios, and their impact on\nmodel performance. Our findings reveal distinct behaviours at high and low\nspatial compression levels, underscoring challenges in representing\nhigh-dimensional latent spaces. We show that GSQ can restructure\nhigh-dimensional latent into compact, low-dimensional spaces, thus enabling\nefficient scaling with improved quality. 
As a result, GSQ-GAN achieves a 16x\ndown-sampling with a reconstruction FID (rFID) of 0.50.\n","authors":["Jiangtao Wang","Zhen Qin","Yifan Zhang","Vincent Tao Hu","Björn Ommer","Rania Briq","Stefan Kesselheim"],"pdf_url":"https://arxiv.org/pdf/2412.02632v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02631v1","updated":"2024-12-03T17:58:07Z","published":"2024-12-03T17:58:07Z","title":"Sharp-It: A Multi-view to Multi-view Diffusion Model for 3D Synthesis\n and Manipulation","summary":" Advancements in text-to-image diffusion models have led to significant\nprogress in fast 3D content creation. One common approach is to generate a set\nof multi-view images of an object, and then reconstruct it into a 3D model.\nHowever, this approach bypasses the use of a native 3D representation of the\nobject and is hence prone to geometric artifacts and limited in controllability\nand manipulation capabilities. An alternative approach involves native 3D\ngenerative models that directly produce 3D representations. These models,\nhowever, are typically limited in their resolution, resulting in lower quality\n3D objects. In this work, we bridge the quality gap between methods that\ndirectly generate 3D representations and ones that reconstruct 3D objects from\nmulti-view images. We introduce a multi-view to multi-view diffusion model\ncalled Sharp-It, which takes a 3D consistent set of multi-view images rendered\nfrom a low-quality object and enriches its geometric details and texture. The\ndiffusion model operates on the multi-view set in parallel, in the sense that\nit shares features across the generated views. A high-quality 3D model can then\nbe reconstructed from the enriched multi-view set. By leveraging the advantages\nof both 2D and 3D approaches, our method offers an efficient and controllable\nmethod for high-quality 3D content creation. 
We demonstrate that Sharp-It\nenables various 3D applications, such as fast synthesis, editing, and\ncontrolled generation, while attaining high-quality assets.\n","authors":["Yiftach Edelstein","Or Patashnik","Dana Cohen-Bar","Lihi Zelnik-Manor"],"pdf_url":"https://arxiv.org/pdf/2412.02631v1.pdf","comment":"Project page at https://yiftachede.github.io/Sharp-It/"},{"id":"http://arxiv.org/abs/2412.02627v1","updated":"2024-12-03T17:56:23Z","published":"2024-12-03T17:56:23Z","title":"Continual Learning of Personalized Generative Face Models with\n Experience Replay","summary":" We introduce a novel continual learning problem: how to sequentially update\nthe weights of a personalized 2D and 3D generative face model as new batches of\nphotos in different appearances, styles, poses, and lighting are captured\nregularly. We observe that naive sequential fine-tuning of the model leads to\ncatastrophic forgetting of past representations of the individual's face. We\nthen demonstrate that a simple random sampling-based experience replay method\nis effective at mitigating catastrophic forgetting when a relatively large\nnumber of images can be stored and replayed. However, for long-term deployment\nof these models with relatively smaller storage, this simple random\nsampling-based replay technique also forgets past representations. Thus, we\nintroduce a novel experience replay algorithm that combines random sampling\nwith StyleGAN's latent space to represent the buffer as an optimal convex hull.\nWe observe that our proposed convex hull-based experience replay is more\neffective in preventing forgetting than a random sampling baseline and the\nlower bound.\n","authors":["Annie N. Wang","Luchao Qi","Roni Sengupta"],"pdf_url":"https://arxiv.org/pdf/2412.02627v1.pdf","comment":"Accepted to WACV 2025. Project page (incl. 
supplementary materials):\n https://anniedde.github.io/personalizedcontinuallearning.github.io/"},{"id":"http://arxiv.org/abs/2412.02617v1","updated":"2024-12-03T17:44:23Z","published":"2024-12-03T17:44:23Z","title":"Improving Dynamic Object Interactions in Text-to-Video Generation with\n AI Feedback","summary":" Large text-to-video models hold immense potential for a wide range of\ndownstream applications. However, these models struggle to accurately depict\ndynamic object interactions, often resulting in unrealistic movements and\nfrequent violations of real-world physics. One solution inspired by large\nlanguage models is to align generated outputs with desired outcomes using\nexternal feedback. This enables the model to refine its responses autonomously,\neliminating extensive manual data collection. In this work, we investigate the\nuse of feedback to enhance the object dynamics in text-to-video models. We aim\nto answer a critical question: what types of feedback, paired with which\nspecific self-improvement algorithms, can most effectively improve text-video\nalignment and realistic object interactions? We begin by deriving a unified\nprobabilistic objective for offline RL finetuning of text-to-video models. This\nperspective highlights how design elements in existing algorithms like KL\nregularization and policy projection emerge as specific choices within a\nunified framework. We then use derived methods to optimize a set of text-video\nalignment metrics (e.g., CLIP scores, optical flow), but notice that they often\nfail to align with human perceptions of generation quality. To address this\nlimitation, we propose leveraging vision-language models to provide more\nnuanced feedback specifically tailored to object dynamics in videos. 
Our\nexperiments demonstrate that our method can effectively optimize a wide variety\nof rewards, with binary AI feedback driving the most significant improvements\nin video quality for dynamic interactions, as confirmed by both AI and human\nevaluations. Notably, we observe substantial gains when using reward signals\nderived from AI feedback, particularly in scenarios involving complex\ninteractions between multiple objects and realistic depictions of objects\nfalling.\n","authors":["Hiroki Furuta","Heiga Zen","Dale Schuurmans","Aleksandra Faust","Yutaka Matsuo","Percy Liang","Sherry Yang"],"pdf_url":"https://arxiv.org/pdf/2412.02617v1.pdf","comment":"Website: https://sites.google.com/view/aif-dynamic-t2v/"},{"id":"http://arxiv.org/abs/2412.02611v1","updated":"2024-12-03T17:41:23Z","published":"2024-12-03T17:41:23Z","title":"AV-Odyssey Bench: Can Your Multimodal LLMs Really Understand\n Audio-Visual Information?","summary":" Recently, multimodal large language models (MLLMs), such as GPT-4o, Gemini\n1.5 Pro, and Reka Core, have expanded their capabilities to include vision and\naudio modalities. While these models demonstrate impressive performance across\na wide range of audio-visual applications, our proposed DeafTest reveals that\nMLLMs often struggle with simple tasks humans find trivial: 1) determining\nwhich of two sounds is louder, and 2) determining which of two sounds has a\nhigher pitch. Motivated by these observations, we introduce AV-Odyssey Bench, a\ncomprehensive audio-visual benchmark designed to assess whether those MLLMs can\ntruly understand the audio-visual information. This benchmark encompasses 4,555\ncarefully crafted problems, each incorporating text, visual, and audio\ncomponents. To successfully infer answers, models must effectively leverage\nclues from both visual and audio inputs. 
To ensure precise and objective\nevaluation of MLLM responses, we have structured the questions as\nmultiple-choice, eliminating the need for human evaluation or LLM-assisted\nassessment. We benchmark a series of closed-source and open-source models and\nsummarize the observations. By revealing the limitations of current models, we\naim to provide useful insight for future dataset collection and model\ndevelopment.\n","authors":["Kaixiong Gong","Kaituo Feng","Bohao Li","Yibing Wang","Mofan Cheng","Shijia Yang","Jiaming Han","Benyou Wang","Yutong Bai","Zhuoran Yang","Xiangyu Yue"],"pdf_url":"https://arxiv.org/pdf/2412.02611v1.pdf","comment":"Project page: https://av-odyssey.github.io/"},{"id":"http://arxiv.org/abs/2412.02601v1","updated":"2024-12-03T17:32:05Z","published":"2024-12-03T17:32:05Z","title":"MERGE: Multi-faceted Hierarchical Graph-based GNN for Gene Expression\n Prediction from Whole Slide Histopathology Images","summary":" Recent advances in Spatial Transcriptomics (ST) pair histology images with\nspatially resolved gene expression profiles, enabling predictions of gene\nexpression across different tissue locations based on image patches. This opens\nup new possibilities for enhancing whole slide image (WSI) prediction tasks\nwith localized gene expression. However, existing methods fail to fully\nleverage the interactions between different tissue locations, which are crucial\nfor accurate joint prediction. To address this, we introduce MERGE\n(Multi-faceted hiErarchical gRaph for Gene Expressions), which combines a\nmulti-faceted hierarchical graph construction strategy with graph neural\nnetworks (GNN) to improve gene expression predictions from WSIs. By clustering\ntissue image patches based on both spatial and morphological features, and\nincorporating intra- and inter-cluster edges, our approach fosters interactions\nbetween distant tissue locations during GNN learning. 
As an additional\ncontribution, we evaluate different data smoothing techniques that are\nnecessary to mitigate artifacts in ST data, often caused by technical\nimperfections. We advocate for adopting gene-aware smoothing methods that are\nmore biologically justified. Experimental results on gene expression prediction\nshow that our GNN method outperforms state-of-the-art techniques across\nmultiple metrics.\n","authors":["Aniruddha Ganguly","Debolina Chatterjee","Wentao Huang","Jie Zhang","Alisa Yurovsky","Travis Steele Johnson","Chao Chen"],"pdf_url":"https://arxiv.org/pdf/2412.02601v1.pdf","comment":"Main Paper: 8 pages, Supplementary Material: 9 pages, Figures: 16"},{"id":"http://arxiv.org/abs/2412.02596v1","updated":"2024-12-03T17:29:00Z","published":"2024-12-03T17:29:00Z","title":"Class-wise Autoencoders Measure Classification Difficulty And Detect\n Label Mistakes","summary":" We introduce a new framework for analyzing classification datasets based on\nthe ratios of reconstruction errors between autoencoders trained on individual\nclasses. This analysis framework enables efficient characterization of datasets\non the sample, class, and entire dataset levels. We define reconstruction error\nratios (RERs) that probe classification difficulty and allow its decomposition\ninto (1) finite sample size and (2) Bayes error and decision-boundary\ncomplexity. Through systematic study across 19 popular visual datasets, we find\nthat our RER-based dataset difficulty probe strongly correlates with error rate\nfor state-of-the-art (SOTA) classification models. By interpreting sample-level\nclassification difficulty as a label mistakenness score, we further find that\nRERs achieve SOTA performance on mislabel detection tasks on hard datasets\nunder symmetric and asymmetric label noise. Our code is publicly available at\nhttps://github.com/voxel51/reconstruction-error-ratios.\n","authors":["Jacob Marks","Brent A. Griffin","Jason J. 
Corso"],"pdf_url":"https://arxiv.org/pdf/2412.02596v1.pdf","comment":"30 pages, 18 figures"},{"id":"http://arxiv.org/abs/2412.02592v1","updated":"2024-12-03T17:23:47Z","published":"2024-12-03T17:23:47Z","title":"OCR Hinders RAG: Evaluating the Cascading Impact of OCR on\n Retrieval-Augmented Generation","summary":" Retrieval-augmented Generation (RAG) enhances Large Language Models (LLMs) by\nintegrating external knowledge to reduce hallucinations and incorporate\nup-to-date information without retraining. As an essential part of RAG,\nexternal knowledge bases are commonly built by extracting structured data from\nunstructured PDF documents using Optical Character Recognition (OCR). However,\ngiven the imperfect prediction of OCR and the inherent non-uniform\nrepresentation of structured data, knowledge bases inevitably contain various\nOCR noises. In this paper, we introduce OHRBench, the first benchmark for\nunderstanding the cascading impact of OCR on RAG systems. OHRBench includes 350\ncarefully selected unstructured PDF documents from six real-world RAG\napplication domains, along with Q&As derived from multimodal elements in\ndocuments, challenging existing OCR solutions used for RAG To better understand\nOCR's impact on RAG systems, we identify two primary types of OCR noise:\nSemantic Noise and Formatting Noise and apply perturbation to generate a set of\nstructured data with varying degrees of each OCR noise. Using OHRBench, we\nfirst conduct a comprehensive evaluation of current OCR solutions and reveal\nthat none is competent for constructing high-quality knowledge bases for RAG\nsystems. We then systematically evaluate the impact of these two noise types\nand demonstrate the vulnerability of RAG systems. Furthermore, we discuss the\npotential of employing Vision-Language Models (VLMs) without OCR in RAG\nsystems. 
Code: https://github.com/opendatalab/OHR-Bench\n","authors":["Junyuan Zhang","Qintong Zhang","Bin Wang","Linke Ouyang","Zichen Wen","Ying Li","Ka-Ho Chow","Conghui He","Wentao Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.02592v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.06219v4","updated":"2024-12-03T17:23:07Z","published":"2024-09-10T05:05:34Z","title":"Denoising: A Powerful Building-Block for Imaging, Inverse Problems, and\n Machine Learning","summary":" Denoising, the process of reducing random fluctuations in a signal to\nemphasize essential patterns, has been a fundamental problem of interest since\nthe dawn of modern scientific inquiry. Recent denoising techniques,\nparticularly in imaging, have achieved remarkable success, nearing theoretical\nlimits by some measures. Yet, despite tens of thousands of research papers, the\nwide-ranging applications of denoising beyond noise removal have not been fully\nrecognized. This is partly due to the vast and diverse literature, making a\nclear overview challenging.\n This paper aims to address this gap. We present a clarifying perspective on\ndenoisers, their structure, and desired properties. 
We emphasize the increasing\nimportance of denoising and showcase its evolution into an essential building\nblock for complex tasks in imaging, inverse problems, and machine learning.\nDespite its long history, the community continues to uncover unexpected and\ngroundbreaking uses for denoising, further solidifying its place as a\ncornerstone of scientific and engineering practice.\n","authors":["Peyman Milanfar","Mauricio Delbracio"],"pdf_url":"https://arxiv.org/pdf/2409.06219v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.00177v2","updated":"2024-12-03T17:21:41Z","published":"2024-11-29T18:59:11Z","title":"LumiNet: Latent Intrinsics Meets Diffusion Models for Indoor Scene\n Relighting","summary":" We introduce LumiNet, a novel architecture that leverages generative models\nand latent intrinsic representations for effective lighting transfer. Given a\nsource image and a target lighting image, LumiNet synthesizes a relit version\nof the source scene that captures the target's lighting. Our approach makes two\nkey contributions: a data curation strategy from the StyleGAN-based relighting\nmodel for our training, and a modified diffusion-based ControlNet that\nprocesses both latent intrinsic properties from the source image and latent\nextrinsic properties from the target image. We further improve lighting\ntransfer through a learned adaptor (MLP) that injects the target's latent\nextrinsic properties via cross-attention and fine-tuning.\n Unlike traditional ControlNet, which generates images with conditional maps\nfrom a single scene, LumiNet processes latent representations from two\ndifferent images - preserving geometry and albedo from the source while\ntransferring lighting characteristics from the target. 
Experiments demonstrate\nthat our method successfully transfers complex lighting phenomena including\nspecular highlights and indirect illumination across scenes with varying\nspatial layouts and materials, outperforming existing approaches on challenging\nindoor scenes using only images as input.\n","authors":["Xiaoyan Xing","Konrad Groh","Sezer Karaoglu","Theo Gevers","Anand Bhattad"],"pdf_url":"https://arxiv.org/pdf/2412.00177v2.pdf","comment":"Project page: https://luminet-relight.github.io"},{"id":"http://arxiv.org/abs/2412.02589v1","updated":"2024-12-03T17:18:33Z","published":"2024-12-03T17:18:33Z","title":"MedTet: An Online Motion Model for 4D Heart Reconstruction","summary":" We present a novel approach to reconstruction of 3D cardiac motion from\nsparse intraoperative data. While existing methods can accurately reconstruct\n3D organ geometries from full 3D volumetric imaging, they cannot be used during\nsurgical interventions where usually limited observed data, such as a few 2D\nframes or 1D signals, is available in real-time. We propose a versatile\nframework for reconstructing 3D motion from such partial data. It discretizes\nthe 3D space into a deformable tetrahedral grid with signed distance values,\nproviding implicit unlimited resolution while maintaining explicit control over\nmotion dynamics. Given an initial 3D model reconstructed from pre-operative\nfull volumetric data, our system, equipped with an universal observation\nencoder, can reconstruct coherent 3D cardiac motion from full 3D volumes, a few\n2D MRI slices or even 1D signals. Extensive experiments on cardiac intervention\nscenarios demonstrate our ability to generate plausible and anatomically\nconsistent 3D motion reconstructions from various sparse real-time\nobservations, highlighting its potential for multimodal cardiac imaging. 
Our\ncode and model will be made available at https://github.com/Scalsol/MedTet.\n","authors":["Yihong Chen","Jiancheng Yang","Deniz Sayin Mercadier","Hieu Le","Pascal Fua"],"pdf_url":"https://arxiv.org/pdf/2412.02589v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.14494v2","updated":"2024-12-03T17:07:25Z","published":"2024-11-20T19:24:30Z","title":"dc-GAN: Dual-Conditioned GAN for Face Demorphing From a Single Morph","summary":" A facial morph is an image created by combining two face images pertaining to\ntwo distinct identities. Face demorphing inverts the process and tries to\nrecover the original images constituting a facial morph. While morph attack\ndetection (MAD) techniques can be used to flag morph images, they do not\ndivulge any visual information about the faces used to create them. Demorphing\nhelps address this problem. Existing demorphing techniques are either very\nrestrictive (assume identities during testing) or produce feeble outputs (both\noutputs look very similar). In this paper, we overcome these issues by\nproposing dc-GAN, a novel GAN-based demorphing method conditioned on the morph\nimages. Our method overcomes morph-replication and produces high quality\nreconstructions of the bonafide images used to create the morphs. Moreover, our\nmethod is highly generalizable across demorphing paradigms\n(differential/reference-free). We conduct experiments on AMSL, FRLL-Morphs and\nMorDiff datasets to showcase the efficacy of our method.\n","authors":["Nitish Shukla","Arun Ross"],"pdf_url":"https://arxiv.org/pdf/2411.14494v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02575v1","updated":"2024-12-03T17:02:40Z","published":"2024-12-03T17:02:40Z","title":"Copy-Move Forgery Detection and Question Answering for Remote Sensing\n Image","summary":" This paper introduces the task of Remote Sensing Copy-Move Question Answering\n(RSCMQA). 
Unlike traditional Remote Sensing Visual Question Answering (RSVQA),\nRSCMQA focuses on interpreting complex tampering scenarios and inferring\nrelationships between objects. Based on the practical needs of national defense\nsecurity and land resource monitoring, we have developed an accurate and\ncomprehensive global dataset for remote sensing image copy-move question\nanswering, named RS-CMQA-2.1M. These images were collected from 29 different\nregions across 14 countries. Additionally, we have refined a balanced dataset,\nRS-CMQA-B, to address the long-standing issue of long-tail data in the remote\nsensing field. Furthermore, we propose a region-discriminative guided\nmultimodal CMQA model, which enhances the accuracy of answering questions about\ntampered images by leveraging prompt about the differences and connections\nbetween the source and tampered domains. Extensive experiments demonstrate that\nour method provides a stronger benchmark for RS-CMQA compared to general VQA\nand RSVQA models. Our dataset and code are available at\nhttps://github.com/shenyedepisa/RSCMQA.\n","authors":["Ze Zhang","Enyuan Zhao","Ziyi Wan","Jie Nie","Xinyue Liang","Lei Huang"],"pdf_url":"https://arxiv.org/pdf/2412.02575v1.pdf","comment":"7 figs, 7 tables"},{"id":"http://arxiv.org/abs/2412.02573v1","updated":"2024-12-03T16:56:10Z","published":"2024-12-03T16:56:10Z","title":"Remote Sensing Temporal Vision-Language Models: A Comprehensive Survey","summary":" Temporal image analysis in remote sensing has traditionally centered on\nchange detection, which identifies regions of change between images captured at\ndifferent times. However, change detection remains limited by its focus on\nvisual-level interpretation, often lacking contextual or descriptive\ninformation. 
The rise of Vision-Language Models (VLMs) has introduced a new\ndimension to remote sensing temporal image analysis by integrating visual\ninformation with natural language, creating an avenue for advanced\ninterpretation of temporal image changes. Remote Sensing Temporal VLMs\n(RSTVLMs) allow for dynamic interactions, generating descriptive captions,\nanswering questions, and providing a richer semantic understanding of temporal\nimages. This temporal vision-language capability is particularly valuable for\ncomplex remote sensing applications, where higher-level insights are crucial.\nThis paper comprehensively reviews the progress of RSTVLM research, with a\nfocus on the latest VLM applications for temporal image analysis. We categorize\nand discuss core methodologies, datasets, and metrics, highlight recent\nadvances in temporal vision-language tasks, and outline key challenges and\nfuture directions for research in this emerging field. This survey fills a\ncritical gap in the literature by providing an integrated overview of RSTVLM,\noffering a foundation for further advancements in remote sensing temporal image\nunderstanding. We will keep tracing related works at\n\\url{https://github.com/Chen-Yang-Liu/Awesome-RS-Temporal-VLM}\n","authors":["Chenyang Liu","Jiafan Zhang","Keyan Chen","Man Wang","Zhengxia Zou","Zhenwei Shi"],"pdf_url":"https://arxiv.org/pdf/2412.02573v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02568v1","updated":"2024-12-03T16:54:46Z","published":"2024-12-03T16:54:46Z","title":"Segmentation of Coronary Artery Stenosis in X-ray Angiography using\n Mamba Models","summary":" Coronary artery disease stands as one of the primary contributors to global\nmortality rates. The automated identification of coronary artery stenosis from\nX-ray images plays a critical role in the diagnostic process for coronary heart\ndisease. 
This task is challenging due to the complex structure of coronary\narteries, intrinsic noise in X-ray images, and the fact that stenotic coronary\narteries appear narrow and blurred in X-ray angiographies. This study employs\nfive different variants of the Mamba-based model and one variant of the Swin\nTransformer-based model, primarily based on the U-Net architecture, for the\nlocalization of stenosis in Coronary artery disease. Our best results showed an\nF1 score of 68.79% for the U-Mamba BOT model, representing an 11.8% improvement\nover the semi-supervised approach.\n","authors":["Ali Rostami","Fatemeh Fouladi","Hedieh Sajedi"],"pdf_url":"https://arxiv.org/pdf/2412.02568v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02565v1","updated":"2024-12-03T16:53:58Z","published":"2024-12-03T16:53:58Z","title":"SJTU:Spatial judgments in multimodal models towards unified segmentation\n through coordinate detection","summary":" Despite advances in vision-language understanding, implementing image\nsegmentation within multimodal architectures remains a fundamental challenge in\nmodern artificial intelligence systems. Existing vision-language models, which\nprimarily rely on backbone architectures or CLIP-based embedding learning,\ndemonstrate inherent limitations in fine-grained spatial localization and\noperational capabilities. This paper introduces SJTU: Spatial Judgments in\nmultimodal models - Towards Unified segmentation through coordinate detection,\na novel framework that leverages spatial coordinate understanding to bridge\nvision-language interaction and precise segmentation, enabling accurate target\nidentification through natural language instructions. The framework proposes a\nnovel approach for integrating segmentation techniques with vision-language\nmodels based on multimodal spatial inference. 
By leveraging normalized\ncoordinate detection for bounding boxes and translating it into actionable\nsegmentation outputs, we explore the possibility of integrating multimodal\nspatial and language representations. Based on the proposed technical approach,\nthe framework demonstrates superior performance on various benchmark datasets\nas well as accurate object segmentation. Results on the COCO 2017 dataset for\ngeneral object detection and Pascal VOC datasets for semantic segmentation\ndemonstrate the generalization capabilities of the framework.\n","authors":["Joongwon Chae","Zhenyu Wang","Peiwu Qin"],"pdf_url":"https://arxiv.org/pdf/2412.02565v1.pdf","comment":"15 pages, 3 figures"},{"id":"http://arxiv.org/abs/2412.02545v1","updated":"2024-12-03T16:37:23Z","published":"2024-12-03T16:37:23Z","title":"ShadowHack: Hacking Shadows via Luminance-Color Divide and Conquer","summary":" Shadows introduce challenges such as reduced brightness, texture\ndeterioration, and color distortion in images, complicating a holistic\nsolution. This study presents \\textbf{ShadowHack}, a divide-and-conquer\nstrategy that tackles these complexities by decomposing the original task into\nluminance recovery and color remedy. To brighten shadow regions and repair the\ncorrupted textures in the luminance space, we customize LRNet, a U-shaped\nnetwork with a rectified outreach attention module, to enhance information\ninteraction and recalibrate contaminated attention maps. With luminance\nrecovered, CRNet then leverages cross-attention mechanisms to revive vibrant\ncolors, producing visually compelling results. Extensive experiments on\nmultiple datasets are conducted to demonstrate the superiority of ShadowHack\nover existing state-of-the-art solutions both quantitatively and qualitatively,\nhighlighting the effectiveness of our design. 
Our code will be made publicly\navailable at https://github.com/lime-j/ShadowHack\n","authors":["Jin Hu","Mingjia Li","Xiaojie Guo"],"pdf_url":"https://arxiv.org/pdf/2412.02545v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02542v1","updated":"2024-12-03T16:34:49Z","published":"2024-12-03T16:34:49Z","title":"Unveiling Concept Attribution in Diffusion Models","summary":" Diffusion models have shown remarkable abilities in generating realistic and\nhigh-quality images from text prompts. However, a trained model remains\nblack-box; little do we know about the role of its components in exhibiting a\nconcept such as objects or styles. Recent works employ causal tracing to\nlocalize layers storing knowledge in generative models without showing how\nthose layers contribute to the target concept. In this work, we approach the\nmodel interpretability problem from a more general perspective and pose a\nquestion: \\textit{``How do model components work jointly to demonstrate\nknowledge?''}. We adapt component attribution to decompose diffusion models,\nunveiling how a component contributes to a concept. Our framework allows\neffective model editing, in particular, we can erase a concept from diffusion\nmodels by removing positive components while remaining knowledge of other\nconcepts. Surprisingly, we also show there exist components that contribute\nnegatively to a concept, which has not been discovered in the knowledge\nlocalization approach. Experimental results confirm the role of positive and\nnegative components pinpointed by our framework, depicting a complete view of\ninterpreting generative models. Our code is available at\n\\url{https://github.com/mail-research/CAD-attribution4diffusion}\n","authors":["Quang H. Nguyen","Hoang Phan","Khoa D. 
Doan"],"pdf_url":"https://arxiv.org/pdf/2412.02542v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.00903v2","updated":"2024-12-03T16:32:27Z","published":"2024-12-01T17:37:25Z","title":"Tomographic SAR Reconstruction for Forest Height Estimation","summary":" Tree height estimation serves as an important proxy for biomass estimation in\necological and forestry applications. While traditional methods such as\nphotogrammetry and Light Detection and Ranging (LiDAR) offer accurate height\nmeasurements, their application on a global scale is often cost-prohibitive and\nlogistically challenging. In contrast, remote sensing techniques, particularly\n3D tomographic reconstruction from Synthetic Aperture Radar (SAR) imagery,\nprovide a scalable solution for global height estimation. SAR images have been\nused in earth observation contexts due to their ability to work in all\nweathers, unobscured by clouds. In this study, we use deep learning to estimate\nforest canopy height directly from 2D Single Look Complex (SLC) images, a\nderivative of SAR. Our method attempts to bypass traditional tomographic signal\nprocessing, potentially reducing latency from SAR capture to end product. We\nalso quantify the impact of varying numbers of SLC images on height estimation\naccuracy, aiming to inform future satellite operations and optimize data\ncollection strategies. Compared to full tomographic processing combined with\ndeep learning, our minimal method (partial processing + deep learning) falls\nshort, with an error 16-21\\% higher, highlighting the continuing relevance of\ngeometric signal processing.\n","authors":["Grace Colverd","Jumpei Takami","Laura Schade","Karol Bot","Joseph A. 
Gallego-Mejia"],"pdf_url":"https://arxiv.org/pdf/2412.00903v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.15596v2","updated":"2024-12-03T16:28:32Z","published":"2024-11-23T16:13:40Z","title":"Comparative Analysis of Resource-Efficient CNN Architectures for Brain\n Tumor Classification","summary":" Accurate brain tumor classification in MRI images is critical for timely\ndiagnosis and treatment planning. While deep learning models like ResNet-18,\nVGG-16 have shown high accuracy, they often come with increased complexity and\ncomputational demands. This study presents a comparative analysis of effective\nyet simple Convolutional Neural Network (CNN) architecture and pre-trained\nResNet18, and VGG16 model for brain tumor classification using two publicly\navailable datasets: Br35H:: Brain Tumor Detection 2020 and Brain Tumor MRI\nDataset. The custom CNN architecture, despite its lower complexity,\ndemonstrates competitive performance with the pre-trained ResNet18 and VGG16\nmodels. In binary classification tasks, the custom CNN achieved an accuracy of\n98.67% on the Br35H dataset and 99.62% on the Brain Tumor MRI Dataset. For\nmulti-class classification, the custom CNN, with a slight architectural\nmodification, achieved an accuracy of 98.09%, on the Brain Tumor MRI Dataset.\nComparatively, ResNet18 and VGG16 maintained high performance levels, but the\ncustom CNNs provided a more computationally efficient alternative.\nAdditionally,the custom CNNs were evaluated using few-shot learning (0, 5, 10,\n15, 20, 40, and 80 shots) to assess their robustness, achieving notable\naccuracy improvements with increased shots. This study highlights the potential\nof well-designed, less complex CNN architectures as effective and\ncomputationally efficient alternatives to deeper, pre-trained models for\nmedical imaging tasks, including brain tumor classification. 
This study\nunderscores the potential of custom CNNs in medical imaging tasks and\nencourages further exploration in this direction.\n","authors":["Md Ashik Khan","Rafath Bin Zafar Auvee"],"pdf_url":"https://arxiv.org/pdf/2411.15596v2.pdf","comment":"A revised and extended version of this paper has been accepted at the\n 27th International Conference on Computer and Information Technology (ICCIT\n 2024). It spans 8 pages and includes 6 figures"},{"id":"http://arxiv.org/abs/2411.18270v2","updated":"2024-12-03T16:26:18Z","published":"2024-11-27T12:05:34Z","title":"Grid-augmented vision: A simple yet effective approach for enhanced\n spatial understanding in multi-modal agents","summary":" Recent advances in multimodal models have demonstrated impressive\ncapabilities in object recognition and scene understanding. However, these\nmodels often struggle with precise spatial localization - a critical capability\nfor real-world applications. Inspired by how humans use grid-based references\nlike chess boards and maps, we propose introducing explicit visual position\nencoding through a simple grid overlay approach. By adding a 9x9 black grid\npattern onto input images, our method provides visual spatial guidance\nanalogous to how positional encoding works in transformers, but in an explicit,\nvisual form.\n Experiments on the COCO 2017 dataset demonstrate that our grid-based approach\nachieves significant improvements in localization accuracy, with a 107.4%\nincrease in IoU (from 0.27 to 0.56) and a 194.4% improvement in GIoU (from 0.18\nto 0.53) compared to baseline performance. Through attention visualization\nanalysis, we show how this visual position encoding helps models better ground\nspatial relationships. 
Our method's simplicity and effectiveness make it\nparticularly valuable for applications requiring accurate spatial reasoning,\nsuch as robotic manipulation, medical imaging, and autonomous navigation.\n","authors":["Joongwon Chae","Zhenyu Wang","Lian Zhang","Dongmei Yu","Peiwu Qin"],"pdf_url":"https://arxiv.org/pdf/2411.18270v2.pdf","comment":"14 pages, 11 figures"},{"id":"http://arxiv.org/abs/2412.02533v1","updated":"2024-12-03T16:25:08Z","published":"2024-12-03T16:25:08Z","title":"LiDAR-based Registration against Georeferenced Models for Globally\n Consistent Allocentric Maps","summary":" Modern unmanned aerial vehicles (UAVs) are irreplaceable in search and rescue\n(SAR) missions to obtain a situational overview or provide closeups without\nendangering personnel. However, UAVs heavily rely on global navigation\nsatellite system (GNSS) for localization which works well in open spaces, but\nthe precision drastically degrades in the vicinity of buildings. These\ninaccuracies hinder aggregation of diverse data from multiple sources in a\nunified georeferenced frame for SAR operators. In contrast, CityGML models\nprovide approximate building shapes with accurate georeferenced poses. Besides,\nLiDAR works best in the vicinity of 3D structures. Hence, we refine coarse GNSS\nmeasurements by registering LiDAR maps against CityGML and digital elevation\nmap (DEM) models as a prior for allocentric mapping. An intuitive plausibility\nscore selects the best hypothesis based on occupancy using a 2D height map.\nAfterwards, we integrate the registration results in a continuous-time\nspline-based pose graph optimizer with LiDAR odometry and further sensing\nmodalities to obtain globally consistent, georeferenced trajectories and maps.\nWe evaluate the viability of our approach on multiple flights captured at two\ndistinct testing sites. Our method successfully reduced GNSS offset errors from\nup-to 16 m to below 0.5 m on multiple flights. 
Furthermore, we obtain globally\nconsistent maps w.r.t. prior 3D geospatial models.\n","authors":["Jan Quenzel","Linus T. Mallwitz","Benedikt T. Arnold","Sven Behnke"],"pdf_url":"https://arxiv.org/pdf/2412.02533v1.pdf","comment":"Presented at IEEE International Symposium on Safety, Security, and\n Rescue Robotics (SSRR), New York City, USA, November 2024"},{"id":"http://arxiv.org/abs/2412.02531v1","updated":"2024-12-03T16:24:16Z","published":"2024-12-03T16:24:16Z","title":"Multimodal Remote Sensing Scene Classification Using VLMs and Dual-Cross\n Attention Networks","summary":" Remote sensing scene classification (RSSC) is a critical task with diverse\napplications in land use and resource management. While unimodal image-based\napproaches show promise, they often struggle with limitations such as high\nintra-class variance and inter-class similarity. Incorporating textual\ninformation can enhance classification by providing additional context and\nsemantic understanding, but manual text annotation is labor-intensive and\ncostly. In this work, we propose a novel RSSC framework that integrates text\ndescriptions generated by large vision-language models (VLMs) as an auxiliary\nmodality without incurring expensive manual annotation costs. To fully leverage\nthe latent complementarities between visual and textual data, we propose a dual\ncross-attention-based network to fuse these modalities into a unified\nrepresentation. Extensive experiments with both quantitative and qualitative\nevaluation across five RSSC datasets demonstrate that our framework\nconsistently outperforms baseline models. We also verify the effectiveness of\nVLM-generated text descriptions compared to human-annotated descriptions.\nAdditionally, we design a zero-shot classification scenario to show that the\nlearned multimodal representation can be effectively utilized for unseen class\nclassification. 
This research opens new opportunities for leveraging textual\ninformation in RSSC tasks and provides a promising multimodal fusion structure,\noffering insights and inspiration for future studies. Code is available at:\nhttps://github.com/CJR7/MultiAtt-RSSC\n","authors":["Jinjin Cai","Kexin Meng","Baijian Yang","Gang Shao"],"pdf_url":"https://arxiv.org/pdf/2412.02531v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02530v1","updated":"2024-12-03T16:23:02Z","published":"2024-12-03T16:23:02Z","title":"WEM-GAN: Wavelet transform based facial expression manipulation","summary":" Facial expression manipulation aims to change human facial expressions\nwithout affecting face recognition. In order to transform the facial\nexpressions to target expressions, previous methods relied on expression labels\nto guide the manipulation process. However, these methods failed to preserve\nthe details of facial features, which causes the weakening or the loss of\nidentity information in the output image. In our work, we propose WEM-GAN, in\nshort for wavelet-based expression manipulation GAN, which puts more efforts on\npreserving the details of the original image in the editing process. Firstly,\nwe take advantage of the wavelet transform technique and combine it with our\ngenerator with a U-net autoencoder backbone, in order to improve the\ngenerator's ability to preserve more details of facial features. Secondly, we\nalso implement the high-frequency component discriminator, and use\nhigh-frequency domain adversarial loss to further constrain the optimization of\nour model, providing the generated face image with more abundant details.\nAdditionally, in order to narrow the gap between generated facial expressions\nand target expressions, we use residual connections between encoder and\ndecoder, while also using relative action units (AUs) several times. 
Extensive\nqualitative and quantitative experiments have demonstrated that our model\nperforms better in preserving identity features, editing capability, and image\ngeneration quality on the AffectNet dataset. It also shows superior performance\nin metrics such as Average Content Distance (ACD) and Expression Distance (ED).\n","authors":["Dongya Sun","Yunfei Hu","Xianzhe Zhang","Yingsong Hu"],"pdf_url":"https://arxiv.org/pdf/2412.02530v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.00876v2","updated":"2024-12-03T16:12:09Z","published":"2024-12-01T16:32:31Z","title":"Dynamic-LLaVA: Efficient Multimodal Large Language Models via Dynamic\n Vision-language Context Sparsification","summary":" Multimodal Large Language Models (MLLMs) have achieved remarkable success in\nvision understanding, reasoning, and interaction. However, the inference\ncomputation and memory increase progressively with the generation of output\ntokens during decoding, directly affecting the efficacy of MLLMs. Existing\nmethods attempt to reduce the vision context redundancy to achieve efficient\nMLLMs. Unfortunately, the efficiency benefits of the vision context reduction\nin the prefill stage gradually diminish during the decoding stage. To address\nthis problem, we proposed a dynamic vision-language context sparsification\nframework Dynamic-LLaVA, which dynamically reduces the redundancy of vision\ncontext in the prefill stage and decreases the memory and computation overhead\nof the generated language context during decoding. Dynamic-LLaVA designs a\ntailored sparsification inference scheme for different inference modes, i.e.,\nprefill, decoding with and without KV cache, to achieve efficient inference of\nMLLMs. In practice, Dynamic-LLaVA can reduce computation consumption by\n$\\sim$75\\% in the prefill stage. 
Meanwhile, throughout the entire generation\nprocess of MLLMs, Dynamic-LLaVA reduces the $\\sim$50\\% computation consumption\nunder decoding without KV cache, while saving $\\sim$50\\% GPU memory overhead\nwhen decoding with KV cache, due to the vision-language context sparsification.\nExtensive experiments also demonstrate that Dynamic-LLaVA achieves efficient\ninference for MLLMs with negligible understanding and generation ability\ndegradation or even performance gains compared to the full-context inference\nbaselines. Code is available at https://github.com/Osilly/dynamic_llava .\n","authors":["Wenxuan Huang","Zijie Zhai","Yunhang Shen","Shaoshen Cao","Fei Zhao","Xiangfeng Xu","Zheyu Ye","Shaohui Lin"],"pdf_url":"https://arxiv.org/pdf/2412.00876v2.pdf","comment":"Code is available at https://github.com/Osilly/dynamic_llava"},{"id":"http://arxiv.org/abs/2412.02508v1","updated":"2024-12-03T15:39:05Z","published":"2024-12-03T15:39:05Z","title":"Towards Rich Emotions in 3D Avatars: A Text-to-3D Avatar Generation\n Benchmark","summary":" Producing emotionally dynamic 3D facial avatars with text derived from spoken\nwords (Emo3D) has been a pivotal research topic in 3D avatar generation. While\nprogress has been made in general-purpose 3D avatar generation, the exploration\nof generating emotional 3D avatars remains scarce, primarily due to the\ncomplexities of identifying and rendering rich emotions from spoken words. This\npaper reexamines Emo3D generation and draws inspiration from human processes,\nbreaking down Emo3D into two cascading steps: Text-to-3D Expression Mapping\n(T3DEM) and 3D Avatar Rendering (3DAR). T3DEM is the most crucial step in\ndetermining the quality of Emo3D generation and encompasses three key\nchallenges: Expression Diversity, Emotion-Content Consistency, and Expression\nFluidity. To address these challenges, we introduce a novel benchmark to\nadvance research in Emo3D generation. 
First, we present EmoAva, a large-scale,\nhigh-quality dataset for T3DEM, comprising 15,000 text-to-3D expression\nmappings that characterize the aforementioned three challenges in Emo3D\ngeneration. Furthermore, we develop various metrics to effectively evaluate\nmodels against these identified challenges. Next, to effectively model the\nconsistency, diversity, and fluidity of human expressions in the T3DEM step, we\npropose the Continuous Text-to-Expression Generator, which employs an\nautoregressive Conditional Variational Autoencoder for expression code\ngeneration, enhanced with Latent Temporal Attention and Expression-wise\nAttention mechanisms. Finally, to further enhance the 3DAR step on rendering\nhigher-quality subtle expressions, we present the Globally-informed Gaussian\nAvatar (GiGA) model. GiGA incorporates a global information mechanism into 3D\nGaussian representations, enabling the capture of subtle micro-expressions and\nseamless transitions between emotional states.\n","authors":["Haidong Xu","Meishan Zhang","Hao Ju","Zhedong Zheng","Hongyuan Zhu","Erik Cambria","Min Zhang","Hao Fei"],"pdf_url":"https://arxiv.org/pdf/2412.02508v1.pdf","comment":"18 pages, 14 figures. Project website:\n https://github.com/WalkerMitty/EmoAva"},{"id":"http://arxiv.org/abs/2412.02506v1","updated":"2024-12-03T15:34:00Z","published":"2024-12-03T15:34:00Z","title":"ROVER: A Multi-Season Dataset for Visual SLAM","summary":" Robust Simultaneous Localization and Mapping (SLAM) is a crucial enabler for\nautonomous navigation in natural, unstructured environments such as parks and\ngardens. However, these environments present unique challenges for SLAM due to\nfrequent seasonal changes, varying light conditions, and dense vegetation.\nThese factors often degrade the performance of visual SLAM algorithms\noriginally developed for structured urban environments. 
To address this gap, we\npresent ROVER, a comprehensive benchmark dataset tailored for evaluating visual\nSLAM algorithms under diverse environmental conditions and spatial\nconfigurations. We captured the dataset with a robotic platform equipped with\nmonocular, stereo, and RGB-D cameras, as well as inertial sensors. It covers 39\nrecordings across five outdoor locations, collected through all seasons and\nvarious lighting scenarios, i.e., day, dusk, and night with and without\nexternal lighting. With this novel dataset, we evaluate several traditional and\ndeep learning-based SLAM methods and study their performance in diverse\nchallenging conditions. The results demonstrate that while stereo-inertial and\nRGB-D configurations generally perform better under favorable lighting and\nmoderate vegetation, most SLAM systems perform poorly in low-light and\nhigh-vegetation scenarios, particularly during summer and autumn. Our analysis\nhighlights the need for improved adaptability in visual SLAM algorithms for\noutdoor applications, as current systems struggle with dynamic environmental\nfactors affecting scale, feature extraction, and trajectory consistency. This\ndataset provides a solid foundation for advancing visual SLAM research in\nreal-world, natural environments, fostering the development of more resilient\nSLAM systems for long-term outdoor localization and mapping. 
The dataset and\nthe code of the benchmark are available under\nhttps://iis-esslingen.github.io/rover.\n","authors":["Fabian Schmidt","Constantin Blessing","Markus Enzweiler","Abhinav Valada"],"pdf_url":"https://arxiv.org/pdf/2412.02506v1.pdf","comment":"17 pages, 7 figures, 11 tables"},{"id":"http://arxiv.org/abs/2411.13001v2","updated":"2024-12-03T15:31:34Z","published":"2024-11-20T02:57:35Z","title":"Collaborative Feature-Logits Contrastive Learning for Open-Set\n Semi-Supervised Object Detection","summary":" Current Semi-Supervised Object Detection (SSOD) methods enhance detector\nperformance by leveraging large amounts of unlabeled data, assuming that both\nlabeled and unlabeled data share the same label space. However, in open-set\nscenarios, the unlabeled dataset contains both in-distribution (ID) classes and\nout-of-distribution (OOD) classes. Applying semi-supervised detectors in such\nsettings can lead to misclassifying OOD class as ID classes. To alleviate this\nissue, we propose a simple yet effective method, termed Collaborative\nFeature-Logits Detector (CFL-Detector). Specifically, we introduce a\nfeature-level clustering method using contrastive loss to clarify vector\nboundaries in the feature space and highlight class differences. 
Additionally,\nby optimizing the logits-level uncertainty classification loss, the model\nenhances its ability to effectively distinguish between ID and OOD classes.\nExtensive experiments demonstrate that our method achieves state-of-the-art\nperformance compared to existing methods.\n","authors":["Xinhao Zhong","Siyu Jiao","Yao Zhao","Yunchao Wei"],"pdf_url":"https://arxiv.org/pdf/2411.13001v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02493v1","updated":"2024-12-03T15:08:03Z","published":"2024-12-03T15:08:03Z","title":"RelayGS: Reconstructing Dynamic Scenes with Large-Scale and Complex\n Motions via Relay Gaussians","summary":" Reconstructing dynamic scenes with large-scale and complex motions remains a\nsignificant challenge. Recent techniques like Neural Radiance Fields and 3D\nGaussian Splatting (3DGS) have shown promise but still struggle with scenes\ninvolving substantial movement. This paper proposes RelayGS, a novel method\nbased on 3DGS, specifically designed to represent and reconstruct highly\ndynamic scenes. Our RelayGS learns a complete 4D representation with canonical\n3D Gaussians and a compact motion field, consisting of three stages. First, we\nlearn a fundamental 3DGS from all frames, ignoring temporal scene variations,\nand use a learnable mask to separate the highly dynamic foreground from the\nminimally moving background. Second, we replicate multiple copies of the\ndecoupled foreground Gaussians from the first stage, each corresponding to a\ntemporal segment, and optimize them using pseudo-views constructed from\nmultiple frames within each segment. These Gaussians, termed Relay Gaussians,\nact as explicit relay nodes, simplifying and breaking down large-scale motion\ntrajectories into smaller, manageable segments. Finally, we jointly learn the\nscene's temporal motion and refine the canonical Gaussians learned from the\nfirst two stages. 
We conduct thorough experiments on two dynamic scene datasets\nfeaturing large and complex motions, where our RelayGS outperforms\nstate-of-the-art methods by more than 1 dB in PSNR, and successfully reconstructs\nreal-world basketball game scenes in a much more complete and coherent manner,\nwhereas previous methods usually struggle to capture the complex motion of\nplayers. Code will be publicly available at https://github.com/gqk/RelayGS\n","authors":["Qiankun Gao","Yanmin Wu","Chengxiang Wen","Jiarui Meng","Luyang Tang","Jie Chen","Ronggang Wang","Jian Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.02493v1.pdf","comment":"Technical Report. GitHub: https://github.com/gqk/RelayGS"},{"id":"http://arxiv.org/abs/2410.19973v3","updated":"2024-12-03T14:54:39Z","published":"2024-10-25T21:22:52Z","title":"Multi-Class Abnormality Classification Task in Video Capsule Endoscopy","summary":" In this work for Capsule Vision Challenge 2024, we addressed the challenge of\nmulticlass anomaly classification in video capsule Endoscopy (VCE)[1] with a\nvariety of deep learning models, ranging from custom CNNs to advanced\ntransformer architectures. The purpose is to correctly classify diverse\ngastrointestinal disorders, which is critical for increasing diagnostic\nefficiency in clinical settings. We started with a baseline CNN model and\nimproved performance with ResNet[2] for better feature extraction, followed by\nVision Transformer (ViT)[3] to capture global dependencies. We further improve\nthe results by using Multiscale Vision Transformer (MViT)[4] for improved\nhierarchical feature extraction, while Dual Attention Vision Transformer\n(DaViT) [5] delivered best results by combining spatial and channel attention\nmethods. Our best balanced accuracy on validation set [6] was 0.8592 and Mean\nAUC was 0.9932. 
This methodology enabled us to improve model accuracy across a\nwide range of criteria, greatly surpassing all other methods. Additionally, our\nteam capsule commandos achieved 7th place ranking with a test set [7]\nperformance of Mean AUC: 0.7314 and balanced accuracy: 0.3235\n","authors":["Dev Rishi Verma","Vibhor Saxena","Dhruv Sharma","Arpan Gupta"],"pdf_url":"https://arxiv.org/pdf/2410.19973v3.pdf","comment":"Submission for Video Capsule Endoscopy Challenge"},{"id":"http://arxiv.org/abs/2412.02479v1","updated":"2024-12-03T14:42:31Z","published":"2024-12-03T14:42:31Z","title":"OODFace: Benchmarking Robustness of Face Recognition under Common\n Corruptions and Appearance Variations","summary":" With the rise of deep learning, facial recognition technology has seen\nextensive research and rapid development. Although facial recognition is\nconsidered a mature technology, we find that existing open-source models and\ncommercial algorithms lack robustness in certain real-world Out-of-Distribution\n(OOD) scenarios, raising concerns about the reliability of these systems. In\nthis paper, we introduce OODFace, which explores the OOD challenges faced by\nfacial recognition models from two perspectives: common corruptions and\nappearance variations. We systematically design 30 OOD scenarios across 9 major\ncategories tailored for facial recognition. By simulating these challenges on\npublic datasets, we establish three robustness benchmarks: LFW-C/V, CFP-FP-C/V,\nand YTF-C/V. We then conduct extensive experiments on 19 different facial\nrecognition models and 3 commercial APIs, along with extended experiments on\nface masks, Vision-Language Models (VLMs), and defense strategies to assess\ntheir robustness. Based on the results, we draw several key insights,\nhighlighting the vulnerability of facial recognition systems to OOD data and\nsuggesting possible solutions. 
Additionally, we offer a unified toolkit that\nincludes all corruption and variation types, easily extendable to other\ndatasets. We hope that our benchmarks and findings can provide guidance for\nfuture improvements in facial recognition model robustness.\n","authors":["Caixin Kang","Yubo Chen","Shouwei Ruan","Shiji Zhao","Ruochen Zhang","Jiayi Wang","Shan Fu","Xingxing Wei"],"pdf_url":"https://arxiv.org/pdf/2412.02479v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02449v1","updated":"2024-12-03T13:34:42Z","published":"2024-12-03T13:34:42Z","title":"BYE: Build Your Encoder with One Sequence of Exploration Data for\n Long-Term Dynamic Scene Understanding","summary":" Dynamic scene understanding remains a persistent challenge in robotic\napplications. Early dynamic mapping methods focused on mitigating the negative\ninfluence of short-term dynamic objects on camera motion estimation by masking\nor tracking specific categories, which often fall short in adapting to\nlong-term scene changes. Recent efforts address object association in long-term\ndynamic environments using neural networks trained on synthetic datasets, but\nthey still rely on predefined object shapes and categories. Other methods\nincorporate visual, geometric, or semantic heuristics for the association but\noften lack robustness. In this work, we introduce BYE, a class-agnostic,\nper-scene point cloud encoder that removes the need for predefined categories,\nshape priors, or extensive association datasets. Trained on only a single\nsequence of exploration data, BYE can efficiently perform object association in\ndynamically changing scenes. We further propose an ensembling scheme combining\nthe semantic strengths of Vision Language Models (VLMs) with the scene-specific\nexpertise of BYE, achieving a 7% improvement and a 95% success rate in object\nassociation tasks. 
Code and dataset are available at\nhttps://byencoder.github.io.\n","authors":["Chenguang Huang","Shengchao Yan","Wolfram Burgard"],"pdf_url":"https://arxiv.org/pdf/2412.02449v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02447v1","updated":"2024-12-03T13:31:29Z","published":"2024-12-03T13:31:29Z","title":"Resonance: Learning to Predict Social-Aware Pedestrian Trajectories as\n Co-Vibrations","summary":" Learning to forecast the trajectories of intelligent agents like pedestrians\nhas caught more researchers' attention. Despite researchers' efforts, it\nremains a challenge to accurately account for social interactions among agents\nwhen forecasting, and in particular, to simulate such social modifications to\nfuture trajectories in an explainable and decoupled way. Inspired by the\nresonance phenomenon of vibration systems, we propose the Resonance (short for\nRe) model to forecast pedestrian trajectories as co-vibrations, and regard that\nsocial interactions are associated with spectral properties of agents'\ntrajectories. It forecasts future trajectories as three distinct vibration\nterms to represent agents' future plans from different perspectives in a\ndecoupled way. Also, agents' social interactions and how they modify scheduled\ntrajectories will be considered in a resonance-like manner by learning the\nsimilarities of their trajectory spectrums. 
Experiments on multiple datasets,\nwhether pedestrian or vehicle, have verified the usefulness of our method both\nquantitatively and qualitatively.\n","authors":["Conghao Wong","Ziqian Zou","Beihao Xia","Xinge You"],"pdf_url":"https://arxiv.org/pdf/2412.02447v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.00888v2","updated":"2024-12-03T13:30:51Z","published":"2024-12-01T16:56:03Z","title":"DPE-Net: Dual-Parallel Encoder Based Network for Semantic Segmentation\n of Polyps","summary":" In medical imaging, efficient segmentation of colon polyps plays a pivotal\nrole in minimally invasive solutions for colorectal cancer. This study\nintroduces a novel approach employing two parallel encoder branches within a\nnetwork for polyp segmentation. One branch of the encoder incorporates the dual\nconvolution blocks that have the capability to maintain feature information\nover increased depths, and the other block embraces the single convolution\nblock with the addition of the previous layer's feature, offering diversity in\nfeature extraction within the encoder, combining them before transpose layers\nwith a depth-wise concatenation operation. Our model demonstrated superior\nperformance, surpassing several established deep-learning architectures on the\nKvasir and CVC-ClinicDB datasets, achieved a Dice score of 0.919, a mIoU of\n0.866 for the Kvasir dataset, and a Dice score of 0.931 and a mIoU of 0.891 for\nthe CVC-ClinicDB. 
The visual and quantitative results highlight the efficacy of\nour model, potentially setting a new model in medical image segmentation.\n","authors":["Malik Abdul Manan","Feng Jinchao","Shahzad Ahmed","Abdul Raheem"],"pdf_url":"https://arxiv.org/pdf/2412.00888v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02443v1","updated":"2024-12-03T13:27:51Z","published":"2024-12-03T13:27:51Z","title":"Multi-scale and Multi-path Cascaded Convolutional Network for Semantic\n Segmentation of Colorectal Polyps","summary":" Colorectal polyps are structural abnormalities of the gastrointestinal tract\nthat can potentially become cancerous in some cases. The study introduces a\nnovel framework for colorectal polyp segmentation named the Multi-Scale and\nMulti-Path Cascaded Convolution Network (MMCC-Net), aimed at addressing the\nlimitations of existing models, such as inadequate spatial dependence\nrepresentation and the absence of multi-level feature integration during the\ndecoding stage by integrating multi-scale and multi-path cascaded convolutional\ntechniques and enhances feature aggregation through dual attention modules,\nskip connections, and a feature enhancer. MMCC-Net achieves superior\nperformance in identifying polyp areas at the pixel level. The Proposed\nMMCC-Net was tested across six public datasets and compared against eight SOTA\nmodels to demonstrate its efficiency in polyp segmentation. The MMCC-Net's\nperformance shows Dice scores with confidence intervals ranging between (77.08,\n77.56) and (94.19, 94.71) and Mean Intersection over Union (MIoU) scores with\nconfidence intervals ranging from (72.20, 73.00) to (89.69, 90.53) on the six\ndatabases. 
These results highlight the model's potential as a powerful tool for\naccurate and efficient polyp segmentation, contributing to early detection and\nprevention strategies in colorectal cancer.\n","authors":["Malik Abdul Manan","Feng Jinchao","Muhammad Yaqub","Shahzad Ahmed","Syed Muhammad Ali Imran","Imran Shabir Chuhan","Haroon Ahmed Khan"],"pdf_url":"https://arxiv.org/pdf/2412.02443v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.01407v2","updated":"2024-12-03T13:14:39Z","published":"2024-12-02T11:50:35Z","title":"HoloDrive: Holistic 2D-3D Multi-Modal Street Scene Generation for\n Autonomous Driving","summary":" Generative models have significantly improved the generation and prediction\nquality on either camera images or LiDAR point clouds for autonomous driving.\nHowever, a real-world autonomous driving system uses multiple kinds of input\nmodality, usually cameras and LiDARs, where they contain complementary\ninformation for generation, while existing generation methods ignore this\ncrucial feature, resulting in the generated results only covering separate 2D\nor 3D information. In order to fill the gap in 2D-3D multi-modal joint\ngeneration for autonomous driving, in this paper, we propose our framework,\n\\emph{HoloDrive}, to jointly generate the camera images and LiDAR point clouds.\nWe employ BEV-to-Camera and Camera-to-BEV transform modules between\nheterogeneous generative models, and introduce a depth prediction branch in the\n2D generative model to disambiguate the un-projecting from image space to BEV\nspace, then extend the method to predict the future by adding temporal\nstructure and carefully designed progressive training. 
Further, we conduct\nexperiments on single frame generation and world model benchmarks, and\ndemonstrate our method leads to significant performance gains over SOTA methods\nin terms of generation metrics.\n","authors":["Zehuan Wu","Jingcheng Ni","Xiaodong Wang","Yuxin Guo","Rui Chen","Lewei Lu","Jifeng Dai","Yuwen Xiong"],"pdf_url":"https://arxiv.org/pdf/2412.01407v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.07266v5","updated":"2024-12-03T12:41:32Z","published":"2024-10-09T01:39:26Z","title":"Spiking GS: Towards High-Accuracy and Low-Cost Surface Reconstruction\n via Spiking Neuron-based Gaussian Splatting","summary":" 3D Gaussian Splatting is capable of reconstructing 3D scenes in minutes.\nDespite recent advances in improving surface reconstruction accuracy, the\nreconstructed results still exhibit bias and suffer from inefficiency in\nstorage and training. This paper provides a different observation on the cause\nof the inefficiency and the reconstruction bias, which is attributed to the\nintegration of the low-opacity parts (LOPs) of the generated Gaussians. We show\nthat LOPs consist of Gaussians with overall low-opacity (LOGs) and the\nlow-opacity tails (LOTs) of Gaussians. We propose Spiking GS to reduce such two\ntypes of LOPs by integrating spiking neurons into the Gaussian Splatting\npipeline. Specifically, we introduce global and local full-precision\nintegrate-and-fire spiking neurons to the opacity and representation function\nof flattened 3D Gaussians, respectively. Furthermore, we enhance the density\ncontrol strategy with spiking neurons' thresholds and a new criterion on the\nscale of Gaussians. Our method can represent more accurate reconstructed\nsurfaces at a lower cost. 
The supplementary material and code are available at\nhttps://github.com/zju-bmi-lab/SpikingGS.\n","authors":["Weixing Zhang","Zongrui Li","De Ma","Huajin Tang","Xudong Jiang","Qian Zheng","Gang Pan"],"pdf_url":"https://arxiv.org/pdf/2410.07266v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16200v2","updated":"2024-12-03T12:39:11Z","published":"2024-08-29T01:42:38Z","title":"PolarBEVDet: Exploring Polar Representation for Multi-View 3D Object\n Detection in Bird's-Eye-View","summary":" Recently, LSS-based multi-view 3D object detection provides an economical and\ndeployment-friendly solution for autonomous driving. However, all the existing\nLSS-based methods transform multi-view image features into a Cartesian\nBird's-Eye-View(BEV) representation, which does not take into account the\nnon-uniform image information distribution and hardly exploits the view\nsymmetry. In this paper, in order to adapt the image information distribution\nand preserve the view symmetry by regular convolution, we propose to employ the\npolar BEV representation to substitute the Cartesian BEV representation. To\nachieve this, we elaborately tailor three modules: a polar view transformer to\ngenerate the polar BEV representation, a polar temporal fusion module for\nfusing historical polar BEV features and a polar detection head to predict the\npolar-parameterized representation of the object. In addition, we design a 2D\nauxiliary detection head and a spatial attention enhancement module to improve\nthe quality of feature extraction in perspective view and BEV, respectively.\nFinally, we integrate the above improvements into a novel multi-view 3D object\ndetector, PolarBEVDet. Experiments on nuScenes show that PolarBEVDet achieves\nthe superior performance. 
The code is available at\nhttps://github.com/Yzichen/PolarBEVDet.git.\n","authors":["Zichen Yu","Quanli Liu","Wei Wang","Liyong Zhang","Xiaoguang Zhao"],"pdf_url":"https://arxiv.org/pdf/2408.16200v2.pdf","comment":"11 pages, 6 figures. This work has been submitted to the IEEE for\n possible publication"},{"id":"http://arxiv.org/abs/2412.02421v1","updated":"2024-12-03T12:36:38Z","published":"2024-12-03T12:36:38Z","title":"TimeWalker: Personalized Neural Space for Lifelong Head Avatars","summary":" We present TimeWalker, a novel framework that models realistic, full-scale 3D\nhead avatars of a person on lifelong scale. Unlike current human head avatar\npipelines that capture identity at the momentary level(e.g., instant\nphotography or short videos), TimeWalker constructs a person's comprehensive\nidentity from unstructured data collection over his/her various life stages,\noffering a paradigm to achieve full reconstruction and animation of that person\nat different moments of life. At the heart of TimeWalker's success is a novel\nneural parametric model that learns personalized representation with the\ndisentanglement of shape, expression, and appearance across ages. Central to\nour methodology are the concepts of two aspects: (1) We track back to the\nprinciple of modeling a person's identity in an additive combination of average\nhead representation in the canonical space, and moment-specific head attribute\nrepresentations driven from a set of neural head basis. To learn the set of\nhead basis that could represent the comprehensive head variations in a compact\nmanner, we propose a Dynamic Neural Basis-Blending Module (Dynamo). 
It\ndynamically adjusts the number and blend weights of neural head bases,\naccording to both shared and specific traits of the target person over ages.\n(2) Dynamic 2D Gaussian Splatting (DNA-2DGS), an extension of Gaussian\nsplatting representation, to model head motion deformations like facial\nexpressions without losing the realism of rendering and reconstruction.\nDNA-2DGS includes a set of controllable 2D oriented planar Gaussian disks that\nutilize the priors from parametric model, and move/rotate with the change of\nexpression. Through extensive experimental evaluations, we show TimeWalker's\nability to reconstruct and animate avatars across decoupled dimensions with\nrealistic rendering effects, demonstrating a way to achieve personalized 'time\ntraveling' in a breeze.\n","authors":["Dongwei Pan","Yang Li","Hongsheng Li","Kwan-Yee Lin"],"pdf_url":"https://arxiv.org/pdf/2412.02421v1.pdf","comment":"Project Page: https://timewalker2024.github.io/timewalker.github.io/\n , Video: https://www.youtube.com/watch?v=x8cpOVMY_ko"},{"id":"http://arxiv.org/abs/2412.02419v1","updated":"2024-12-03T12:31:44Z","published":"2024-12-03T12:31:44Z","title":"It Takes Two: Real-time Co-Speech Two-person's Interaction Generation\n via Reactive Auto-regressive Diffusion Model","summary":" Conversational scenarios are very common in real-world settings, yet existing\nco-speech motion synthesis approaches often fall short in these contexts, where\none person's audio and gestures will influence the other's responses.\nAdditionally, most existing methods rely on offline sequence-to-sequence\nframeworks, which are unsuitable for online applications. In this work, we\nintroduce an audio-driven, auto-regressive system designed to synthesize\ndynamic movements for two characters during a conversation. 
At the core of our\napproach is a diffusion-based full-body motion synthesis model, which is\nconditioned on the past states of both characters, speech audio, and a\ntask-oriented motion trajectory input, allowing for flexible spatial control.\nTo enhance the model's ability to learn diverse interactions, we have enriched\nexisting two-person conversational motion datasets with more dynamic and\ninteractive motions. We evaluate our system through multiple experiments to\nshow it outperforms across a variety of tasks, including single and two-person\nco-speech motion generation, as well as interactive motion generation. To the\nbest of our knowledge, this is the first system capable of generating\ninteractive full-body motions for two characters from speech in an online\nmanner.\n","authors":["Mingyi Shi","Dafei Qin","Leo Ho","Zhouyingcheng Liao","Yinghao Huang","Junichi Yamagishi","Taku Komura"],"pdf_url":"https://arxiv.org/pdf/2412.02419v1.pdf","comment":"15 pages, 10 figures"},{"id":"http://arxiv.org/abs/2412.02412v1","updated":"2024-12-03T12:12:03Z","published":"2024-12-03T12:12:03Z","title":"VISTA: A Panoramic View of Neural Representations","summary":" We present VISTA (Visualization of Internal States and Their Associations), a\nnovel pipeline for visually exploring and interpreting neural network\nrepresentations. VISTA addresses the challenge of analyzing vast\nmultidimensional spaces in modern machine learning models by mapping\nrepresentations into a semantic 2D space. The resulting collages visually\nreveal patterns and relationships within internal representations. We\ndemonstrate VISTA's utility by applying it to sparse autoencoder latents\nuncovering new properties and interpretations. 
We review the VISTA methodology,\npresent findings from our case study ( https://got.drib.net/latents/ ), and\ndiscuss implications for neural network interpretability across various domains\nof machine learning.\n","authors":["Tom White"],"pdf_url":"https://arxiv.org/pdf/2412.02412v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.16279v3","updated":"2024-12-03T12:09:33Z","published":"2024-06-24T03:01:08Z","title":"SegNet4D: Efficient Instance-Aware 4D Semantic Segmentation for LiDAR\n Point Cloud","summary":" 4D LiDAR semantic segmentation, also referred to as multi-scan semantic\nsegmentation, plays a crucial role in enhancing the environmental understanding\ncapabilities of autonomous vehicles or robots. It classifies the semantic\ncategory of each LiDAR measurement point and detects whether it is dynamic, a\ncritical ability for tasks like obstacle avoidance and autonomous navigation.\nExisting approaches often rely on computationally heavy 4D convolutions or\nrecursive networks, which result in poor real-time performance, making them\nunsuitable for online robotics and autonomous driving applications. In this\npaper, we introduce SegNet4D, a novel real-time 4D semantic segmentation\nnetwork offering both efficiency and strong semantic understanding. SegNet4D\naddresses 4D segmentation as two tasks: single-scan semantic segmentation and\nmoving object segmentation, each tackled by a separate network head. Both\nresults are combined in a motion-semantic fusion module to achieve\ncomprehensive 4D segmentation. Additionally, instance information is extracted\nfrom the current scan and exploited for instance-wise segmentation consistency.\nOur approach surpasses state-of-the-art in both multi-scan semantic\nsegmentation and moving object segmentation while offering greater efficiency,\nenabling real-time operation. Besides, its effectiveness and efficiency have\nalso been validated on a real-world unmanned ground platform. 
Our code will be\nreleased at https://github.com/nubot-nudt/SegNet4D.\n","authors":["Neng Wang","Ruibin Guo","Chenghao Shi","Ziyue Wang","Hui Zhang","Huimin Lu","Zhiqiang Zheng","Xieyuanli Chen"],"pdf_url":"https://arxiv.org/pdf/2406.16279v3.pdf","comment":"10 pages, 8 figures"},{"id":"http://arxiv.org/abs/2412.00277v2","updated":"2024-12-03T12:04:07Z","published":"2024-11-29T23:12:38Z","title":"Facial Expression Recognition with Controlled Privacy Preservation and\n Feature Compensation","summary":" Facial expression recognition (FER) systems raise significant privacy\nconcerns due to the potential exposure of sensitive identity information. This\npaper presents a study on removing identity information while preserving FER\ncapabilities. Drawing on the observation that low-frequency components\npredominantly contain identity information and high-frequency components\ncapture expression, we propose a novel two-stream framework that applies\nprivacy enhancement to each component separately. We introduce a controlled\nprivacy enhancement mechanism to optimize performance and a feature compensator\nto enhance task-relevant features without compromising privacy. Furthermore, we\npropose a novel privacy-utility trade-off, providing a quantifiable measure of\nprivacy preservation efficacy in closed-set FER tasks. 
Extensive experiments on\nthe benchmark CREMA-D dataset demonstrate that our framework achieves 78.84%\nrecognition accuracy with a privacy (facial identity) leakage ratio of only\n2.01%, highlighting its potential for secure and reliable video-based FER\napplications.\n","authors":["Feng Xu","David Ahmedt-Aristizabal","Lars Petersson","Dadong Wang","Xun Li"],"pdf_url":"https://arxiv.org/pdf/2412.00277v2.pdf","comment":"WACV2025 accepted"},{"id":"http://arxiv.org/abs/2404.06135v2","updated":"2024-12-03T12:03:40Z","published":"2024-04-09T09:02:21Z","title":"Efficient Concertormer for Image Deblurring and Beyond","summary":" The Transformer architecture has achieved remarkable success in natural\nlanguage processing and high-level vision tasks over the past few years.\nHowever, the inherent complexity of self-attention is quadratic to the size of\nthe image, leading to unaffordable computational costs for high-resolution\nvision tasks. In this paper, we introduce Concertormer, featuring a novel\nConcerto Self-Attention (CSA) mechanism designed for image deblurring. The\nproposed CSA divides self-attention into two distinct components: one\nemphasizes generally global and another concentrates on specifically local\ncorrespondence. By retaining partial information in additional dimensions\nindependent from the self-attention calculations, our method effectively\ncaptures global contextual representations with complexity linear to the image\nsize. To effectively leverage the additional dimensions, we present a\nCross-Dimensional Communication module, which linearly combines attention maps\nand thus enhances expressiveness. Moreover, we amalgamate the two-staged\nTransformer design into a single stage using the proposed gated-dconv MLP\narchitecture. 
While our primary objective is single-image motion deblurring,\nextensive quantitative and qualitative evaluations demonstrate that our\napproach performs favorably against the state-of-the-art methods in other\ntasks, such as deraining and deblurring with JPEG artifacts. The source codes\nand trained models will be made available to the public.\n","authors":["Pin-Hung Kuo","Jinshan Pan","Shao-Yi Chien","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2404.06135v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02403v1","updated":"2024-12-03T11:53:05Z","published":"2024-12-03T11:53:05Z","title":"3D Face Reconstruction From Radar Images","summary":" The 3D reconstruction of faces gains wide attention in computer vision and is\nused in many fields of application, for example, animation, virtual reality,\nand even forensics. This work is motivated by monitoring patients in sleep\nlaboratories. Due to their unique characteristics, sensors from the radar\ndomain have advantages compared to optical sensors, namely penetration of\nelectrically non-conductive materials and independence of light. These\nadvantages of radar signals unlock new applications and require adaptation of\n3D reconstruction frameworks. We propose a novel model-based method for 3D\nreconstruction from radar images. We generate a dataset of synthetic radar\nimages with a physics-based but non-differentiable radar renderer. This dataset\nis used to train a CNN-based encoder to estimate the parameters of a 3D\nmorphable face model. Whilst the encoder alone already leads to strong\nreconstructions of synthetic data, we extend our reconstruction in an\nAnalysis-by-Synthesis fashion to a model-based autoencoder. This is enabled by\nlearning the rendering process in the decoder, which acts as an object-specific\ndifferentiable radar renderer. 
Subsequently, the combination of both network\nparts is trained to minimize both, the loss of the parameters and the loss of\nthe resulting reconstructed radar image. This leads to the additional benefit,\nthat at test time the parameters can be further optimized by finetuning the\nautoencoder unsupervised on the image loss. We evaluated our framework on\ngenerated synthetic face images as well as on real radar images with 3D ground\ntruth of four individuals.\n","authors":["Valentin Braeutigam","Vanessa Wirth","Ingrid Ullmann","Christian Schüßler","Martin Vossiek","Matthias Berking","Bernhard Egger"],"pdf_url":"https://arxiv.org/pdf/2412.02403v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02402v1","updated":"2024-12-03T11:50:16Z","published":"2024-12-03T11:50:16Z","title":"RG-SAN: Rule-Guided Spatial Awareness Network for End-to-End 3D\n Referring Expression Segmentation","summary":" 3D Referring Expression Segmentation (3D-RES) aims to segment 3D objects by\ncorrelating referring expressions with point clouds. However, traditional\napproaches frequently encounter issues like over-segmentation or\nmis-segmentation, due to insufficient emphasis on spatial information of\ninstances. In this paper, we introduce a Rule-Guided Spatial Awareness Network\n(RG-SAN) by utilizing solely the spatial information of the target instance for\nsupervision. This approach enables the network to accurately depict the spatial\nrelationships among all entities described in the text, thus enhancing the\nreasoning capabilities. The RG-SAN consists of the Text-driven Localization\nModule (TLM) and the Rule-guided Weak Supervision (RWS) strategy. The TLM\ninitially locates all mentioned instances and iteratively refines their\npositional information. The RWS strategy, acknowledging that only target\nobjects have supervised positional information, employs dependency tree rules\nto precisely guide the core instance's positioning. 
Extensive testing on the\nScanRefer benchmark has shown that RG-SAN not only establishes new performance\nbenchmarks, with an mIoU increase of 5.1 points, but also exhibits significant\nimprovements in robustness when processing descriptions with spatial ambiguity.\nAll codes are available at https://github.com/sosppxo/RG-SAN.\n","authors":["Changli Wu","Qi Chen","Jiayi Ji","Haowei Wang","Yiwei Ma","You Huang","Gen Luo","Hao Fei","Xiaoshuai Sun","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2412.02402v1.pdf","comment":"Accepted by NeurIPS 2024 (Oral), Code:\n https://github.com/sosppxo/RG-SAN"},{"id":"http://arxiv.org/abs/2412.02399v1","updated":"2024-12-03T11:49:01Z","published":"2024-12-03T11:49:01Z","title":"OMENN: One Matrix to Explain Neural Networks","summary":" Deep Learning (DL) models are often black boxes, making their decision-making\nprocesses difficult to interpret. This lack of transparency has driven\nadvancements in eXplainable Artificial Intelligence (XAI), a field dedicated to\nclarifying the reasoning behind DL model predictions. Among these,\nattribution-based methods such as LRP and GradCAM are widely used, though they\nrely on approximations that can be imprecise.\n To address these limitations, we introduce One Matrix to Explain Neural\nNetworks (OMENN), a novel post-hoc method that represents a neural network as a\nsingle, interpretable matrix for each specific input. This matrix is\nconstructed through a series of linear transformations that represent the\nprocessing of the input by each successive layer in the neural network. As a\nresult, OMENN provides locally precise, attribution-based explanations of the\ninput across various modern models, including ViTs and CNNs. 
We present a\ntheoretical analysis of OMENN based on dynamic linearity property and validate\nits effectiveness with extensive tests on two XAI benchmarks, demonstrating\nthat OMENN is competitive with state-of-the-art methods.\n","authors":["Adam Wróbel","Mikołaj Janusz","Bartosz Zieliński","Dawid Rymarczyk"],"pdf_url":"https://arxiv.org/pdf/2412.02399v1.pdf","comment":"Under review, code will be released after acceptance"},{"id":"http://arxiv.org/abs/2412.02395v1","updated":"2024-12-03T11:47:33Z","published":"2024-12-03T11:47:33Z","title":"Who Walks With You Matters: Perceiving Social Interactions with Groups\n for Pedestrian Trajectory Prediction","summary":" Understanding and anticipating human movement has become more critical and\nchallenging in diverse applications such as autonomous driving and\nsurveillance. The complex interactions brought by different relations between\nagents are a crucial reason that poses challenges to this task. Researchers\nhave put much effort into designing a system using rule-based or data-based\nmodels to extract and validate the patterns between pedestrian trajectories and\nthese interactions, which has not been adequately addressed yet. Inspired by\nhow humans perceive social interactions with different level of relations to\nthemself, this work proposes the GrouP ConCeption (short for GPCC) model\ncomposed of the Group method, which categorizes nearby agents into either group\nmembers or non-group members based on a long-term distance kernel function, and\nthe Conception module, which perceives both visual and acoustic information\nsurrounding the target agent. 
Evaluated across multiple datasets, the GPCC\nmodel demonstrates significant improvements in trajectory prediction accuracy,\nvalidating its effectiveness in modeling both social and individual dynamics.\nThe qualitative analysis also indicates that the GPCC framework successfully\nleverages grouping and perception cues human-like intuitively to validate the\nproposed model's explainability in pedestrian trajectory forecasting.\n","authors":["Ziqian Zou","Conghao Wong","Beihao Xia","Qinmu Peng","Xinge You"],"pdf_url":"https://arxiv.org/pdf/2412.02395v1.pdf","comment":"15 pages, 10 figures, submitted to CVPR 2025"},{"id":"http://arxiv.org/abs/2412.02393v1","updated":"2024-12-03T11:47:14Z","published":"2024-12-03T11:47:14Z","title":"Bio-inspired visual relative localization for large swarms of UAVs","summary":" We propose a new approach to visual perception for relative localization of\nagents within large-scale swarms of UAVs. Inspired by biological perception\nutilized by schools of sardines, swarms of bees, and other large groups of\nanimals capable of moving in a decentralized yet coherent manner, our method\ndoes not rely on detecting individual neighbors by each agent and estimating\ntheir relative position, but rather we propose to regress a neighbor density\nover distance. This allows for a more accurate distance estimation as well as\nbetter scalability with respect to the number of neighbors. Additionally, a\nnovel swarm control algorithm is proposed to make it compatible with the new\nrelative localization method. 
We provide a thorough evaluation of the presented\nmethods and demonstrate that the regressing approach to distance estimation is\nmore robust to varying relative pose of the targets and that it is suitable to\nbe used as the main source of relative localization for swarm stabilization.\n","authors":["Martin Křížek","Matouš Vrba","Antonella Barišić Kulaš","Stjepan Bogdan","Martin Saska"],"pdf_url":"https://arxiv.org/pdf/2412.02393v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02386v1","updated":"2024-12-03T11:21:17Z","published":"2024-12-03T11:21:17Z","title":"Single-Shot Metric Depth from Focused Plenoptic Cameras","summary":" Metric depth estimation from visual sensors is crucial for robots to\nperceive, navigate, and interact with their environment. Traditional range\nimaging setups, such as stereo or structured light cameras, face hassles\nincluding calibration, occlusions, and hardware demands, with accuracy limited\nby the baseline between cameras. Single- and multi-view monocular depth offers\na more compact alternative, but is constrained by the unobservability of the\nmetric scale. Light field imaging provides a promising solution for estimating\nmetric depth by using a unique lens configuration through a single device.\nHowever, its application to single-view dense metric depth is under-addressed\nmainly due to the technology's high cost, the lack of public benchmarks, and\nproprietary geometrical models and software.\n Our work explores the potential of focused plenoptic cameras for dense metric\ndepth. We propose a novel pipeline that predicts metric depth from a single\nplenoptic camera shot by first generating a sparse metric point cloud using\nmachine learning, which is then used to scale and align a dense relative depth\nmap regressed by a foundation depth model, resulting in dense metric depth. 
To\nvalidate it, we curated the Light Field & Stereo Image Dataset (LFS) of\nreal-world light field images with stereo depth labels, filling a current gap\nin existing resources. Experimental results show that our pipeline produces\naccurate metric depth predictions, laying a solid groundwork for future\nresearch in this field.\n","authors":["Blanca Lasheras-Hernandez","Klaus H. Strobl","Sergio Izquierdo","Tim Bodenmüller","Rudolph Triebel","Javier Civera"],"pdf_url":"https://arxiv.org/pdf/2412.02386v1.pdf","comment":"8 pages (6 for text + 2 for references), 6 figures, 2 tables.\n Submitted to IEEE ICRA 2025"},{"id":"http://arxiv.org/abs/2403.16970v3","updated":"2024-12-03T11:09:31Z","published":"2024-03-25T17:31:12Z","title":"Enhancing joint automatic chest X-ray diagnosis and clinical visual\n attention prediction with multi-stage cooperative learning","summary":" Purpose: As visual inspection is an inherent process during radiological\nscreening, the associated eye gaze data can provide valuable insights into\nrelevant clinical decisions. As deep learning has become the state-of-the-art\nfor computer-assisted diagnosis, integrating human behavior, such as eye gaze\ndata, into these systems is instrumental to help align machine predictions with\nclinical diagnostic criteria, thus enhancing the quality of automatic\nradiological diagnosis. Methods: We propose a novel deep learning framework for\njoint disease diagnosis and prediction of corresponding clinical visual\nattention maps for chest X-ray scans. Specifically, we introduce a new\ndual-encoder multi-task UNet, which leverages both a DenseNet201 backbone and a\nResidual and Squeeze-and-Excitation block-based encoder to extract diverse\nfeatures for visual attention map prediction, and a multi-scale feature-fusion\nclassifier to perform disease classification. 
To tackle the issue of\nasynchronous training schedules of individual tasks in multi-task learning, we\nproposed a multi-stage cooperative learning strategy, with contrastive learning\nfor feature encoder pretraining to boost performance. Results: Our proposed\nmethod is shown to significantly outperform existing techniques for chest X-ray\ndiagnosis (AUC=0.93) and the quality of visual attention map prediction\n(Correlation coefficient=0.58). Conclusion: Benefiting from the proposed\nmulti-task multi-stage cooperative learning, our technique demonstrates the\nbenefit of integrating clinicians' eye gaze into clinical AI systems to boost\nperformance and potentially explainability.\n","authors":["Zirui Qiu","Hassan Rivaz","Yiming Xiao"],"pdf_url":"https://arxiv.org/pdf/2403.16970v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02373v1","updated":"2024-12-03T11:00:15Z","published":"2024-12-03T11:00:15Z","title":"Active Negative Loss: A Robust Framework for Learning with Noisy Labels","summary":" Deep supervised learning has achieved remarkable success across a wide range\nof tasks, yet it remains susceptible to overfitting when confronted with noisy\nlabels. To address this issue, noise-robust loss functions offer an effective\nsolution for enhancing learning in the presence of label noise. In this work,\nwe systematically investigate the limitation of the recently proposed Active\nPassive Loss (APL), which employs Mean Absolute Error (MAE) as its passive loss\nfunction. Despite the robustness brought by MAE, one of its key drawbacks is\nthat it pays equal attention to clean and noisy samples; this feature slows\ndown convergence and potentially makes training difficult, particularly in\nlarge-scale datasets. To overcome these challenges, we introduce a novel loss\nfunction class, termed Normalized Negative Loss Functions (NNLFs), which serve\nas passive loss functions within the APL framework. 
NNLFs effectively address\nthe limitations of MAE by concentrating more on memorized clean samples. By\nreplacing MAE in APL with our proposed NNLFs, we enhance APL and present a new\nframework called Active Negative Loss (ANL). Moreover, in non-symmetric noise\nscenarios, we propose an entropy-based regularization technique to mitigate the\nvulnerability to the label imbalance. Extensive experiments demonstrate that\nthe new loss functions adopted by our ANL framework can achieve better or\ncomparable performance to state-of-the-art methods across various label noise\ntypes and in image segmentation tasks. The source code is available at:\nhttps://github.com/Virusdoll/Active-Negative-Loss.\n","authors":["Xichen Ye","Yifan Wu","Yiwen Xu","Xiaoqiang Li","Weizhong Zhang","Yifan Chen"],"pdf_url":"https://arxiv.org/pdf/2412.02373v1.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2412.02370v1","updated":"2024-12-03T10:54:37Z","published":"2024-12-03T10:54:37Z","title":"Trajectory-based Road Autolabeling with Lidar-Camera Fusion in Winter\n Conditions","summary":" Robust road segmentation in all road conditions is required for safe\nautonomous driving and advanced driver assistance systems. Supervised deep\nlearning methods provide accurate road segmentation in the domain of their\ntraining data but cannot be trusted in out-of-distribution scenarios. Including\nthe whole distribution in the trainset is challenging as each sample must be\nlabeled by hand. Trajectory-based self-supervised methods offer a potential\nsolution as they can learn from the traversed route without manual labels.\nHowever, existing trajectory-based methods use learning schemes that rely only\non the camera or only on the lidar. In this paper, trajectory-based learning is\nimplemented jointly with lidar and camera for increased performance. 
Our method\noutperforms recent standalone camera- and lidar-based methods when evaluated\nwith a challenging winter driving dataset including countryside and suburb\ndriving scenes. The source code is available at\nhttps://github.com/eerik98/lidar-camera-road-autolabeling.git\n","authors":["Eerik Alamikkotervo","Henrik Toikka","Kari Tammi","Risto Ojala"],"pdf_url":"https://arxiv.org/pdf/2412.02370v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02368v1","updated":"2024-12-03T10:52:06Z","published":"2024-12-03T10:52:06Z","title":"ScImage: How Good Are Multimodal Large Language Models at Scientific\n Text-to-Image Generation?","summary":" Multimodal large language models (LLMs) have demonstrated impressive\ncapabilities in generating high-quality images from textual instructions.\nHowever, their performance in generating scientific images--a critical\napplication for accelerating scientific progress--remains underexplored. In\nthis work, we address this gap by introducing ScImage, a benchmark designed to\nevaluate the multimodal capabilities of LLMs in generating scientific images\nfrom textual descriptions. ScImage assesses three key dimensions of\nunderstanding: spatial, numeric, and attribute comprehension, as well as their\ncombinations, focusing on the relationships between scientific objects (e.g.,\nsquares, circles). We evaluate five models, GPT-4o, Llama, AutomaTikZ, Dall-E,\nand StableDiffusion, using two modes of output generation: code-based outputs\n(Python, TikZ) and direct raster image generation. Additionally, we examine\nfour different input languages: English, German, Farsi, and Chinese. 
Our\nevaluation, conducted with 11 scientists across three criteria (correctness,\nrelevance, and scientific accuracy), reveals that while GPT-4o produces outputs\nof decent quality for simpler prompts involving individual dimensions such as\nspatial, numeric, or attribute understanding in isolation, all models face\nchallenges in this task, especially for more complex prompts.\n","authors":["Leixin Zhang","Steffen Eger","Yinjie Cheng","Weihe Zhai","Jonas Belouadi","Christoph Leiter","Simone Paolo Ponzetto","Fahimeh Moafian","Zhixue Zhao"],"pdf_url":"https://arxiv.org/pdf/2412.02368v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02366v1","updated":"2024-12-03T10:45:34Z","published":"2024-12-03T10:45:34Z","title":"GenMix: Effective Data Augmentation with Generative Diffusion Model\n Image Editing","summary":" Data augmentation is widely used to enhance generalization in visual\nclassification tasks. However, traditional methods struggle when source and\ntarget domains differ, as in domain adaptation, due to their inability to\naddress domain gaps. This paper introduces GenMix, a generalizable\nprompt-guided generative data augmentation approach that enhances both\nin-domain and cross-domain image classification. Our technique leverages image\nediting to generate augmented images based on custom conditional prompts,\ndesigned specifically for each problem type. By blending portions of the input\nimage with its edited generative counterpart and incorporating fractal\npatterns, our approach mitigates unrealistic images and label ambiguity,\nimproving the performance and adversarial robustness of the resulting models.\nEfficacy of our method is established with extensive experiments on eight\npublic datasets for general and fine-grained classification, in both in-domain\nand cross-domain settings. Additionally, we demonstrate performance\nimprovements for self-supervised learning, learning with data scarcity, and\nadversarial robustness. 
As compared to the existing state-of-the-art methods,\nour technique achieves stronger performance across the board.\n","authors":["Khawar Islam","Muhammad Zaigham Zaheer","Arif Mahmood","Karthik Nandakumar","Naveed Akhtar"],"pdf_url":"https://arxiv.org/pdf/2412.02366v1.pdf","comment":"https://diffusemix.github.io/"},{"id":"http://arxiv.org/abs/2412.02359v1","updated":"2024-12-03T10:32:41Z","published":"2024-12-03T10:32:41Z","title":"Realistic Surgical Simulation from Monocular Videos","summary":" This paper tackles the challenge of automatically performing realistic\nsurgical simulations from readily available surgical videos. Recent efforts\nhave successfully integrated physically grounded dynamics within 3D Gaussians\nto perform high-fidelity simulations in well-reconstructed simulation\nenvironments from static scenes. However, they struggle with the geometric\ninconsistency in reconstructing simulation environments and unrealistic\nphysical deformations in simulations of soft tissues when it comes to dynamic\nand complex surgical processes. In this paper, we propose SurgiSim, a novel\nautomatic simulation system to overcome these limitations. To build a surgical\nsimulation environment, we maintain a canonical 3D scene composed of 3D\nGaussians coupled with a deformation field to represent a dynamic surgical\nscene. This process involves a multi-stage optimization with trajectory and\nanisotropic regularization, enhancing the geometry consistency of the canonical\nscene, which serves as the simulation environment. To achieve realistic\nphysical simulations in this environment, we implement a Visco-Elastic\ndeformation model based on the Maxwell model, effectively restoring the complex\ndeformations of tissues. Additionally, we infer the physical parameters of\ntissues by minimizing the discrepancies between the input video and simulation\nresults guided by estimated tissue motion, ensuring realistic simulation\noutcomes. 
Experiments on various surgical scenarios and interactions\ndemonstrate SurgiSim's ability to perform realistic simulation of soft tissues\namong surgical procedures, showing its enormous potential for enhancing\nsurgical training, planning, and robotic surgery systems. The project page is\nat https://namaenashibot.github.io/SurgiSim/.\n","authors":["Kailing Wang","Chen Yang","Keyang Zhao","Xiaokang Yang","Wei Shen"],"pdf_url":"https://arxiv.org/pdf/2412.02359v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17459v3","updated":"2024-12-03T10:32:34Z","published":"2024-09-26T01:34:42Z","title":"TFS-NeRF: Template-Free NeRF for Semantic 3D Reconstruction of Dynamic\n Scene","summary":" Despite advancements in Neural Implicit models for 3D surface reconstruction,\nhandling dynamic environments with interactions between arbitrary rigid,\nnon-rigid, or deformable entities remains challenging. The generic\nreconstruction methods adaptable to such dynamic scenes often require\nadditional inputs like depth or optical flow or rely on pre-trained image\nfeatures for reasonable outcomes. These methods typically use latent codes to\ncapture frame-by-frame deformations. Another set of dynamic scene\nreconstruction methods, are entity-specific, mostly focusing on humans, and\nrelies on template models. In contrast, some template-free methods bypass these\nrequirements and adopt traditional LBS (Linear Blend Skinning) weights for a\ndetailed representation of deformable object motions, although they involve\ncomplex optimizations leading to lengthy training times. To this end, as a\nremedy, this paper introduces TFS-NeRF, a template-free 3D semantic NeRF for\ndynamic scenes captured from sparse or single-view RGB videos, featuring\ninteractions among two entities and more time-efficient than other LBS-based\napproaches. Our framework uses an Invertible Neural Network (INN) for LBS\nprediction, simplifying the training process. 
By disentangling the motions of\ninteracting entities and optimizing per-entity skinning weights, our method\nefficiently generates accurate, semantically separable geometries. Extensive\nexperiments demonstrate that our approach produces high-quality reconstructions\nof both deformable and non-deformable objects in complex interactions, with\nimproved training efficiency compared to existing methods.\n","authors":["Sandika Biswas","Qianyi Wu","Biplab Banerjee","Hamid Rezatofighi"],"pdf_url":"https://arxiv.org/pdf/2409.17459v3.pdf","comment":"Accepted in NeurIPS 2024 https://github.com/sbsws88/TFS-NeRF"},{"id":"http://arxiv.org/abs/2412.01801v2","updated":"2024-12-03T10:32:05Z","published":"2024-12-02T18:47:41Z","title":"SceneFactor: Factored Latent 3D Diffusion for Controllable 3D Scene\n Generation","summary":" We present SceneFactor, a diffusion-based approach for large-scale 3D scene\ngeneration that enables controllable generation and effortless editing.\nSceneFactor enables text-guided 3D scene synthesis through our factored\ndiffusion formulation, leveraging latent semantic and geometric manifolds for\ngeneration of arbitrary-sized 3D scenes. While text input enables easy,\ncontrollable generation, text guidance remains imprecise for intuitive,\nlocalized editing and manipulation of the generated 3D scenes. Our factored\nsemantic diffusion generates a proxy semantic space composed of semantic 3D\nboxes that enables controllable editing of generated scenes by adding,\nremoving, changing the size of the semantic 3D proxy boxes that guides\nhigh-fidelity, consistent 3D geometric editing. 
Extensive experiments\ndemonstrate that our approach enables high-fidelity 3D scene synthesis with\neffective controllable editing through our factored diffusion approach.\n","authors":["Alexey Bokhovkin","Quan Meng","Shubham Tulsiani","Angela Dai"],"pdf_url":"https://arxiv.org/pdf/2412.01801v2.pdf","comment":"21 pages, 12 figures; https://alexeybokhovkin.github.io/scenefactor/"},{"id":"http://arxiv.org/abs/2411.11421v3","updated":"2024-12-03T10:23:16Z","published":"2024-11-18T09:46:45Z","title":"Enabling DBSCAN for Very Large-Scale High-Dimensional Spaces","summary":" DBSCAN is one of the most important non-parametric unsupervised data analysis\ntools. By applying DBSCAN to a dataset, two key analytical results can be\nobtained: (1) clustering data points based on density distribution and (2)\nidentifying outliers in the dataset. However, the time complexity of the DBSCAN\nalgorithm is $O(n^2 \\beta)$, where $n$ is the number of data points and $\\beta\n= O(D)$, with $D$ representing the dimensionality of the data space. As a\nresult, DBSCAN becomes computationally infeasible when both $n$ and $D$ are\nlarge. In this paper, we propose a DBSCAN method based on spectral data\ncompression, capable of efficiently processing datasets with a large number of\ndata points ($n$) and high dimensionality ($D$). By preserving only the most\ncritical structural information during the compression process, our method\neffectively removes substantial redundancy and noise. 
Consequently, the\nsolution quality of DBSCAN is significantly improved, enabling more accurate\nand reliable results.\n","authors":["Yongyu Wang"],"pdf_url":"https://arxiv.org/pdf/2411.11421v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02351v1","updated":"2024-12-03T10:15:41Z","published":"2024-12-03T10:15:41Z","title":"Dual Exposure Stereo for Extended Dynamic Range 3D Imaging","summary":" Achieving robust stereo 3D imaging under diverse illumination conditions is\nan important however challenging task, due to the limited dynamic ranges (DRs)\nof cameras, which are significantly smaller than real world DR. As a result,\nthe accuracy of existing stereo depth estimation methods is often compromised\nby under- or over-exposed images. Here, we introduce dual-exposure stereo for\nextended dynamic range 3D imaging. We develop automatic dual-exposure control\nmethod that adjusts the dual exposures, diverging them when the scene DR\nexceeds the camera DR, thereby providing information about broader DR. From the\ncaptured dual-exposure stereo images, we estimate depth using motion-aware\ndual-exposure stereo network. To validate our method, we develop a robot-vision\nsystem, collect stereo video datasets, and generate a synthetic dataset. Our\nmethod outperforms other exposure control methods.\n","authors":["Juhyung Choi","Jinnyeong Kim","Seokjun Choi","Jinwoo Lee","Samuel Brucker","Mario Bijelic","Felix Heide","Seung-Hwan Baek"],"pdf_url":"https://arxiv.org/pdf/2412.02351v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02344v1","updated":"2024-12-03T10:04:15Z","published":"2024-12-03T10:04:15Z","title":"UniForm: A Reuse Attention Mechanism Optimized for Efficient Vision\n Transformers on Edge Devices","summary":" Transformer-based architectures have demonstrated remarkable success across\nvarious domains, but their deployment on edge devices remains challenging due\nto high memory and computational demands. 
In this paper, we introduce a novel\nReuse Attention mechanism, tailored for efficient memory access and\ncomputational optimization, enabling seamless operation on resource-constrained\nplatforms without compromising performance. Unlike traditional multi-head\nattention (MHA), which redundantly computes separate attention matrices for\neach head, Reuse Attention consolidates these computations into a shared\nattention matrix, significantly reducing memory overhead and computational\ncomplexity. Comprehensive experiments on ImageNet-1K and downstream tasks show\nthat the proposed UniForm models leveraging Reuse Attention achieve\nstate-of-the-art imagenet classification accuracy while outperforming existing\nattention mechanisms, such as Linear Attention and Flash Attention, in\ninference speed and memory scalability. Notably, UniForm-l achieves a 76.7%\nTop-1 accuracy on ImageNet-1K with 21.8ms inference time on edge devices like\nthe Jetson AGX Orin, representing up to a 5x speedup over competing benchmark\nmethods. These results demonstrate the versatility of Reuse Attention across\nhigh-performance GPUs and edge platforms, paving the way for broader real-time\napplications\n","authors":["Seul-Ki Yeom","Tae-Ho Kim"],"pdf_url":"https://arxiv.org/pdf/2412.02344v1.pdf","comment":"13 Pages, 8 Tables, 7 Figures"},{"id":"http://arxiv.org/abs/2412.02336v1","updated":"2024-12-03T09:56:38Z","published":"2024-12-03T09:56:38Z","title":"Amodal Depth Anything: Amodal Depth Estimation in the Wild","summary":" Amodal depth estimation aims to predict the depth of occluded (invisible)\nparts of objects in a scene. This task addresses the question of whether models\ncan effectively perceive the geometry of occluded regions based on visible\ncues. Prior methods primarily rely on synthetic datasets and focus on metric\ndepth estimation, limiting their generalization to real-world settings due to\ndomain shifts and scalability challenges. 
In this paper, we propose a novel\nformulation of amodal depth estimation in the wild, focusing on relative depth\nprediction to improve model generalization across diverse natural images. We\nintroduce a new large-scale dataset, Amodal Depth In the Wild (ADIW), created\nusing a scalable pipeline that leverages segmentation datasets and compositing\ntechniques. Depth maps are generated using large pre-trained depth models, and\na scale-and-shift alignment strategy is employed to refine and blend depth\npredictions, ensuring consistency in ground-truth annotations. To tackle the\namodal depth task, we present two complementary frameworks: Amodal-DAV2, a\ndeterministic model based on Depth Anything V2, and Amodal-DepthFM, a\ngenerative model that integrates conditional flow matching principles. Our\nproposed frameworks effectively leverage the capabilities of large pre-trained\nmodels with minimal modifications to achieve high-quality amodal depth\npredictions. Experiments validate our design choices, demonstrating the\nflexibility of our models in generating diverse, plausible depth structures for\noccluded regions. Our method achieves a 69.5% improvement in accuracy over the\nprevious SoTA on the ADIW dataset.\n","authors":["Zhenyu Li","Mykola Lavreniuk","Jian Shi","Shariq Farooq Bhat","Peter Wonka"],"pdf_url":"https://arxiv.org/pdf/2412.02336v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19035v2","updated":"2024-12-03T09:50:03Z","published":"2024-05-29T12:23:29Z","title":"A Good Foundation is Worth Many Labels: Label-Efficient Panoptic\n Segmentation","summary":" A key challenge for the widespread application of learning-based models for\nrobotic perception is to significantly reduce the required amount of annotated\ntraining data while achieving accurate predictions. This is essential not only\nto decrease operating costs but also to speed up deployment time. 
In this work,\nwe address this challenge for PAnoptic SegmenTation with fEw Labels (PASTEL) by\nexploiting the groundwork paved by visual foundation models. We leverage\ndescriptive image features from such a model to train two lightweight network\nheads for semantic segmentation and object boundary detection, using very few\nannotated training samples. We then merge their predictions via a novel fusion\nmodule that yields panoptic maps based on normalized cut. To further enhance\nthe performance, we utilize self-training on unlabeled images selected by a\nfeature-driven similarity scheme. We underline the relevance of our approach by\nemploying PASTEL to important robot perception use cases from autonomous\ndriving and agricultural robotics. In extensive experiments, we demonstrate\nthat PASTEL significantly outperforms previous methods for label-efficient\nsegmentation even when using fewer annotations. The code of our work is\npublicly available at http://pastel.cs.uni-freiburg.de.\n","authors":["Niclas Vödisch","Kürsat Petek","Markus Käppeler","Abhinav Valada","Wolfram Burgard"],"pdf_url":"https://arxiv.org/pdf/2405.19035v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02332v1","updated":"2024-12-03T09:49:43Z","published":"2024-12-03T09:49:43Z","title":"SimuScope: Realistic Endoscopic Synthetic Dataset Generation through\n Surgical Simulation and Diffusion Models","summary":" Computer-assisted surgical (CAS) systems enhance surgical execution and\noutcomes by providing advanced support to surgeons. These systems often rely on\ndeep learning models trained on complex, challenging-to-annotate data. While\nsynthetic data generation can address these challenges, enhancing the realism\nof such data is crucial. This work introduces a multi-stage pipeline for\ngenerating realistic synthetic data, featuring a fully-fledged surgical\nsimulator that automatically produces all necessary annotations for modern CAS\nsystems. 
This simulator generates a wide set of annotations that surpass those\navailable in public synthetic datasets. Additionally, it offers a more complex\nand realistic simulation of surgical interactions, including the dynamics\nbetween surgical instruments and deformable anatomical environments,\noutperforming existing approaches. To further bridge the visual gap between\nsynthetic and real data, we propose a lightweight and flexible image-to-image\ntranslation method based on Stable Diffusion (SD) and Low-Rank Adaptation\n(LoRA). This method leverages a limited amount of annotated data, enables\nefficient training, and maintains the integrity of annotations generated by our\nsimulator. The proposed pipeline is experimentally validated and can translate\nsynthetic images into images with real-world characteristics, which can\ngeneralize to real-world context, thereby improving both training and CAS\nguidance. The code and the dataset are available at\nhttps://github.com/SanoScience/SimuScope.\n","authors":["Sabina Martyniak","Joanna Kaleta","Diego Dall'Alba","Michał Naskręt","Szymon Płotka","Przemysław Korzeniowski"],"pdf_url":"https://arxiv.org/pdf/2412.02332v1.pdf","comment":"Accepted to IEEE/CVF Winter Conference on Applications of Computer\n Vision (WACV) 2025"},{"id":"http://arxiv.org/abs/2412.02322v1","updated":"2024-12-03T09:38:14Z","published":"2024-12-03T09:38:14Z","title":"Controlling the Latent Diffusion Model for Generative Image Shadow\n Removal via Residual Generation","summary":" Large-scale generative models have achieved remarkable advancements in\nvarious visual tasks, yet their application to shadow removal in images remains\nchallenging. These models often generate diverse, realistic details without\nadequate focus on fidelity, failing to meet the crucial requirements of shadow\nremoval, which necessitates precise preservation of image content. 
In contrast\nto prior approaches that aimed to regenerate shadow-free images from scratch,\nthis paper utilizes diffusion models to generate and refine image residuals.\nThis strategy fully uses the inherent detailed information within shadowed\nimages, resulting in a more efficient and faithful reconstruction of\nshadow-free content. Additionally, to revent the accumulation of errors during\nthe generation process, a crosstimestep self-enhancement training strategy is\nproposed. This strategy leverages the network itself to augment the training\ndata, not only increasing the volume of data but also enabling the network to\ndynamically correct its generation trajectory, ensuring a more accurate and\nrobust output. In addition, to address the loss of original details in the\nprocess of image encoding and decoding of large generative models, a\ncontent-preserved encoder-decoder structure is designed with a control\nmechanism and multi-scale skip connections to achieve high-fidelity shadow-free\nimage reconstruction. Experimental results demonstrate that the proposed method\ncan reproduce high-quality results based on a large latent diffusion prior and\nfaithfully preserve the original contents in shadow regions.\n","authors":["Xinjie Li","Yang Zhao","Dong Wang","Yuan Chen","Li Cao","Xiaoping Liu"],"pdf_url":"https://arxiv.org/pdf/2412.02322v1.pdf","comment":"13pages, 10 figures"},{"id":"http://arxiv.org/abs/2412.02317v1","updated":"2024-12-03T09:33:00Z","published":"2024-12-03T09:33:00Z","title":"HumanRig: Learning Automatic Rigging for Humanoid Character in a Large\n Scale Dataset","summary":" With the rapid evolution of 3D generation algorithms, the cost of producing\n3D humanoid character models has plummeted, yet the field is impeded by the\nlack of a comprehensive dataset for automatic rigging, which is a pivotal step\nin character animation. 
Addressing this gap, we present HumanRig, the first\nlarge-scale dataset specifically designed for 3D humanoid character rigging,\nencompassing 11,434 meticulously curated T-posed meshes adhered to a uniform\nskeleton topology. Capitalizing on this dataset, we introduce an innovative,\ndata-driven automatic rigging framework, which overcomes the limitations of\nGNN-based methods in handling complex AI-generated meshes. Our approach\nintegrates a Prior-Guided Skeleton Estimator (PGSE) module, which uses 2D\nskeleton joints to provide a preliminary 3D skeleton, and a Mesh-Skeleton\nMutual Attention Network (MSMAN) that fuses skeleton features with 3D mesh\nfeatures extracted by a U-shaped point transformer. This enables a\ncoarse-to-fine 3D skeleton joint regression and a robust skinning estimation,\nsurpassing previous methods in quality and versatility. This work not only\nremedies the dataset deficiency in rigging research but also propels the\nanimation industry towards more efficient and automated character rigging\npipelines.\n","authors":["Zedong Chu","Feng Xiong","Meiduo Liu","Jinzhi Zhang","Mingqi Shao","Zhaoxu Sun","Di Wang","Mu Xu"],"pdf_url":"https://arxiv.org/pdf/2412.02317v1.pdf","comment":"Website: https://github.com/c8241998/HumanRig"},{"id":"http://arxiv.org/abs/2412.02314v1","updated":"2024-12-03T09:31:16Z","published":"2024-12-03T09:31:16Z","title":"LoCo: Low-Contrast-Enhanced Contrastive Learning for Semi-Supervised\n Endoscopic Image Segmentation","summary":" The segmentation of endoscopic images plays a vital role in computer-aided\ndiagnosis and treatment. The advancements in deep learning have led to the\nemployment of numerous models for endoscopic tumor segmentation, achieving\npromising segmentation performance. Despite recent advancements, precise\nsegmentation remains challenging due to limited annotations and the issue of\nlow contrast. 
To address these issues, we propose a novel semi-supervised\nsegmentation framework termed LoCo via low-contrast-enhanced contrastive\nlearning (LCC). This innovative approach effectively harnesses the vast amounts\nof unlabeled data available for endoscopic image segmentation, improving both\naccuracy and robustness in the segmentation process. Specifically, LCC\nincorporates two advanced strategies to enhance the distinctiveness of\nlow-contrast pixels: inter-class contrast enhancement (ICE) and boundary\ncontrast enhancement (BCE), enabling models to segment low-contrast pixels\namong malignant tumors, benign tumors, and normal tissues. Additionally, a\nconfidence-based dynamic filter (CDF) is designed for pseudo-label selection,\nenhancing the utilization of generated pseudo-labels for unlabeled data with a\nspecific focus on minority classes. Extensive experiments conducted on two\npublic datasets, as well as a large proprietary dataset collected over three\nyears, demonstrate that LoCo achieves state-of-the-art results, significantly\noutperforming previous methods. The source code of LoCo is available at the URL\nof https://github.com/AnoK3111/LoCo.\n","authors":["Lingcong Cai","Yun Li","Xiaomao Fan","Kaixuan Song","Yongcheng Li","Yixuan Yuan","Ruxin Wang","Wenbin Lei"],"pdf_url":"https://arxiv.org/pdf/2412.02314v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02313v1","updated":"2024-12-03T09:30:57Z","published":"2024-12-03T09:30:57Z","title":"Noisy Ostracods: A Fine-Grained, Imbalanced Real-World Dataset for\n Benchmarking Robust Machine Learning and Label Correction Methods","summary":" We present the Noisy Ostracods, a noisy dataset for genus and species\nclassification of crustacean ostracods with specialists' annotations. Over the\n71466 specimens collected, 5.58% of them are estimated to be noisy (possibly\nproblematic) at genus level. The dataset is created to address a real-world\nchallenge: creating a clean fine-grained taxonomy dataset. 
The Noisy Ostracods\ndataset has diverse noises from multiple sources. Firstly, the noise is\nopen-set, including new classes discovered during curation that were not part\nof the original annotation. The dataset has pseudo-classes, where annotators\nmisclassified samples that should belong to an existing class into a new\npseudo-class. The Noisy Ostracods dataset is highly imbalanced with an imbalance\nfactor $\rho$ = 22429. This presents a unique challenge for robust machine\nlearning methods, as existing approaches have not been extensively evaluated on\nfine-grained classification tasks with such diverse real-world noise. Initial\nexperiments using current robust learning techniques have not yielded\nsignificant performance improvements on the Noisy Ostracods dataset compared to\ncross-entropy training on the raw, noisy data. On the other hand, noise\ndetection methods have underperformed in error hit rate compared to naive\ncross-validation ensembling for identifying problematic labels. These findings\nsuggest that the fine-grained, imbalanced nature, and complex noise\ncharacteristics of the dataset present considerable challenges for existing\nnoise-robust algorithms. By openly releasing the Noisy Ostracods dataset, our\ngoal is to encourage further research into the development of noise-resilient\nmachine learning methods capable of effectively handling diverse, real-world\nnoise in fine-grained classification tasks. 
The dataset, along with its\nevaluation protocols, can be accessed at\nhttps://github.com/H-Jamieu/Noisy_ostracods.\n","authors":["Jiamian Hu","Yuanyuan Hong","Yihua Chen","He Wang","Moriaki Yasuhara"],"pdf_url":"https://arxiv.org/pdf/2412.02313v1.pdf","comment":"Initial submit"},{"id":"http://arxiv.org/abs/2412.02310v1","updated":"2024-12-03T09:27:46Z","published":"2024-12-03T09:27:46Z","title":"Active Learning via Classifier Impact and Greedy Selection for\n Interactive Image Retrieval","summary":" Active Learning (AL) is a user-interactive approach aimed at reducing\nannotation costs by selecting the most crucial examples to label. Although AL\nhas been extensively studied for image classification tasks, the specific\nscenario of interactive image retrieval has received relatively little\nattention. This scenario presents unique characteristics, including an open-set\nand class-imbalanced binary classification, starting with very few labeled\nsamples. We introduce a novel batch-mode Active Learning framework named GAL\n(Greedy Active Learning) that better copes with this application. It\nincorporates a new acquisition function for sample selection that measures the\nimpact of each unlabeled sample on the classifier. We further embed this\nstrategy in a greedy selection approach, better exploiting the samples within\neach batch. We evaluate our framework with both linear (SVM) and non-linear\nMLP/Gaussian Process classifiers. For the Gaussian Process case, we show a\ntheoretical guarantee on the greedy approximation. Finally, we assess our\nperformance for the interactive content-based image retrieval task on several\nbenchmarks and demonstrate its superiority over existing approaches and common\nbaselines. 
Code is available at https://github.com/barleah/GreedyAL.\n","authors":["Leah Bar","Boaz Lerner","Nir Darshan","Rami Ben-Ari"],"pdf_url":"https://arxiv.org/pdf/2412.02310v1.pdf","comment":"Accepted to Transactions on Machine Learning Research (TMLR)"},{"id":"http://arxiv.org/abs/2412.02306v1","updated":"2024-12-03T09:21:04Z","published":"2024-12-03T09:21:04Z","title":"Partial Non-rigid Deformations and interpolations of Human Body Surfaces","summary":" Non-rigid shape deformations pose significant challenges, and most existing\nmethods struggle to handle partial deformations effectively. We present Partial\nNon-rigid Deformations and interpolations of the human body Surfaces (PaNDAS),\na new method to learn local and global deformations of 3D surface meshes by\nbuilding on recent deep models. Unlike previous approaches, our method enables\nrestricting deformations to specific parts of the shape in a versatile way and\nallows for mixing and combining various poses from the database, all while not\nrequiring any optimization at inference time. We demonstrate that the proposed\nframework can be used to generate new shapes, interpolate between parts of\nshapes, and perform other shape manipulation tasks with state-of-the-art\naccuracy and greater locality across various types of human surface data. 
Code\nand data will be made available soon.\n","authors":["Thomas Besnier","Emery Pierson","Sylvain Arguillere","Mohamed Daoudi"],"pdf_url":"https://arxiv.org/pdf/2412.02306v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02294v1","updated":"2024-12-03T09:08:38Z","published":"2024-12-03T09:08:38Z","title":"Initial Study On Improving Segmentation By Combining Preoperative CT And\n Intraoperative CBCT Using Synthetic Data","summary":" Computer-Assisted Interventions enable clinicians to perform precise,\nminimally invasive procedures, often relying on advanced imaging methods.\nCone-beam computed tomography (CBCT) can be used to facilitate\ncomputer-assisted interventions, despite often suffering from artifacts that\npose challenges for accurate interpretation. While the degraded image quality\ncan affect image analysis, the availability of high quality, preoperative scans\noffers potential for improvements. Here we consider a setting where\npreoperative CT and intraoperative CBCT scans are available, however, the\nalignment (registration) between the scans is imperfect to simulate a real\nworld scenario. We propose a multimodal learning method that fuses roughly\naligned CBCT and CT scans and investigate the effect on segmentation\nperformance. For this experiment we use synthetically generated data containing\nreal CT and synthetic CBCT volumes with corresponding voxel annotations. We\nshow that this fusion setup improves segmentation performance in $18$ out of\n$20$ investigated setups.\n","authors":["Maximilian E. Tschuchnig","Philipp Steininger","Michael Gadermayr"],"pdf_url":"https://arxiv.org/pdf/2412.02294v1.pdf","comment":"Accepted at BVM 2025. 
arXiv admin note: text overlap with\n arXiv:2406.11650"},{"id":"http://arxiv.org/abs/2411.16316v4","updated":"2024-12-03T09:05:50Z","published":"2024-11-25T12:09:43Z","title":"Monocular Lane Detection Based on Deep Learning: A Survey","summary":" Lane detection plays an important role in autonomous driving perception\nsystems. As deep learning algorithms gain popularity, monocular lane detection\nmethods based on them have demonstrated superior performance and emerged as a\nkey research direction in autonomous driving perception. The core designs of\nthese algorithmic frameworks can be summarized as follows: (1) Task paradigm,\nfocusing on lane instance-level discrimination; (2) Lane modeling, representing\nlanes as a set of learnable parameters in the neural network; (3) Global\ncontext supplementation, enhancing inference on the obscure lanes; (4)\nPerspective effect elimination, providing accurate 3D lanes for downstream\napplications. From these perspectives, this paper presents a comprehensive\noverview of existing methods, encompassing both the increasingly mature 2D lane\ndetection approaches and the developing 3D lane detection works. Besides, this\npaper compares the performance of mainstream methods on different benchmarks\nand investigates their inference speed under a unified setting for fair\ncomparison. Moreover, we present some extended works on lane detection,\nincluding multi-task perception, video lane detection, online high-definition\nmap construction, and lane topology reasoning, to offer readers a comprehensive\nroadmap for the evolution of lane detection. Finally, we point out some\npotential future research directions in this field. 
We exhaustively collect the\npapers and codes of existing works at\nhttps://github.com/Core9724/Awesome-Lane-Detection and will keep tracing the\nresearch.\n","authors":["Xin He","Haiyun Guo","Kuan Zhu","Bingke Zhu","Xu Zhao","Jianwu Fang","Jinqiao Wang"],"pdf_url":"https://arxiv.org/pdf/2411.16316v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02287v1","updated":"2024-12-03T09:05:32Z","published":"2024-12-03T09:05:32Z","title":"Viewpoint Consistency in 3D Generation via Attention and CLIP Guidance","summary":" Despite recent advances in text-to-3D generation techniques, current methods\noften suffer from geometric inconsistencies, commonly referred to as the Janus\nProblem. This paper identifies the root cause of the Janus Problem: viewpoint\ngeneration bias in diffusion models, which creates a significant gap between\nthe actual generated viewpoint and the expected one required for optimizing the\n3D model. To address this issue, we propose a tuning-free approach called the\nAttention and CLIP Guidance (ACG) mechanism. ACG enhances desired viewpoints by\nadaptively controlling cross-attention maps, employs CLIP-based view-text\nsimilarities to filter out erroneous viewpoints, and uses a coarse-to-fine\noptimization strategy with staged prompts to progressively refine 3D\ngeneration. 
Extensive experiments demonstrate that our method significantly\nreduces the Janus Problem without compromising generation speed, establishing\nACG as an efficient, plug-and-play component for existing text-to-3D\nframeworks.\n","authors":["Qing Zhang","Zehao Chen","Jinguang Tong","Jing Zhang","Jie Hong","Xuesong Li"],"pdf_url":"https://arxiv.org/pdf/2412.02287v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10099v3","updated":"2024-12-03T09:01:32Z","published":"2024-03-15T08:44:56Z","title":"KP-RED: Exploiting Semantic Keypoints for Joint 3D Shape Retrieval and\n Deformation","summary":" In this paper, we present KP-RED, a unified KeyPoint-driven REtrieval and\nDeformation framework that takes object scans as input and jointly retrieves\nand deforms the most geometrically similar CAD models from a pre-processed\ndatabase to tightly match the target. Unlike existing dense matching based\nmethods that typically struggle with noisy partial scans, we propose to\nleverage category-consistent sparse keypoints to naturally handle both full and\npartial object scans. Specifically, we first employ a lightweight retrieval\nmodule to establish a keypoint-based embedding space, measuring the similarity\namong objects by dynamically aggregating deformation-aware local-global\nfeatures around extracted keypoints. Objects that are close in the embedding\nspace are considered similar in geometry. Then we introduce the neural\ncage-based deformation module that estimates the influence vector of each\nkeypoint upon cage vertices inside its local support region to control the\ndeformation of the retrieved shape. Extensive experiments on the synthetic\ndataset PartNet and the real-world dataset Scan2CAD demonstrate that KP-RED\nsurpasses existing state-of-the-art approaches by a large margin. 
Codes and\ntrained models are released on https://github.com/lolrudy/KP-RED.\n","authors":["Ruida Zhang","Chenyangguang Zhang","Yan Di","Fabian Manhardt","Xingyu Liu","Federico Tombari","Xiangyang Ji"],"pdf_url":"https://arxiv.org/pdf/2403.10099v3.pdf","comment":"Accepted by CVPR 2024. We identified an error in our baseline\n experiments, re-ran them, and updated the results without impacting the\n paper's conclusions. We apologize for the oversight and appreciate your\n understanding"},{"id":"http://arxiv.org/abs/2412.02280v1","updated":"2024-12-03T08:55:10Z","published":"2024-12-03T08:55:10Z","title":"AH-OCDA: Amplitude-based Curriculum Learning and Hopfield Segmentation\n Model for Open Compound Domain Adaptation","summary":" Open compound domain adaptation (OCDA) is a practical domain adaptation\nproblem that consists of a source domain, target compound domain, and unseen\nopen domain. In this problem, the absence of domain labels and pixel-level\nsegmentation labels for both compound and open domains poses challenges to the\ndirect application of existing domain adaptation and generalization methods. To\naddress this issue, we propose Amplitude-based curriculum learning and a\nHopfield segmentation model for Open Compound Domain Adaptation (AH-OCDA). Our\nmethod comprises two complementary components: 1) amplitude-based curriculum\nlearning and 2) Hopfield segmentation model. Without prior knowledge of target\ndomains within the compound domains, amplitude-based curriculum learning\ngradually induces the semantic segmentation model to adapt from the near-source\ncompound domain to the far-source compound domain by ranking unlabeled compound\ndomain images through Fast Fourier Transform (FFT). Additionally, the Hopfield\nsegmentation model maps segmentation feature distributions from arbitrary\ndomains to the feature distributions of the source domain. 
AH-OCDA achieves\nstate-of-the-art performance on two OCDA benchmarks and extended open domains,\ndemonstrating its adaptability to continuously changing compound domains and\nunseen open domains.\n","authors":["Jaehyun Choi","Junwon Ko","Dong-Jae Lee","Junmo Kim"],"pdf_url":"https://arxiv.org/pdf/2412.02280v1.pdf","comment":"WACV 2025"},{"id":"http://arxiv.org/abs/2412.02275v1","updated":"2024-12-03T08:48:30Z","published":"2024-12-03T08:48:30Z","title":"PCIM: Learning Pixel Attributions via Pixel-wise Channel Isolation\n Mixing in High Content Imaging","summary":" Deep Neural Networks (DNNs) have shown remarkable success in various computer\nvision tasks. However, their black-box nature often leads to difficulty in\ninterpreting their decisions, creating an unfilled need for methods to explain\nthe decisions, and ultimately forming a barrier to their wide acceptance\nespecially in biomedical applications. This work introduces a novel method,\nPixel-wise Channel Isolation Mixing (PCIM), to calculate pixel attribution\nmaps, highlighting the image parts most crucial for a classification decision\nbut without the need to extract internal network states or gradients. Unlike\nexisting methods, PCIM treats each pixel as a distinct input channel and trains\na blending layer to mix these pixels, reflecting specific classifications. This\nunique approach allows the generation of pixel attribution maps for each image,\nbut agnostic to the choice of the underlying classification network. Benchmark\ntesting on three application relevant, diverse high content Imaging datasets\nshow state-of-the-art performance, particularly for model fidelity and\nlocalization ability in both, fluorescence and bright field High Content\nImaging. 
PCIM contributes as a unique and effective method for creating\npixel-level attribution maps from arbitrary DNNs, enabling interpretability and\ntrust.\n","authors":["Daniel Siegismund","Mario Wieser","Stephan Heyse","Stephan Steigele"],"pdf_url":"https://arxiv.org/pdf/2412.02275v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.10929v5","updated":"2024-12-03T08:48:21Z","published":"2024-10-14T16:35:27Z","title":"ASTM :Autonomous Smart Traffic Management System Using Artificial\n Intelligence CNN and LSTM","summary":" In the modern world, the development of Artificial Intelligence (AI) has\ncontributed to improvements in various areas, including automation, computer\nvision, fraud detection, and more. AI can be leveraged to enhance the\nefficiency of Autonomous Smart Traffic Management (ASTM) systems and reduce\ntraffic congestion rates. This paper presents an Autonomous Smart Traffic\nManagement (STM) system that uses AI to improve traffic flow rates. The system\nemploys the YOLO V5 Convolutional Neural Network to detect vehicles in traffic\nmanagement images. Additionally, it predicts the number of vehicles for the\nnext 12 hours using a Recurrent Neural Network with Long Short-Term Memory\n(RNN-LSTM). The Smart Traffic Management Cycle Length Analysis manages the\ntraffic cycle length based on these vehicle predictions, aided by AI. From the\nresults of the RNN-LSTM model for predicting vehicle numbers over the next 12\nhours, we observe that the model predicts traffic with a Mean Squared Error\n(MSE) of 4.521 vehicles and a Root Mean Squared Error (RMSE) of 2.232 vehicles.\nAfter simulating the STM system in the CARLA simulation environment, we found\nthat the Traffic Management Congestion Flow Rate with ASTM (21 vehicles per\nminute) is 50\\% higher than the rate without STM (around 15 vehicles per\nminute). 
Additionally, the Traffic Management Vehicle Pass Delay with STM (5\nseconds per vehicle) is 70\\% lower than without STM (around 12 seconds per\nvehicle). These results demonstrate that the STM system using AI can increase\ntraffic flow by 50\\% and reduce vehicle pass delays by 70\\%.\n","authors":["Christofel Rio Goenawan"],"pdf_url":"https://arxiv.org/pdf/2410.10929v5.pdf","comment":"In process to IEEE Intelligent Vehicle Symposium 2025"},{"id":"http://arxiv.org/abs/2412.02270v1","updated":"2024-12-03T08:41:11Z","published":"2024-12-03T08:41:11Z","title":"Sustainable Self-evolution Adversarial Training","summary":" With the wide application of deep neural network models in various computer\nvision tasks, there has been a proliferation of adversarial example generation\nstrategies aimed at deeply exploring model security. However, existing\nadversarial training defense models, which rely on single or limited types of\nattacks under a one-time learning process, struggle to adapt to the dynamic and\nevolving nature of attack methods. Therefore, to achieve defense performance\nimprovements for models in long-term applications, we propose a novel\nSustainable Self-Evolution Adversarial Training (SSEAT) framework.\nSpecifically, we introduce a continual adversarial defense pipeline to realize\nlearning from various kinds of adversarial examples across multiple stages.\nAdditionally, to address the issue of model catastrophic forgetting caused by\ncontinual learning from ongoing novel attacks, we propose an adversarial data\nreplay module to better select more diverse and key relearning data.\nFurthermore, we design a consistency regularization strategy to encourage\ncurrent defense models to learn more from previously trained ones, guiding them\nto retain more past knowledge and maintain accuracy on clean samples. 
Extensive\nexperiments have been conducted to verify the efficacy of the proposed SSEAT\ndefense method, which demonstrates superior defense performance and\nclassification accuracy compared to competitors.\n","authors":["Wenxuan Wang","Chenglei Wang","Huihui Qi","Menghao Ye","Xuelin Qian","Peng Wang","Yanning Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.02270v1.pdf","comment":"Accepted to ACMMM 2024"},{"id":"http://arxiv.org/abs/2412.02267v1","updated":"2024-12-03T08:38:44Z","published":"2024-12-03T08:38:44Z","title":"GSGTrack: Gaussian Splatting-Guided Object Pose Tracking from RGB Videos","summary":" Tracking the 6DoF pose of unknown objects in monocular RGB video sequences is\ncrucial for robotic manipulation. However, existing approaches typically rely\non accurate depth information, which is non-trivial to obtain in real-world\nscenarios. Although depth estimation algorithms can be employed, geometric\ninaccuracy can lead to failures in RGBD-based pose tracking methods. To address\nthis challenge, we introduce GSGTrack, a novel RGB-based pose tracking\nframework that jointly optimizes geometry and pose. Specifically, we adopt 3D\nGaussian Splatting to create an optimizable 3D representation, which is learned\nsimultaneously with a graph-based geometry optimization to capture the object's\nappearance features and refine its geometry. However, the joint optimization\nprocess is susceptible to perturbations from noisy pose and geometry data.\nThus, we propose an object silhouette loss to address the issue of pixel-wise\nloss being overly sensitive to pose noise during tracking. To mitigate the\ngeometric ambiguities caused by inaccurate depth information, we propose a\ngeometry-consistent image pair selection strategy, which filters out\nlow-confidence pairs and ensures robust geometric optimization. 
Extensive\nexperiments on the OnePose and HO3D datasets demonstrate the effectiveness of\nGSGTrack in both 6DoF pose tracking and object reconstruction.\n","authors":["Zhiyuan Chen","Fan Lu","Guo Yu","Bin Li","Sanqing Qu","Yuan Huang","Changhong Fu","Guang Chen"],"pdf_url":"https://arxiv.org/pdf/2412.02267v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02265v1","updated":"2024-12-03T08:37:28Z","published":"2024-12-03T08:37:28Z","title":"Diabetic Retinopathy Classification from Retinal Images using Machine\n Learning Approaches","summary":" Diabetic Retinopathy is one of the most familiar diseases and is a diabetes\ncomplication that affects eyes. Initially, diabetic retinopathy may cause no\nsymptoms or only mild vision problems. Eventually, it can cause blindness. So\nearly detection of symptoms could help to avoid blindness. In this paper, we\npresent some experiments on some features of diabetic retinopathy, like\nproperties of exudates, properties of blood vessels and properties of\nmicroaneurysm. Using the features, we can classify healthy, mild\nnon-proliferative, moderate non-proliferative, severe non-proliferative and\nproliferative stages of DR. Support Vector Machine, Random Forest and Naive\nBayes classifiers are used to classify the stages. Finally, Random Forest is\nfound to be the best for higher accuracy, sensitivity and specificity of 76.5%,\n77.2% and 93.3% respectively.\n","authors":["Indronil Bhattacharjee"," Al-Mahmud","Tareq Mahmud"],"pdf_url":"https://arxiv.org/pdf/2412.02265v1.pdf","comment":"5 pages, 9 figures, 2 tables. 
International Conference on Advanced\n Engineering, Technology and Applications (ICAETA-2021), Istanbul, Turkey"},{"id":"http://arxiv.org/abs/2412.02262v1","updated":"2024-12-03T08:34:42Z","published":"2024-12-03T08:34:42Z","title":"Composing Open-domain Vision with RAG for Ocean Monitoring and\n Conservation","summary":" Climate change's destruction of marine biodiversity is threatening\ncommunities and economies around the world which rely on healthy oceans for\ntheir livelihoods. The challenge of applying computer vision to niche,\nreal-world domains such as ocean conservation lies in the dynamic and diverse\nenvironments where traditional top-down learning struggle with long-tailed\ndistributions, generalization, and domain transfer. Scalable species\nidentification for ocean monitoring is particularly difficult due to the need\nto adapt models to new environments and identify rare or unseen species. To\novercome these limitations, we propose leveraging bottom-up, open-domain\nlearning frameworks as a resilient, scalable solution for image and video\nanalysis in marine applications. Our preliminary demonstration uses pretrained\nvision-language models (VLMs) combined with retrieval-augmented generation\n(RAG) as grounding, leaving the door open for numerous architectural, training\nand engineering optimizations. We validate this approach through a preliminary\napplication in classifying fish from video onboard fishing vessels,\ndemonstrating impressive emergent retrieval and prediction capabilities without\ndomain-specific training or knowledge of the task itself.\n","authors":["Sepand Dyanatkar","Angran Li","Alexander Dungate"],"pdf_url":"https://arxiv.org/pdf/2412.02262v1.pdf","comment":"Accepted to Climate Change AI Workshop at NeurIPS 2024. 
9 pages, 6\n figures, 1 table"},{"id":"http://arxiv.org/abs/2412.02261v1","updated":"2024-12-03T08:34:41Z","published":"2024-12-03T08:34:41Z","title":"Diffusion Implicit Policy for Unpaired Scene-aware Motion Synthesis","summary":" Human motion generation is a long-standing problem, and scene-aware motion\nsynthesis has been widely researched recently due to its numerous applications.\nPrevailing methods rely heavily on paired motion-scene data whose quantity is\nlimited. Meanwhile, it is difficult to generalize to diverse scenes when\ntrained only on a few specific ones. Thus, we propose a unified framework,\ntermed Diffusion Implicit Policy (DIP), for scene-aware motion synthesis, where\npaired motion-scene data are no longer necessary. In this framework, we\ndisentangle human-scene interaction from motion synthesis during training and\nthen introduce an interaction-based implicit policy into motion diffusion\nduring inference. Synthesized motion can be derived through iterative diffusion\ndenoising and implicit policy optimization, thus motion naturalness and\ninteraction plausibility can be maintained simultaneously. The proposed\nimplicit policy optimizes the intermediate noised motion in a GAN Inversion\nmanner to maintain motion continuity and control keyframe poses though the\nControlNet branch and motion inpainting. For long-term motion synthesis, we\nintroduce motion blending for stable transitions between multiple sub-tasks,\nwhere motions are fused in rotation power space and translation linear space.\nThe proposed method is evaluated on synthesized scenes with ShapeNet furniture,\nand real scenes from PROX and Replica. Results show that our framework presents\nbetter motion naturalness and interaction plausibility than cutting-edge\nmethods. 
This also indicates the feasibility of utilizing the DIP for motion\nsynthesis in more general tasks and versatile scenes.\nhttps://jingyugong.github.io/DiffusionImplicitPolicy/\n","authors":["Jingyu Gong","Chong Zhang","Fengqi Liu","Ke Fan","Qianyu Zhou","Xin Tan","Zhizhong Zhang","Yuan Xie","Lizhuang Ma"],"pdf_url":"https://arxiv.org/pdf/2412.02261v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02259v1","updated":"2024-12-03T08:33:50Z","published":"2024-12-03T08:33:50Z","title":"VideoGen-of-Thought: A Collaborative Framework for Multi-Shot Video\n Generation","summary":" Current video generation models excel at generating short clips but still\nstruggle with creating multi-shot, movie-like videos. Existing models trained\non large-scale data on the back of rich computational resources are\nunsurprisingly inadequate for maintaining a logical storyline and visual\nconsistency across multiple shots of a cohesive script since they are often\ntrained with a single-shot objective. To this end, we propose\nVideoGen-of-Thought (VGoT), a collaborative and training-free architecture\ndesigned specifically for multi-shot video generation. VGoT is designed with\nthree goals in mind as follows. Multi-Shot Video Generation: We divide the\nvideo generation process into a structured, modular sequence, including (1)\nScript Generation, which translates a curt story into detailed prompts for each\nshot; (2) Keyframe Generation, responsible for creating visually consistent\nkeyframes faithful to character portrayals; and (3) Shot-Level Video\nGeneration, which transforms information from scripts and keyframes into shots;\n(4) Smoothing Mechanism that ensures a consistent multi-shot output. Reasonable\nNarrative Design: Inspired by cinematic scriptwriting, our prompt generation\napproach spans five key domains, ensuring logical consistency, character\ndevelopment, and narrative flow across the entire video. 
Cross-Shot\nConsistency: We ensure temporal and identity consistency by leveraging\nidentity-preserving (IP) embeddings across shots, which are automatically\ncreated from the narrative. Additionally, we incorporate a cross-shot smoothing\nmechanism, which integrates a reset boundary that effectively combines latent\nfeatures from adjacent shots, resulting in smooth transitions and maintaining\nvisual coherence throughout the video. Our experiments demonstrate that VGoT\nsurpasses existing video generation methods in producing high-quality,\ncoherent, multi-shot videos.\n","authors":["Mingzhe Zheng","Yongqi Xu","Haojian Huang","Xuran Ma","Yexin Liu","Wenjie Shu","Yatian Pang","Feilong Tang","Qifeng Chen","Harry Yang","Ser-Nam Lim"],"pdf_url":"https://arxiv.org/pdf/2412.02259v1.pdf","comment":"Webpage: https://cheliosoops.github.io/VGoT"},{"id":"http://arxiv.org/abs/2412.02254v1","updated":"2024-12-03T08:30:59Z","published":"2024-12-03T08:30:59Z","title":"ProbPose: A Probabilistic Approach to 2D Human Pose Estimation","summary":" Current Human Pose Estimation methods have achieved significant improvements.\nHowever, state-of-the-art models ignore out-of-image keypoints and use\nuncalibrated heatmaps as keypoint location representations. To address these\nlimitations, we propose ProbPose, which predicts for each keypoint: a\ncalibrated probability of keypoint presence at each location in the activation\nwindow, the probability of being outside of it, and its predicted visibility.\nTo address the lack of evaluation protocols for out-of-image keypoints, we\nintroduce the CropCOCO dataset and the Extended OKS (Ex-OKS) metric, which\nextends OKS to out-of-image points. Tested on COCO, CropCOCO, and OCHuman,\nProbPose shows significant gains in out-of-image keypoint localization while\nalso improving in-image localization through data augmentation. 
Additionally,\nthe model improves robustness along the edges of the bounding box and offers\nbetter flexibility in keypoint evaluation. The code and models are available on\nhttps://mirapurkrabek.github.io/ProbPose/ for research purposes.\n","authors":["Miroslav Purkrabek","Jiri Matas"],"pdf_url":"https://arxiv.org/pdf/2412.02254v1.pdf","comment":"Code: https://mirapurkrabek.github.io/ProbPose/"},{"id":"http://arxiv.org/abs/2412.00115v2","updated":"2024-12-03T08:27:44Z","published":"2024-11-28T07:01:06Z","title":"OpenHumanVid: A Large-Scale High-Quality Dataset for Enhancing\n Human-Centric Video Generation","summary":" Recent advancements in visual generation technologies have markedly increased\nthe scale and availability of video datasets, which are crucial for training\neffective video generation models. However, a significant lack of high-quality,\nhuman-centric video datasets presents a challenge to progress in this field. To\nbridge this gap, we introduce OpenHumanVid, a large-scale and high-quality\nhuman-centric video dataset characterized by precise and detailed captions that\nencompass both human appearance and motion states, along with supplementary\nhuman motion conditions, including skeleton sequences and speech audio. To\nvalidate the efficacy of this dataset and the associated training strategies,\nwe propose an extension of existing classical diffusion transformer\narchitectures and conduct further pretraining of our models on the proposed\ndataset. Our findings yield two critical insights: First, the incorporation of\na large-scale, high-quality dataset substantially enhances evaluation metrics\nfor generated human videos while preserving performance in general video\ngeneration tasks. Second, the effective alignment of text with human\nappearance, human motion, and facial motion is essential for producing\nhigh-quality video outputs. 
Based on these insights and corresponding\nmethodologies, the straightforward extended network trained on the proposed\ndataset demonstrates an obvious improvement in the generation of human-centric\nvideos. Project page https://fudan-generative-vision.github.io/OpenHumanVid\n","authors":["Hui Li","Mingwang Xu","Yun Zhan","Shan Mu","Jiaye Li","Kaihui Cheng","Yuxuan Chen","Tan Chen","Mao Ye","Jingdong Wang","Siyu Zhu"],"pdf_url":"https://arxiv.org/pdf/2412.00115v2.pdf","comment":"11 pages, 8 figures, 5 tables"},{"id":"http://arxiv.org/abs/2412.02250v1","updated":"2024-12-03T08:27:20Z","published":"2024-12-03T08:27:20Z","title":"Vision Transformers for Weakly-Supervised Microorganism Enumeration","summary":" Microorganism enumeration is an essential task in many applications, such as\nassessing contamination levels or ensuring health standards when evaluating\nsurface cleanliness. However, it's traditionally performed by human-supervised\nmethods that often require manual counting, making it tedious and\ntime-consuming. Previous research suggests automating this task using computer\nvision and machine learning methods, primarily through instance segmentation or\ndensity estimation techniques. This study conducts a comparative analysis of\nvision transformers (ViTs) for weakly-supervised counting in microorganism\nenumeration, contrasting them with traditional architectures such as ResNet and\ninvestigating ViT-based models such as TransCrowd. We trained different\nversions of ViTs as the architectural backbone for feature extraction using\nfour microbiology datasets to determine potential new approaches for total\nmicroorganism enumeration in images. Results indicate that while ResNets\nperform better overall, ViTs performance demonstrates competent results across\nall datasets, opening up promising lines of research in microorganism\nenumeration. 
This comparative study contributes to the field of microbial image\nanalysis by presenting innovative approaches to the recurring challenge of\nmicroorganism enumeration and by highlighting the capabilities of ViTs in the\ntask of regression counting.\n","authors":["Javier Ureña Santiago","Thomas Ströhle","Antonio Rodríguez-Sánchez","Ruth Breu"],"pdf_url":"https://arxiv.org/pdf/2412.02250v1.pdf","comment":"8 pages, 3 figures, 3 tables, conference"},{"id":"http://arxiv.org/abs/2412.02249v1","updated":"2024-12-03T08:27:17Z","published":"2024-12-03T08:27:17Z","title":"Multi-robot autonomous 3D reconstruction using Gaussian splatting with\n Semantic guidance","summary":" Implicit neural representations and 3D Gaussian splatting (3DGS) have shown\ngreat potential for scene reconstruction. Recent studies have expanded their\napplications in autonomous reconstruction through task assignment methods.\nHowever, these methods are mainly limited to single robot, and rapid\nreconstruction of large-scale scenes remains challenging. Additionally,\ntask-driven planning based on surface uncertainty is prone to being trapped in\nlocal optima. To this end, we propose the first 3DGS-based centralized\nmulti-robot autonomous 3D reconstruction framework. To further reduce time cost\nof task generation and improve reconstruction quality, we integrate online\nopen-vocabulary semantic segmentation with surface uncertainty of 3DGS,\nfocusing view sampling on regions with high instance uncertainty. Finally, we\ndevelop a multi-robot collaboration strategy with mode and task assignments\nimproving reconstruction quality while ensuring planning efficiency. Our method\ndemonstrates the highest reconstruction quality among all planning methods and\nsuperior planning efficiency compared to existing multi-robot methods. 
We\ndeploy our method on multiple robots, and results show that it can effectively\nplan view paths and reconstruct scenes with high quality.\n","authors":["Jing Zeng","Qi Ye","Tianle Liu","Yang Xu","Jin Li","Jinming Xu","Liang Li","Jiming Chen"],"pdf_url":"https://arxiv.org/pdf/2412.02249v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02245v1","updated":"2024-12-03T08:18:56Z","published":"2024-12-03T08:18:56Z","title":"SparseLGS: Sparse View Language Embedded Gaussian Splatting","summary":" Recently, several studies have combined Gaussian Splatting to obtain scene\nrepresentations with language embeddings for open-vocabulary 3D scene\nunderstanding. While these methods perform well, they essentially require very\ndense multi-view inputs, limiting their applicability in real-world scenarios.\nIn this work, we propose SparseLGS to address the challenge of 3D scene\nunderstanding with pose-free and sparse view input images. Our method leverages\na learning-based dense stereo model to handle pose-free and sparse inputs, and\na three-step region matching approach to address the multi-view semantic\ninconsistency problem, which is especially important for sparse inputs.\nDifferent from directly learning high-dimensional CLIP features, we extract\nlow-dimensional information and build bijections to avoid excessive learning\nand storage costs. We introduce a reconstruction loss during semantic training\nto improve Gaussian positions and shapes. To the best of our knowledge, we are\nthe first to address the 3D semantic field problem with sparse pose-free\ninputs. Experimental results show that SparseLGS achieves comparable quality\nwhen reconstructing semantic fields with fewer inputs (3-4 views) compared to\nprevious SOTA methods with dense input. Besides, when using the same sparse\ninput, SparseLGS leads significantly in quality and heavily improves the\ncomputation speed (5$\\times$ speedup). 
Project page: {\\tt\\small\n\\url{https://ustc3dv.github.io/SparseLGS}}\n","authors":["Jun Hu","Zhang Chen","Zhong Li","Yi Xu","Juyong Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.02245v1.pdf","comment":"Project Page: https://ustc3dv.github.io/SparseLGS"},{"id":"http://arxiv.org/abs/2403.16469v2","updated":"2024-12-03T08:18:17Z","published":"2024-03-25T06:50:25Z","title":"Learning from Reduced Labels for Long-Tailed Data","summary":" Long-tailed data is prevalent in real-world classification tasks and heavily\nrelies on supervised information, which makes the annotation process\nexceptionally labor-intensive and time-consuming. Unfortunately, despite being\na common approach to mitigate labeling costs, existing weakly supervised\nlearning methods struggle to adequately preserve supervised information for\ntail samples, resulting in a decline in accuracy for the tail classes. To\nalleviate this problem, we introduce a novel weakly supervised labeling setting\ncalled Reduced Label. The proposed labeling setting not only avoids the decline\nof supervised information for the tail samples, but also decreases the labeling\ncosts associated with long-tailed data. Additionally, we propose an\nstraightforward and highly efficient unbiased framework with strong theoretical\nguarantees to learn from these Reduced Labels. 
Extensive experiments conducted\non benchmark datasets including ImageNet validate the effectiveness of our\napproach, surpassing the performance of state-of-the-art weakly supervised\nmethods.\n","authors":["Meng Wei","Zhongnian Li","Yong Zhou","Xinzheng Xu"],"pdf_url":"https://arxiv.org/pdf/2403.16469v2.pdf","comment":"11 pages, 3 figures"},{"id":"http://arxiv.org/abs/2412.02242v1","updated":"2024-12-03T08:11:06Z","published":"2024-12-03T08:11:06Z","title":"U-Net in Medical Image Segmentation: A Review of Its Applications Across\n Modalities","summary":" Medical imaging is essential in healthcare to provide key insights into\npatient anatomy and pathology, aiding in diagnosis and treatment. Non-invasive\ntechniques such as X-ray, Magnetic Resonance Imaging (MRI), Computed Tomography\n(CT), and Ultrasound (US), capture detailed images of organs, tissues, and\nabnormalities. Effective analysis of these images requires precise segmentation\nto delineate regions of interest (ROI), such as organs or lesions. Traditional\nsegmentation methods, relying on manual feature-extraction, are labor-intensive\nand vary across experts. Recent advancements in Artificial Intelligence (AI)\nand Deep Learning (DL), particularly convolutional models such as U-Net and its\nvariants (U-Net++ and U-Net 3+), have transformed medical image segmentation\n(MIS) by automating the process and enhancing accuracy. These models enable\nefficient, precise pixel-wise classification across various imaging modalities,\novercoming the limitations of manual segmentation. This review explores various\nmedical imaging techniques, examines the U-Net architectures and their\nadaptations, and discusses their application across different modalities. 
It\nalso identifies common challenges in MIS and proposes potential solutions.\n","authors":["Fnu Neha","Deepshikha Bhati","Deepak Kumar Shukla","Sonavi Makarand Dalvi","Nikolaos Mantzou","Safa Shubbar"],"pdf_url":"https://arxiv.org/pdf/2412.02242v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02241v1","updated":"2024-12-03T08:10:53Z","published":"2024-12-03T08:10:53Z","title":"Fast LiDAR Data Generation with Rectified Flows","summary":" Building LiDAR generative models holds promise as powerful data priors for\nrestoration, scene manipulation, and scalable simulation in autonomous mobile\nrobots. In recent years, approaches using diffusion models have emerged,\nsignificantly improving training stability and generation quality. Despite the\nsuccess of diffusion models, generating high-quality samples requires numerous\niterations of running neural networks, and the increasing computational cost\ncan pose a barrier to robotics applications. To address this challenge, this\npaper presents R2Flow, a fast and high-fidelity generative model for LiDAR\ndata. Our method is based on rectified flows that learn straight trajectories,\nsimulating data generation with much fewer sampling steps against diffusion\nmodels. We also propose a efficient Transformer-based model architecture for\nprocessing the image representation of LiDAR range and reflectance\nmeasurements. 
Our experiments on the unconditional generation of the KITTI-360\ndataset demonstrate the effectiveness of our approach in terms of both\nefficiency and quality.\n","authors":["Kazuto Nakashima","Xiaowen Liu","Tomoya Miyawaki","Yumi Iwashita","Ryo Kurazume"],"pdf_url":"https://arxiv.org/pdf/2412.02241v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02237v1","updated":"2024-12-03T08:05:56Z","published":"2024-12-03T08:05:56Z","title":"Cross-Attention Head Position Patterns Can Align with Human Visual\n Concepts in Text-to-Image Generative Models","summary":" Recent text-to-image diffusion models leverage cross-attention layers, which\nhave been effectively utilized to enhance a range of visual generative tasks.\nHowever, our understanding of cross-attention layers remains somewhat limited.\nIn this study, we present a method for constructing Head Relevance Vectors\n(HRVs) that align with useful visual concepts. An HRV for a given visual\nconcept is a vector with a length equal to the total number of cross-attention\nheads, where each element represents the importance of the corresponding head\nfor the given visual concept. We develop and employ an ordered weakening\nanalysis to demonstrate the effectiveness of HRVs as interpretable features. To\ndemonstrate the utility of HRVs, we propose concept strengthening and concept\nadjusting methods and apply them to enhance three visual generative tasks. We\nshow that misinterpretations of polysemous words in image generation can be\ncorrected in most cases, five challenging attributes in image editing can be\nsuccessfully modified, and catastrophic neglect in multi-concept generation can\nbe mitigated. 
Overall, our work provides an advancement in understanding\ncross-attention layers and introduces new approaches for fine-controlling these\nlayers at the head level.\n","authors":["Jungwon Park","Jungmin Ko","Dongnam Byun","Jangwon Suh","Wonjong Rhee"],"pdf_url":"https://arxiv.org/pdf/2412.02237v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02234v1","updated":"2024-12-03T08:02:26Z","published":"2024-12-03T08:02:26Z","title":"CubeFormer: A Simple yet Effective Baseline for Lightweight Image\n Super-Resolution","summary":" Lightweight image super-resolution (SR) methods aim at increasing the\nresolution and restoring the details of an image using a lightweight neural\nnetwork. However, current lightweight SR methods still suffer from inferior\nperformance and unpleasant details. Our analysis reveals that these methods are\nhindered by constrained feature diversity, which adversely impacts feature\nrepresentation and detail recovery. To respond this issue, we propose a simple\nyet effective baseline called CubeFormer, designed to enhance feature richness\nby completing holistic information aggregation. To be specific, we introduce\ncube attention, which expands 2D attention to 3D space, facilitating exhaustive\ninformation interactions, further encouraging comprehensive information\nextraction and promoting feature variety. In addition, we inject block and grid\nsampling strategies to construct intra-cube transformer blocks (Intra-CTB) and\ninter-cube transformer blocks (Inter-CTB), which perform local and global\nmodeling, respectively. Extensive experiments show that our CubeFormer achieves\nstate-of-the-art performance on commonly used SR benchmarks. 
Our source code\nand models will be publicly available.\n","authors":["Jikai Wang","Huan Zheng","Jianbing Shen"],"pdf_url":"https://arxiv.org/pdf/2412.02234v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.16515v2","updated":"2024-12-03T07:48:06Z","published":"2024-11-25T15:57:19Z","title":"PriorPath: Coarse-To-Fine Approach for Controlled De-Novo Pathology\n Semantic Masks Generation","summary":" Incorporating artificial intelligence (AI) into digital pathology offers\npromising prospects for automating and enhancing tasks such as image analysis\nand diagnostic processes. However, the diversity of tissue samples and the\nnecessity for meticulous image labeling often result in biased datasets,\nconstraining the applicability of algorithms trained on them. To harness\nsynthetic histopathological images to cope with this challenge, it is essential\nnot only to produce photorealistic images but also to be able to exert control\nover the cellular characteristics they depict. Previous studies used methods to\ngenerate, from random noise, semantic masks that captured the spatial\ndistribution of the tissue. These masks were then used as a prior for\nconditional generative approaches to produce photorealistic histopathological\nimages. However, as with many other generative models, this solution exhibits\nmode collapse as the model fails to capture the full diversity of the\nunderlying data distribution. In this work, we present a pipeline, coined\nPriorPath, that generates detailed, realistic, semantic masks derived from\ncoarse-grained images delineating tissue regions. This approach enables control\nover the spatial arrangement of the generated masks and, consequently, the\nresulting synthetic images. We demonstrated the efficacy of our method across\nthree cancer types, skin, prostate, and lung, showcasing PriorPath's capability\nto cover the semantic mask space and to provide better similarity to real masks\ncompared to previous methods. 
Our approach allows for specifying desired tissue\ndistributions and obtaining both photorealistic masks and images within a\nsingle platform, thus providing a state-of-the-art, controllable solution for\ngenerating histopathological images to facilitate AI for computational\npathology.\n","authors":["Nati Daniel","May Nathan","Eden Azeroual","Yael Fisher","Yonatan Savir"],"pdf_url":"https://arxiv.org/pdf/2411.16515v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.01525v2","updated":"2024-12-03T07:43:55Z","published":"2024-12-02T14:18:17Z","title":"Take Your Steps: Hierarchically Efficient Pulmonary Disease Screening\n via CT Volume Compression","summary":" Deep learning models are widely used to process Computed Tomography (CT) data\nin the automated screening of pulmonary diseases, significantly reducing the\nworkload of physicians. However, the three-dimensional nature of CT volumes\ninvolves an excessive number of voxels, which significantly increases the\ncomplexity of model processing. Previous screening approaches often overlook\nthis issue, which undoubtedly reduces screening efficiency. Towards efficient\nand effective screening, we design a hierarchical approach to reduce the\ncomputational cost of pulmonary disease screening. The new approach\nre-organizes the screening workflows into three steps. First, we propose a\nComputed Tomography Volume Compression (CTVC) method to select a small slice\nsubset that comprehensively represents the whole CT volume. Second, the\nselected CT slices are used to detect pulmonary diseases coarsely via a\nlightweight classification model. Third, an uncertainty measurement strategy is\napplied to identify samples with low diagnostic confidence, which are\nre-detected by radiologists. Experiments on two public pulmonary disease\ndatasets demonstrate that our approach achieves comparable accuracy and recall\nwhile reducing the time by 50%-70% compared with the counterparts using full CT\nvolumes. 
Besides, we also found that our approach outperforms previous\ncutting-edge CTVC methods in retaining important indications after compression.\n","authors":["Qian Shao","Kai Zhang","Bang Du","Zepeng Li","Yixuan Wu","Qiyuan Chen","Jian Wu","Jintai Chen","Honghao Gao","Hongxia Xu"],"pdf_url":"https://arxiv.org/pdf/2412.01525v2.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2409.15514v2","updated":"2024-12-03T07:32:10Z","published":"2024-09-23T20:04:29Z","title":"SpaGBOL: Spatial-Graph-Based Orientated Localisation","summary":" Cross-View Geo-Localisation within urban regions is challenging in part due\nto the lack of geo-spatial structuring within current datasets and techniques.\nWe propose utilising graph representations to model sequences of local\nobservations and the connectivity of the target location. Modelling as a graph\nenables generating previously unseen sequences by sampling with new parameter\nconfigurations. To leverage this newly available information, we propose a\nGNN-based architecture, producing spatially strong embeddings and improving\ndiscriminability over isolated image embeddings. We outline SpaGBOL,\nintroducing three novel contributions. 1) The first graph-structured dataset\nfor Cross-View Geo-Localisation, containing multiple streetview images per node\nto improve generalisation. 2) Introducing GNNs to the problem, we develop the\nfirst system that exploits the correlation between node proximity and feature\nsimilarity. 3) Leveraging the unique properties of the graph representation -\nwe demonstrate a novel retrieval filtering approach based on neighbourhood\nbearings. 
SpaGBOL achieves state-of-the-art accuracies on the unseen test graph\n- with relative Top-1 retrieval improvements on previous techniques of 11%, and\n50% when filtering with Bearing Vector Matching on the SpaGBOL dataset.\n","authors":["Tavis Shore","Oscar Mendez","Simon Hadfield"],"pdf_url":"https://arxiv.org/pdf/2409.15514v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02225v1","updated":"2024-12-03T07:31:54Z","published":"2024-12-03T07:31:54Z","title":"How to Use Diffusion Priors under Sparse Views?","summary":" Novel view synthesis under sparse views has been a long-term important\nchallenge in 3D reconstruction. Existing works mainly rely on introducing\nexternal semantic or depth priors to supervise the optimization of 3D\nrepresentations. However, the diffusion model, as an external prior that can\ndirectly provide visual supervision, has always underperformed in sparse-view\n3D reconstruction using Score Distillation Sampling (SDS) due to the low\ninformation entropy of sparse views compared to text, leading to optimization\nchallenges caused by mode deviation. To this end, we present a thorough\nanalysis of SDS from the mode-seeking perspective and propose Inline Prior\nGuided Score Matching (IPSM), which leverages visual inline priors provided by\npose relationships between viewpoints to rectify the rendered image\ndistribution and decomposes the original optimization objective of SDS, thereby\noffering effective diffusion visual guidance without any fine-tuning or\npre-training. Furthermore, we propose the IPSM-Gaussian pipeline, which adopts\n3D Gaussian Splatting as the backbone and supplements depth and geometry\nconsistency regularization based on IPSM to further improve inline priors and\nrectified distribution. Experimental results on different public datasets show\nthat our method achieves state-of-the-art reconstruction quality. 
The code is\nreleased at https://github.com/iCVTEAM/IPSM.\n","authors":["Qisen Wang","Yifan Zhao","Jiawei Ma","Jia Li"],"pdf_url":"https://arxiv.org/pdf/2412.02225v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02220v1","updated":"2024-12-03T07:25:30Z","published":"2024-12-03T07:25:30Z","title":"Unlocking Tuning-Free Few-Shot Adaptability in Visual Foundation Models\n by Recycling Pre-Tuned LoRAs","summary":" Large Language Models (LLMs) such as ChatGPT demonstrate strong few-shot\nadaptability without requiring fine-tuning, positioning them ideal for\ndata-limited and real-time applications. However, this adaptability has not yet\nbeen replicated in current Visual Foundation Models (VFMs), which require\nexplicit fine-tuning with sufficient tuning data. Besides, the\npretraining-finetuning paradigm has led to the surge of numerous task-specific\nmodular components, such as Low-Rank Adaptation (LoRA). For the first time, we\nexplore the potential of reusing diverse pre-tuned LoRAs without accessing\ntheir original training data, to achieve tuning-free few-shot adaptation in\nVFMs. Our framework, LoRA Recycle, distills a meta-LoRA from diverse pre-tuned\nLoRAs with a meta-learning objective, using surrogate data generated inversely\nfrom pre-tuned LoRAs themselves. The VFM, once equipped with the meta-LoRA, is\nempowered to solve new few-shot tasks in a single forward pass, akin to the\nin-context learning of LLMs. 
Additionally, we incorporate a double-efficient\nmechanism tailored to our framework, significantly accelerating the\nmeta-training process while maintaining or even improving performance.\nExtensive experiments across various few-shot classification benchmarks across\nboth in- and cross-domain scenarios demonstrate the superiority of our\nframework.\n","authors":["Zixuan Hu","Yongxian Wei","Li Shen","Chun Yuan","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2412.02220v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.16794v2","updated":"2024-12-03T07:24:18Z","published":"2024-11-25T09:22:42Z","title":"Phase-Informed Tool Segmentation for Manual Small-Incision Cataract\n Surgery","summary":" Cataract surgery is the most common surgical procedure globally, with a\ndisproportionately higher burden in developing countries. While automated\nsurgical video analysis has been explored in general surgery, its application\nto ophthalmic procedures remains limited. Existing works primarily focus on\nPhaco cataract surgery, an expensive technique not accessible in regions where\ncataract treatment is most needed. In contrast, Manual Small-Incision Cataract\nSurgery (MSICS) is the preferred low-cost, faster alternative in high-volume\nsettings and for challenging cases. However, no dataset exists for MSICS. To\naddress this gap, we introduce Sankara-MSICS, the first comprehensive dataset\ncontaining 53 surgical videos annotated for 18 surgical phases and 3,527 frames\nwith 13 surgical tools at the pixel level. We benchmark this dataset on\nstate-of-the-art models and present ToolSeg, a novel framework that enhances\ntool segmentation by introducing a phase-conditional decoder and a simple yet\neffective semi-supervised setup leveraging pseudo-labels from foundation\nmodels. Our approach significantly improves segmentation performance, achieving\na $23.77\\%$ to $38.10\\%$ increase in mean Dice scores, with a notable boost for\ntools that are less prevalent and small. 
Furthermore, we demonstrate that\nToolSeg generalizes to other surgical settings, showcasing its effectiveness on\nthe CaDIS dataset.\n","authors":["Bhuvan Sachdeva","Naren Akash","Tajamul Ashraf","Simon Mueller","Thomas Schultz","Maximilian W. M. Wintergerst","Niharika Singri Prasad","Kaushik Murali","Mohit Jain"],"pdf_url":"https://arxiv.org/pdf/2411.16794v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.00156v2","updated":"2024-12-03T07:18:25Z","published":"2024-11-29T08:10:49Z","title":"VISION-XL: High Definition Video Inverse Problem Solver using Latent\n Image Diffusion Models","summary":" In this paper, we propose a novel framework for solving high-definition video\ninverse problems using latent image diffusion models. Building on recent\nadvancements in spatio-temporal optimization for video inverse problems using\nimage diffusion models, our approach leverages latent-space diffusion models to\nachieve enhanced video quality and resolution. To address the high\ncomputational demands of processing high-resolution frames, we introduce a\npseudo-batch consistent sampling strategy, allowing efficient operation on a\nsingle GPU. Additionally, to improve temporal consistency, we present\nbatch-consistent inversion, an initialization technique that incorporates\ninformative latents from the measurement frame. By integrating with SDXL, our\nframework achieves state-of-the-art video reconstruction across a wide range of\nspatio-temporal inverse problems, including complex combinations of frame\naveraging and various spatial degradations, such as deblurring,\nsuper-resolution, and inpainting. 
Unlike previous methods, our approach\nsupports multiple aspect ratios (landscape, vertical, and square) and delivers\nHD-resolution reconstructions (exceeding 1280x720) in under 2.5 minutes on a\nsingle NVIDIA 4090 GPU.\n","authors":["Taesung Kwon","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2412.00156v2.pdf","comment":"Project page: https://vision-xl.github.io/"},{"id":"http://arxiv.org/abs/2412.00473v2","updated":"2024-12-03T07:13:51Z","published":"2024-11-30T13:21:15Z","title":"Jailbreak Large Vision-Language Models Through Multi-Modal Linkage","summary":" With the significant advancement of Large Vision-Language Models (VLMs),\nconcerns about their potential misuse and abuse have grown rapidly. Previous\nstudies have highlighted VLMs' vulnerability to jailbreak attacks, where\ncarefully crafted inputs can lead the model to produce content that violates\nethical and legal standards. However, existing methods struggle against\nstate-of-the-art VLMs like GPT-4o, due to the over-exposure of harmful content\nand lack of stealthy malicious guidance. In this work, we propose a novel\njailbreak attack framework: Multi-Modal Linkage (MML) Attack. Drawing\ninspiration from cryptography, MML utilizes an encryption-decryption process\nacross text and image modalities to mitigate over-exposure of malicious\ninformation. To align the model's output with malicious intent covertly, MML\nemploys a technique called \"evil alignment\", framing the attack within a video\ngame production scenario. Comprehensive experiments demonstrate MML's\neffectiveness. Specifically, MML jailbreaks GPT-4o with attack success rates of\n97.80% on SafeBench, 98.81% on MM-SafeBench and 99.07% on HADES-Dataset. 
Our\ncode is available at https://github.com/wangyu-ovo/MML\n","authors":["Yu Wang","Xiaofei Zhou","Yichen Wang","Geyuan Zhang","Tianxing He"],"pdf_url":"https://arxiv.org/pdf/2412.00473v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02214v1","updated":"2024-12-03T07:05:39Z","published":"2024-12-03T07:05:39Z","title":"GIST: Towards Photorealistic Style Transfer via Multiscale Geometric\n Representations","summary":" State-of-the-art Style Transfer methods often leverage pre-trained encoders\noptimized for discriminative tasks, which may not be ideal for image synthesis.\nThis can result in significant artifacts and loss of photorealism. Motivated by\nthe ability of multiscale geometric image representations to capture\nfine-grained details and global structure, we propose GIST: Geometric-based\nImage Style Transfer, a novel Style Transfer technique that exploits the\ngeometric properties of content and style images. GIST replaces the standard\nNeural Style Transfer autoencoding framework with a multiscale image expansion,\npreserving scene details without the need for post-processing or training. Our\nmethod matches multiresolution and multidirectional representations such as\nWavelets and Contourlets by solving an optimal transport problem, leading to an\nefficient texture transferring. Experiments show that GIST is on-par or\noutperforms recent photorealistic Style Transfer approaches while significantly\nreducing the processing time with no model training.\n","authors":["Renan A. Rojas-Gomez","Minh N. Do"],"pdf_url":"https://arxiv.org/pdf/2412.02214v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02210v1","updated":"2024-12-03T07:03:25Z","published":"2024-12-03T07:03:25Z","title":"CC-OCR: A Comprehensive and Challenging OCR Benchmark for Evaluating\n Large Multimodal Models in Literacy","summary":" Large Multimodal Models (LMMs) have demonstrated impressive performance on\nrecognizing document images with natural language instructions. 
However, it\nremains unclear to what extent capabilities in literacy with rich structure and\nfine-grained visual challenges. The current landscape lacks a comprehensive\nbenchmark to effectively measure the literate capabilities of LMMs. Existing\nbenchmarks are often limited by narrow scenarios and specified tasks. To this\nend, we introduce CC-OCR, a comprehensive benchmark that possess a diverse\nrange of scenarios, tasks, and challenges. CC-OCR comprises four OCR-centric\ntracks: multi-scene text reading, multilingual text reading, document parsing,\nand key information extraction. It includes 39 subsets with 7,058 full\nannotated images, of which 41% are sourced from real applications, being\nreleased for the first time. Furthermore, we evaluate nine prominent LMMs and\nreveal both the strengths and weaknesses of these models, particularly in text\ngrounding, multi-orientation, and hallucination of repetition. CC-OCR aims to\ncomprehensively evaluate the capabilities of LMMs on OCR-centered tasks,\ndriving advancement in LMMs.\n","authors":["Zhibo Yang","Jun Tang","Zhaohai Li","Pengfei Wang","Jianqiang Wan","Humen Zhong","Xuejing Liu","Mingkun Yang","Peng Wang","Yuliang Liu","LianWen Jin","Xiang Bai","Shuai Bai","Junyang Lin"],"pdf_url":"https://arxiv.org/pdf/2412.02210v1.pdf","comment":"11 pages, 4 figures; The code and data will be publicly available as\n soon as possible"},{"id":"http://arxiv.org/abs/2407.01003v4","updated":"2024-12-03T06:43:25Z","published":"2024-07-01T06:35:53Z","title":"Embedded Prompt Tuning: Towards Enhanced Calibration of Pretrained\n Models for Medical Images","summary":" Foundation models pre-trained on large-scale data have been widely witnessed\nto achieve success in various natural imaging downstream tasks.\nParameter-efficient fine-tuning (PEFT) methods aim to adapt foundation models\nto new domains by updating only a small portion of parameters in order to\nreduce computational overhead. 
However, the effectiveness of these PEFT\nmethods, especially in cross-domain few-shot scenarios, e.g., medical image\nanalysis, has not been fully explored. In this work, we facilitate the study of\nthe performance of PEFT when adapting foundation models to medical image\nclassification tasks. Furthermore, to alleviate the limitations of prompt\nintroducing ways and approximation capabilities on Transformer architectures of\nmainstream prompt tuning methods, we propose the Embedded Prompt Tuning (EPT)\nmethod by embedding prompt tokens into the expanded channels. We also find that\nthere are anomalies in the feature space distribution of foundation models\nduring pre-training process, and prompt tuning can help mitigate this negative\nimpact. To explain this phenomenon, we also introduce a novel perspective to\nunderstand prompt tuning: Prompt tuning is a distribution calibrator. And we\nsupport it by analyzing patch-wise scaling and feature separation operations\ncontained in EPT. Our experiments show that EPT outperforms several\nstate-of-the-art fine-tuning methods by a significant margin on few-shot\nmedical image classification tasks, and completes the fine-tuning process\nwithin highly competitive time, indicating EPT is an effective PEFT method. The\nsource code is available at github.com/zuwenqiang/EPT.\n","authors":["Wenqiang Zu","Shenghao Xie","Qing Zhao","Guoqi Li","Lei Ma"],"pdf_url":"https://arxiv.org/pdf/2407.01003v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02202v1","updated":"2024-12-03T06:31:25Z","published":"2024-12-03T06:31:25Z","title":"3D representation in 512-Byte:Variational tokenizer is the key for\n autoregressive 3D generation","summary":" Autoregressive transformers have revolutionized high-fidelity image\ngeneration. One crucial ingredient lies in the tokenizer, which compresses\nhigh-resolution image patches into manageable discrete tokens with a scanning\nor hierarchical order suitable for large language models. 
Extending these\ntokenizers to 3D generation, however, presents a significant challenge: unlike\nimage patches that naturally exhibit spatial sequence and multi-scale\nrelationships, 3D data lacks an inherent order, making it difficult to compress\ninto fewer tokens while preserving structural details. To address this, we\nintroduce the Variational Tokenizer (VAT), which transforms unordered 3D data\ninto compact latent tokens with an implicit hierarchy, suited for efficient and\nhigh-fidelity coarse-to-fine autoregressive modeling. VAT begins with an\nin-context transformer, which compress numerous unordered 3D features into a\nreduced token set with minimal information loss. This latent space is then\nmapped to a Gaussian distribution for residual quantization, with token counts\nprogressively increasing across scales. In this way, tokens at different scales\nnaturally establish the interconnections by allocating themselves into\ndifferent subspaces within the same Gaussian distribution, facilitating\ndiscrete modeling of token relationships across scales. During the decoding\nphase, a high-resolution triplane is utilized to convert these compact latent\ntokens into detailed 3D shapes. Extensive experiments demonstrate that VAT\nenables scalable and efficient 3D generation, outperforming existing methods in\nquality, efficiency, and generalization. Remarkably, VAT achieves up to a 250x\ncompression, reducing a 1MB mesh to just 3.9KB with a 96% F-score, and can\nfurther compress to 256 int8 tokens, achieving a 2000x reduction while\nmaintaining a 92% F-score.\n","authors":["Jinzhi Zhang","Feng Xiong","Mu Xu"],"pdf_url":"https://arxiv.org/pdf/2412.02202v1.pdf","comment":"22 pages, 21 figures"},{"id":"http://arxiv.org/abs/2412.02198v1","updated":"2024-12-03T06:23:35Z","published":"2024-12-03T06:23:35Z","title":"Transformer-Metric Loss for CNN-Based Face Recognition","summary":" In deep learning, the loss function plays a crucial role in optimizing the\nnetwork. 
Many recent innovations in loss techniques have been made, and various\nmargin-based angular loss functions (metric loss) have been designed\nparticularly for face recognition. The concept of transformers is already\nwell-researched and applied in many facets of machine vision. This paper\npresents a technique for loss evaluation that uses a transformer network as an\nadditive loss in the face recognition domain. The standard metric loss function\ntypically takes the final embedding of the main CNN backbone as its input.\nHere, we employ a transformer-metric loss, a combined approach that integrates\nboth transformer-loss and metric-loss. This research intends to analyze the\ntransformer behavior on the convolution output when the CNN outcome is arranged\nin a sequential vector. The transformer encoder takes input from the contextual\nvectors obtained from the final convolution layer of the network. With this\ntechnique, we use transformer loss with various base metric-loss functions to\nevaluate the effect of the combined loss functions. We observe that such a\nconfiguration allows the network to achieve SoTA results on various validation\ndatasets with some limitations. This research expands the role of transformers\nin the machine vision domain and opens new possibilities for exploring\ntransformers as a loss function.\n","authors":["Pritesh Prakash","Ashish Jacob Sam"],"pdf_url":"https://arxiv.org/pdf/2412.02198v1.pdf","comment":"Face Recognition using Transformer Loss"},{"id":"http://arxiv.org/abs/2412.02197v1","updated":"2024-12-03T06:23:19Z","published":"2024-12-03T06:23:19Z","title":"Cascaded Multi-Scale Attention for Enhanced Multi-Scale Feature\n Extraction and Interaction with Low-Resolution Images","summary":" In real-world applications of image recognition tasks, such as human pose\nestimation, cameras often capture objects, like human bodies, at low\nresolutions. 
This scenario poses a challenge in extracting and leveraging\nmulti-scale features, which is often essential for precise inference. To\naddress this challenge, we propose a new attention mechanism, named cascaded\nmulti-scale attention (CMSA), tailored for use in CNN-ViT hybrid architectures,\nto handle low-resolution inputs effectively. The design of CMSA enables the\nextraction and seamless integration of features across various scales without\nnecessitating the downsampling of the input image or feature maps. This is\nachieved through a novel combination of grouped multi-head self-attention\nmechanisms with window-based local attention and cascaded fusion of multi-scale\nfeatures over different scales. This architecture allows for the effective\nhandling of features across different scales, enhancing the model's ability to\nperform tasks such as human pose estimation, head pose estimation, and more\nwith low-resolution images. Our experimental results show that the proposed\nmethod outperforms existing state-of-the-art methods in these areas with fewer\nparameters, showcasing its potential for broad application in real-world\nscenarios where capturing high-resolution images is not feasible. Code is\navailable at https://github.com/xyongLu/CMSA.\n","authors":["Xiangyong Lu","Masanori Suganuma","Takayuki Okatani"],"pdf_url":"https://arxiv.org/pdf/2412.02197v1.pdf","comment":"9 pages, 4 figures, 5 tables. The paper is under consideration at\n Computer Vision and Image Understanding"},{"id":"http://arxiv.org/abs/2412.02193v1","updated":"2024-12-03T06:15:04Z","published":"2024-12-03T06:15:04Z","title":"LayoutVLM: Differentiable Optimization of 3D Layout via Vision-Language\n Models","summary":" Open-universe 3D layout generation arranges unlabeled 3D assets conditioned\non language instruction. Large language models (LLMs) struggle with generating\nphysically plausible 3D scenes and adherence to input instructions,\nparticularly in cluttered scenes. 
We introduce LayoutVLM, a framework and scene\nlayout representation that exploits the semantic knowledge of Vision-Language\nModels (VLMs) and supports differentiable optimization to ensure physical\nplausibility. LayoutVLM employs VLMs to generate two mutually reinforcing\nrepresentations from visually marked images, and a self-consistent decoding\nprocess to improve VLMs spatial planning. Our experiments show that LayoutVLM\naddresses the limitations of existing LLM and constraint-based approaches,\nproducing physically plausible 3D layouts better aligned with the semantic\nintent of input language instructions. We also demonstrate that fine-tuning\nVLMs with the proposed scene layout representation extracted from existing\nscene datasets can improve performance.\n","authors":["Fan-Yun Sun","Weiyu Liu","Siyi Gu","Dylan Lim","Goutam Bhat","Federico Tombari","Manling Li","Nick Haber","Jiajun Wu"],"pdf_url":"https://arxiv.org/pdf/2412.02193v1.pdf","comment":"project website: https://ai.stanford.edu/~sunfanyun/layoutvlm/"},{"id":"http://arxiv.org/abs/2412.02186v1","updated":"2024-12-03T05:54:43Z","published":"2024-12-03T05:54:43Z","title":"VideoICL: Confidence-based Iterative In-context Learning for\n Out-of-Distribution Video Understanding","summary":" Recent advancements in video large multimodal models (LMMs) have\nsignificantly improved their video understanding and reasoning capabilities.\nHowever, their performance drops on out-of-distribution (OOD) tasks that are\nunderrepresented in training data. Traditional methods like fine-tuning on OOD\ndatasets are impractical due to high computational costs. While In-context\nlearning (ICL) with demonstration examples has shown promising generalization\nperformance in language tasks and image-language tasks without fine-tuning,\napplying ICL to video-language tasks faces challenges due to the limited\ncontext length in Video LMMs, as videos require longer token lengths. 
To\naddress these issues, we propose VideoICL, a novel video in-context learning\nframework for OOD tasks that introduces a similarity-based relevant example\nselection strategy and a confidence-based iterative inference approach. This\nallows to select the most relevant examples and rank them based on similarity,\nto be used for inference. If the generated response has low confidence, our\nframework selects new examples and performs inference again, iteratively\nrefining the results until a high-confidence response is obtained. This\napproach improves OOD video understanding performance by extending effective\ncontext length without incurring high costs. The experimental results on\nmultiple benchmarks demonstrate significant performance gains, especially in\ndomain-specific scenarios, laying the groundwork for broader video\ncomprehension applications. Code will be released at\nhttps://github.com/KangsanKim07/VideoICL\n","authors":["Kangsan Kim","Geon Park","Youngwan Lee","Woongyeong Yeo","Sung Ju Hwang"],"pdf_url":"https://arxiv.org/pdf/2412.02186v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15660v2","updated":"2024-12-03T05:45:23Z","published":"2023-08-29T22:43:46Z","title":"CamoFA: A Learnable Fourier-based Augmentation for Camouflage\n Segmentation","summary":" Camouflaged object detection (COD) and camouflaged instance segmentation\n(CIS) aim to recognize and segment objects that are blended into their\nsurroundings, respectively. While several deep neural network models have been\nproposed to tackle those tasks, augmentation methods for COD and CIS have not\nbeen thoroughly explored. Augmentation strategies can help improve models'\nperformance by increasing the size and diversity of the training data and\nexposing the model to a wider range of variations in the data. 
Besides, we aim\nto automatically learn transformations that help to reveal the underlying\nstructure of camouflaged objects and allow the model to learn to better\nidentify and segment camouflaged objects. To achieve this, we propose a\nlearnable augmentation method in the frequency domain for COD and CIS via the\nFourier transform approach, dubbed CamoFA. Our method leverages a conditional\ngenerative adversarial network and cross-attention mechanism to generate a\nreference image and an adaptive hybrid swapping with parameters to mix the\nlow-frequency component of the reference image and the high-frequency component\nof the input image. This approach aims to make camouflaged objects more visible\nfor detection and segmentation models. Without bells and whistles, our proposed\naugmentation method boosts the performance of camouflaged object detectors and\ninstance segmenters by large margins.\n","authors":["Minh-Quan Le","Minh-Triet Tran","Trung-Nghia Le","Tam V. Nguyen","Thanh-Toan Do"],"pdf_url":"https://arxiv.org/pdf/2308.15660v2.pdf","comment":"Accepted to WACV 2025"},{"id":"http://arxiv.org/abs/2412.01471v2","updated":"2024-12-03T05:42:22Z","published":"2024-12-02T13:17:41Z","title":"Multi-Granularity Video Object Segmentation","summary":" Current benchmarks for video segmentation are limited to annotating only\nsalient objects (i.e., foreground instances). Despite their impressive\narchitectural designs, previous works trained on these benchmarks have\nstruggled to adapt to real-world scenarios. Thus, developing a new video\nsegmentation dataset aimed at tracking multi-granularity segmentation target in\nthe video scene is necessary. In this work, we aim to generate\nmulti-granularity video segmentation dataset that is annotated for both salient\nand non-salient masks. 
To achieve this, we propose a large-scale, densely\nannotated multi-granularity video object segmentation (MUG-VOS) dataset that\nincludes various types and granularities of mask annotations. We automatically\ncollected a training set that assists in tracking both salient and non-salient\nobjects, and we also curated a human-annotated test set for reliable\nevaluation. In addition, we present memory-based mask propagation model (MMPM),\ntrained and evaluated on MUG-VOS dataset, which leads to the best performance\namong the existing video object segmentation methods and Segment SAM-based\nvideo segmentation methods. Project page is available at\nhttps://cvlab-kaist.github.io/MUG-VOS.\n","authors":["Sangbeom Lim","Seongchan Kim","Seungjun An","Seokju Cho","Paul Hongsuck Seo","Seungryong Kim"],"pdf_url":"https://arxiv.org/pdf/2412.01471v2.pdf","comment":"Project Page: https://cvlab-kaist.github.io/MUG-VOS"},{"id":"http://arxiv.org/abs/2411.04480v4","updated":"2024-12-03T05:24:50Z","published":"2024-11-07T07:19:28Z","title":"CFPNet: Improving Lightweight ToF Depth Completion via Cross-zone\n Feature Propagation","summary":" Depth completion using lightweight time-of-flight (ToF) depth sensors is\nattractive due to their low cost. However, lightweight ToF sensors usually have\na limited field of view (FOV) compared with cameras. Thus, only pixels in the\nzone area of the image can be associated with depth signals. Previous methods\nfail to propagate depth features from the zone area to the outside-zone area\neffectively, thus suffering from degraded depth completion performance outside\nthe zone. To this end, this paper proposes the CFPNet to achieve cross-zone\nfeature propagation from the zone area to the outside-zone area with two novel\nmodules. The first is a direct-attention-based propagation module (DAPM), which\nenforces direct cross-zone feature acquisition. 
The second is a\nlarge-kernel-based propagation module (LKPM), which realizes cross-zone feature\npropagation by utilizing convolution layers with kernel sizes up to 31. CFPNet\nachieves state-of-the-art (SOTA) depth completion performance by combining\nthese two modules properly, as verified by extensive experimental results on\nthe ZJU-L5 dataset. The code is available at\nhttps://github.com/denyingmxd/CFPNet.\n","authors":["Laiyan Ding","Hualie Jiang","Rui Xu","Rui Huang"],"pdf_url":"https://arxiv.org/pdf/2411.04480v4.pdf","comment":"Accepted by 3DV 2025"},{"id":"http://arxiv.org/abs/2412.02177v1","updated":"2024-12-03T05:21:42Z","published":"2024-12-03T05:21:42Z","title":"Anatomically-Grounded Fact Checking of Automated Chest X-ray Reports","summary":" With the emergence of large-scale vision-language models, realistic radiology\nreports may be generated using only medical images as input guided by simple\nprompts. However, their practical utility has been limited due to the factual\nerrors in their description of findings. In this paper, we propose a novel\nmodel for explainable fact-checking that identifies errors in findings and\ntheir locations indicated through the reports. Specifically, we analyze the\ntypes of errors made by automated reporting methods and derive a new synthetic\ndataset of images paired with real and fake descriptions of findings and their\nlocations from a ground truth dataset. A new multi-label cross-modal\ncontrastive regression network is then trained on this datsaset. We evaluate\nthe resulting fact-checking model and its utility in correcting reports\ngenerated by several SOTA automated reporting tools on a variety of benchmark\ndatasets with results pointing to over 40\\% improvement in report quality\nthrough such error detection and correction.\n","authors":["R. Mahmood","K. C. L. Wong","D. M. Reyes","N. D'Souza","L. Shi","J. Wu","P. Kaviani","M. Kalra","G. Wang","P. Yan","T. 
Syeda-Mahmood"],"pdf_url":"https://arxiv.org/pdf/2412.02177v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.04041v3","updated":"2024-12-03T05:21:26Z","published":"2024-07-04T16:29:05Z","title":"Towards Cross-View-Consistent Self-Supervised Surround Depth Estimation","summary":" Depth estimation is a cornerstone for autonomous driving, yet acquiring\nper-pixel depth ground truth for supervised learning is challenging.\nSelf-Supervised Surround Depth Estimation (SSSDE) from consecutive images\noffers an economical alternative. While previous SSSDE methods have proposed\ndifferent mechanisms to fuse information across images, few of them explicitly\nconsider the cross-view constraints, leading to inferior performance,\nparticularly in overlapping regions. This paper proposes an efficient and\nconsistent pose estimation design and two loss functions to enhance cross-view\nconsistency for SSSDE. For pose estimation, we propose to use only front-view\nimages to reduce training memory and sustain pose estimation consistency. The\nfirst loss function is the dense depth consistency loss, which penalizes the\ndifference between predicted depths in overlapping regions. The second one is\nthe multi-view reconstruction consistency loss, which aims to maintain\nconsistency between reconstruction from spatial and spatial-temporal contexts.\nAdditionally, we introduce a novel flipping augmentation to improve the\nperformance further. Our techniques enable a simple neural model to achieve\nstate-of-the-art performance on the DDAD and nuScenes datasets. Last but not\nleast, our proposed techniques can be easily applied to other methods. 
The code\nis available at https://github.com/denyingmxd/CVCDepth.\n","authors":["Laiyan Ding","Hualie Jiang","Jie Li","Yongquan Chen","Rui Huang"],"pdf_url":"https://arxiv.org/pdf/2407.04041v3.pdf","comment":"Accepted by IROS2024"},{"id":"http://arxiv.org/abs/2412.02172v1","updated":"2024-12-03T05:04:49Z","published":"2024-12-03T05:04:49Z","title":"VISCO: Benchmarking Fine-Grained Critique and Correction Towards\n Self-Improvement in Visual Reasoning","summary":" The ability of large vision-language models (LVLMs) to critique and correct\ntheir reasoning is an essential building block towards their self-improvement.\nHowever, a systematic analysis of such capabilities in LVLMs is still lacking.\nWe propose VISCO, the first benchmark to extensively analyze the fine-grained\ncritique and correction capabilities of LVLMs. Compared to existing work that\nuses a single scalar value to critique the entire reasoning [4], VISCO features\ndense and fine-grained critique, requiring LVLMs to evaluate the correctness of\neach step in the chain-of-thought and provide natural language explanations to\nsupport their judgments. Extensive evaluation of 24 LVLMs demonstrates that\nhuman-written critiques significantly enhance the performance after correction,\nshowcasing the potential of the self-improvement strategy. However, the\nmodel-generated critiques are less helpful and sometimes detrimental to the\nperformance, suggesting that critique is the crucial bottleneck. We identified\nthree common patterns in critique failures: failure to critique visual\nperception, reluctance to \"say no\", and exaggerated assumption of error\npropagation. To address these issues, we propose an effective LookBack strategy\nthat revisits the image to verify each piece of information in the initial\nreasoning. 
LookBack significantly improves critique and correction performance\nby up to 13.5%.\n","authors":["Xueqing Wu","Yuheng Ding","Bingxuan Li","Pan Lu","Da Yin","Kai-Wei Chang","Nanyun Peng"],"pdf_url":"https://arxiv.org/pdf/2412.02172v1.pdf","comment":"Project: https://visco-benchmark.github.io/"},{"id":"http://arxiv.org/abs/2412.02171v1","updated":"2024-12-03T05:00:26Z","published":"2024-12-03T05:00:26Z","title":"Underload: Defending against Latency Attacks for Object Detectors on\n Edge Devices","summary":" Object detection is a fundamental enabler for many real-time downstream\napplications such as autonomous driving, augmented reality and supply chain\nmanagement. However, the algorithmic backbone of neural networks is brittle to\nimperceptible perturbations in the system inputs, which were generally known as\nmisclassifying attacks. By targeting the real-time processing capability, a new\nclass of latency attacks are reported recently. They exploit new attack\nsurfaces in object detectors by creating a computational bottleneck in the\npost-processing module, that leads to cascading failure and puts the real-time\ndownstream tasks at risks. In this work, we take an initial attempt to defend\nagainst this attack via background-attentive adversarial training that is also\ncognizant of the underlying hardware capabilities. We first draw system-level\nconnections between latency attack and hardware capacity across heterogeneous\nGPU devices. Based on the particular adversarial behaviors, we utilize\nobjectness loss as a proxy and build background attention into the adversarial\ntraining pipeline, and achieve a reasonable balance between clean and robust\naccuracy. 
The extensive experiments demonstrate the defense effectiveness of\nrestoring real-time processing capability from $13$ FPS to $43$ FPS on Jetson\nOrin NX, with a better trade-off between the clean and robust accuracy.\n","authors":["Tianyi Wang","Zichen Wang","Cong Wang","Yuanchao Shu","Ruilong Deng","Peng Cheng","Jiming Chen"],"pdf_url":"https://arxiv.org/pdf/2412.02171v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02168v1","updated":"2024-12-03T04:55:02Z","published":"2024-12-03T04:55:02Z","title":"Generative Photography: Scene-Consistent Camera Control for Realistic\n Text-to-Image Synthesis","summary":" Image generation today can produce somewhat realistic images from text\nprompts. However, if one asks the generator to synthesize a particular camera\nsetting such as creating different fields of view using a 24mm lens versus a\n70mm lens, the generator will not be able to interpret and generate\nscene-consistent images. This limitation not only hinders the adoption of\ngenerative tools in photography applications but also exemplifies a broader\nissue of bridging the gap between the data-driven models and the physical\nworld. In this paper, we introduce the concept of Generative Photography, a\nframework designed to control camera intrinsic settings during content\ngeneration. The core innovation of this work are the concepts of Dimensionality\nLifting and Contrastive Camera Learning, which achieve continuous and\nconsistent transitions for different camera settings. 
Experimental results show\nthat our method produces significantly more scene-consistent photorealistic\nimages than state-of-the-art models such as Stable Diffusion 3 and FLUX.\n","authors":["Yu Yuan","Xijun Wang","Yichen Sheng","Prateek Chennuri","Xingguang Zhang","Stanley Chan"],"pdf_url":"https://arxiv.org/pdf/2412.02168v1.pdf","comment":"Project page: https://generative-photography.github.io/project/"},{"id":"http://arxiv.org/abs/2412.02158v1","updated":"2024-12-03T04:34:23Z","published":"2024-12-03T04:34:23Z","title":"Agri-LLaVA: Knowledge-Infused Large Multimodal Assistant on Agricultural\n Pests and Diseases","summary":" In the general domain, large multimodal models (LMMs) have achieved\nsignificant advancements, yet challenges persist in applying them to specific\nfields, especially agriculture. As the backbone of the global economy,\nagriculture confronts numerous challenges, with pests and diseases being\nparticularly concerning due to their complexity, variability, rapid spread, and\nhigh resistance. This paper specifically addresses these issues. We construct\nthe first multimodal instruction-following dataset in the agricultural domain,\ncovering over 221 types of pests and diseases with approximately 400,000 data\nentries. This dataset aims to explore and address the unique challenges in pest\nand disease control. Based on this dataset, we propose a knowledge-infused\ntraining method to develop Agri-LLaVA, an agricultural multimodal conversation\nsystem. To accelerate progress in this field and inspire more researchers to\nengage, we design a diverse and challenging evaluation benchmark for\nagricultural pests and diseases. Experimental results demonstrate that\nAgri-LLaVA excels in agricultural multimodal conversation and visual\nunderstanding, providing new insights and approaches to address agricultural\npests and diseases. 
By open-sourcing our dataset and model, we aim to promote\nresearch and development in LMMs within the agricultural domain and make\nsignificant contributions to tackle the challenges of agricultural pests and\ndiseases. All resources can be found at https://github.com/Kki2Eve/Agri-LLaVA.\n","authors":["Liqiong Wang","Teng Jin","Jinyu Yang","Ales Leonardis","Fangyi Wang","Feng Zheng"],"pdf_url":"https://arxiv.org/pdf/2412.02158v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.01552v2","updated":"2024-12-03T04:28:06Z","published":"2024-12-02T14:38:26Z","title":"GFreeDet: Exploiting Gaussian Splatting and Foundation Models for\n Model-free Unseen Object Detection in the BOP Challenge 2024","summary":" In this report, we provide the technical details of the submitted method\nGFreeDet, which exploits Gaussian splatting and vision Foundation models for\nthe model-free unseen object Detection track in the BOP 2024 Challenge.\n","authors":["Xingyu Liu","Yingyue Li","Chengxi Li","Gu Wang","Chenyangguang Zhang","Ziqin Huang","Xiangyang Ji"],"pdf_url":"https://arxiv.org/pdf/2412.01552v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.01244v2","updated":"2024-12-03T04:25:48Z","published":"2024-12-02T08:05:39Z","title":"Concept Replacer: Replacing Sensitive Concepts in Diffusion Models via\n Precision Localization","summary":" As large-scale diffusion models continue to advance, they excel at producing\nhigh-quality images but often generate unwanted content, such as sexually\nexplicit or violent content. Existing methods for concept removal generally\nguide the image generation process but can unintentionally modify unrelated\nregions, leading to inconsistencies with the original model. We propose a novel\napproach for targeted concept replacing in diffusion models, enabling specific\nconcepts to be removed without affecting non-target areas. 
Our method\nintroduces a dedicated concept localizer for precisely identifying the target\nconcept during the denoising process, trained with few-shot learning to require\nminimal labeled data. Within the identified region, we introduce a\ntraining-free Dual Prompts Cross-Attention (DPCA) module to substitute the\ntarget concept, ensuring minimal disruption to surrounding content. We evaluate\nour method on concept localization precision and replacement efficiency.\nExperimental results demonstrate that our method achieves superior precision in\nlocalizing target concepts and performs coherent concept replacement with\nminimal impact on non-target areas, outperforming existing approaches.\n","authors":["Lingyun Zhang","Yu Xie","Yanwei Fu","Ping Chen"],"pdf_url":"https://arxiv.org/pdf/2412.01244v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.10374v2","updated":"2024-12-03T04:25:30Z","published":"2024-07-15T00:48:06Z","title":"An Empirical Study of Mamba-based Pedestrian Attribute Recognition","summary":" Current strong pedestrian attribute recognition models are developed based on\nTransformer networks, which are computationally heavy. Recently proposed models\nwith linear complexity (e.g., Mamba) have garnered significant attention and\nhave achieved a good balance between accuracy and computational cost across a\nvariety of visual tasks. Relevant review articles also suggest that while these\nmodels can perform well on some pedestrian attribute recognition datasets, they\nare generally weaker than the corresponding Transformer models. To further tap\ninto the potential of the novel Mamba architecture for PAR tasks, this paper\ndesigns and adapts Mamba into two typical PAR frameworks, i.e., the text-image\nfusion approach and pure vision Mamba multi-label recognition framework. 
It is\nfound that interacting with attribute tags as additional input does not always\nlead to an improvement, specifically, Vim can be enhanced, but VMamba cannot.\nThis paper further designs various hybrid Mamba-Transformer variants and\nconducts thorough experimental validations. These experimental results indicate\nthat simply enhancing Mamba with a Transformer does not always lead to\nperformance improvements but yields better results under certain settings. We\nhope this empirical study can further inspire research in Mamba for PAR, and\neven extend into the domain of multi-label recognition, through the design of\nthese network structures and comprehensive experimentation. The source code of\nthis work will be released at \\url{https://github.com/Event-AHU/OpenPAR}\n","authors":["Xiao Wang","Weizhe Kong","Jiandong Jin","Shiao Wang","Ruichong Gao","Qingchuan Ma","Chenglong Li","Jin Tang"],"pdf_url":"https://arxiv.org/pdf/2407.10374v2.pdf","comment":"In Peer Review"},{"id":"http://arxiv.org/abs/2411.17106v3","updated":"2024-12-03T04:14:09Z","published":"2024-11-26T04:49:42Z","title":"PassionSR: Post-Training Quantization with Adaptive Scale in One-Step\n Diffusion based Image Super-Resolution","summary":" Diffusion-based image super-resolution (SR) models have shown superior\nperformance at the cost of multiple denoising steps. However, even though the\ndenoising step has been reduced to one, they require high computational costs\nand storage requirements, making it difficult for deployment on hardware\ndevices. To address these issues, we propose a novel post-training quantization\napproach with adaptive scale in one-step diffusion (OSD) image SR, PassionSR.\nFirst, we simplify OSD model to two core components, UNet and Variational\nAutoencoder (VAE) by removing the CLIPEncoder. 
Secondly, we propose Learnable\nBoundary Quantizer (LBQ) and Learnable Equivalent Transformation (LET) to\noptimize the quantization process and manipulate activation distributions for\nbetter quantization. Finally, we design a Distributed Quantization Calibration\n(DQC) strategy that stabilizes the training of quantized parameters for rapid\nconvergence. Comprehensive experiments demonstrate that PassionSR with 8-bit\nand 6-bit obtains comparable visual results with full-precision model.\nMoreover, our PassionSR achieves significant advantages over recent leading\nlow-bit quantization methods for image SR. Our code will be at\nhttps://github.com/libozhu03/PassionSR.\n","authors":["Libo Zhu","Jianze Li","Haotong Qin","Wenbo Li","Yulun Zhang","Yong Guo","Xiaokang Yang"],"pdf_url":"https://arxiv.org/pdf/2411.17106v3.pdf","comment":"https://github.com/libozhu03/PassionSR"},{"id":"http://arxiv.org/abs/2412.01004v2","updated":"2024-12-03T04:13:14Z","published":"2024-12-01T23:41:42Z","title":"Adaptive Rank, Reduced Forgetting: Knowledge Retention in Continual\n Learning Vision-Language Models with Dynamic Rank-Selective LoRA","summary":" We investigate whether the pre-trained knowledge of vision-language models\n(VLMs), such as CLIP, can be retained or even enhanced during continual\nlearning (CL) while absorbing knowledge from a data stream. Existing methods\noften rely on additional reference data, isolated components for distribution\nor domain predictions, leading to high training costs, increased inference\ncomplexity, and limited improvement potential for pre-trained models. To\naddress these challenges, we first comprehensively analyze the effects of\nparameter update locations and ranks on downstream adaptation and knowledge\nretention. 
Based on these insights, we propose Dynamic Rank-Selective Low Rank\nAdaptation (LoRA), a universal and efficient CL approach that adaptively\nassigns ranks to LoRA modules based on their relevance to the current data.\nUnlike prior methods, our approach continually enhances the pre-trained VLM by\nretaining both the pre-trained knowledge and the knowledge acquired during CL.\nOur approach eliminates the need for explicit domain or distribution prediction\nand additional reference data, enabling seamless integration of new tasks while\npreserving pre-trained capabilities. It also maintains the original\narchitecture and deployment pipeline of the pre-trained model without incurring\nany additional inference overhead. Extensive experiments and analyses\ndemonstrate that our method outperforms state-of-the-art approaches in\ncontinually absorbing knowledge of downstream tasks while retaining pre-trained\nknowledge.\n","authors":["Haodong Lu","Chongyang Zhao","Jason Xue","Lina Yao","Kristen Moore","Dong Gong"],"pdf_url":"https://arxiv.org/pdf/2412.01004v2.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2412.00759v2","updated":"2024-12-03T04:00:09Z","published":"2024-12-01T10:32:47Z","title":"DyMO: Training-Free Diffusion Model Alignment with Dynamic\n Multi-Objective Scheduling","summary":" Text-to-image diffusion model alignment is critical for improving the\nalignment between the generated images and human preferences. While\ntraining-based methods are constrained by high computational costs and dataset\nrequirements, training-free alignment methods remain underexplored and are\noften limited by inaccurate guidance. We propose a plug-and-play training-free\nalignment method, DyMO, for aligning the generated images and human preferences\nduring inference. 
Apart from text-aware human preference scores, we introduce a\nsemantic alignment objective for enhancing the semantic alignment in the early\nstages of diffusion, relying on the fact that the attention maps are effective\nreflections of the semantics in noisy images. We propose dynamic scheduling of\nmultiple objectives and intermediate recurrent steps to reflect the\nrequirements at different steps. Experiments with diverse pre-trained diffusion\nmodels and metrics demonstrate the effectiveness and robustness of the proposed\nmethod.\n","authors":["Xin Xie","Dong Gong"],"pdf_url":"https://arxiv.org/pdf/2412.00759v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02142v1","updated":"2024-12-03T03:59:03Z","published":"2024-12-03T03:59:03Z","title":"Personalized Multimodal Large Language Models: A Survey","summary":" Multimodal Large Language Models (MLLMs) have become increasingly important\ndue to their state-of-the-art performance and ability to integrate multiple\ndata modalities, such as text, images, and audio, to perform complex tasks with\nhigh accuracy. This paper presents a comprehensive survey on personalized\nmultimodal large language models, focusing on their architecture, training\nmethods, and applications. We propose an intuitive taxonomy for categorizing\nthe techniques used to personalize MLLMs to individual users, and discuss the\ntechniques accordingly. Furthermore, we discuss how such techniques can be\ncombined or adapted when appropriate, highlighting their advantages and\nunderlying rationale. We also provide a succinct summary of personalization\ntasks investigated in existing research, along with the evaluation metrics\ncommonly used. Additionally, we summarize the datasets that are useful for\nbenchmarking personalized MLLMs. 
Finally, we outline critical open challenges.\nThis survey aims to serve as a valuable resource for researchers and\npractitioners seeking to understand and advance the development of personalized\nmultimodal large language models.\n","authors":["Junda Wu","Hanjia Lyu","Yu Xia","Zhehao Zhang","Joe Barrow","Ishita Kumar","Mehrnoosh Mirtaheri","Hongjie Chen","Ryan A. Rossi","Franck Dernoncourt","Tong Yu","Ruiyi Zhang","Jiuxiang Gu","Nesreen K. Ahmed","Yu Wang","Xiang Chen","Hanieh Deilamsalehy","Namyong Park","Sungchul Kim","Huanrui Yang","Subrata Mitra","Zhengmian Hu","Nedim Lipka","Dang Nguyen","Yue Zhao","Jiebo Luo","Julian McAuley"],"pdf_url":"https://arxiv.org/pdf/2412.02142v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05022v4","updated":"2024-12-03T03:57:43Z","published":"2023-08-09T15:38:36Z","title":"Exploring Frequency-Inspired Optimization in Transformer for Efficient\n Single Image Super-Resolution","summary":" Transformer-based methods have exhibited remarkable potential in single image\nsuper-resolution (SISR) by effectively extracting long-range dependencies.\nHowever, most of the current research in this area has prioritized the design\nof transformer blocks to capture global information, while overlooking the\nimportance of incorporating high-frequency priors, which we believe could be\nbeneficial. In our study, we conducted a series of experiments and found that\ntransformer structures are more adept at capturing low-frequency information,\nbut have limited capacity in constructing high-frequency representations when\ncompared to their convolutional counterparts. Our proposed solution, the\ncross-refinement adaptive feature modulation transformer (CRAFT), integrates\nthe strengths of both convolutional and transformer structures. 
It comprises\nthree key components: the high-frequency enhancement residual block (HFERB) for\nextracting high-frequency information, the shift rectangle window attention\nblock (SRWAB) for capturing global information, and the hybrid fusion block\n(HFB) for refining the global representation. To tackle the inherent\nintricacies of transformer structures, we introduce a frequency-guided\npost-training quantization (PTQ) method aimed at enhancing CRAFT's efficiency.\nThese strategies incorporate adaptive dual clipping and boundary refinement. To\nfurther amplify the versatility of our proposed approach, we extend our PTQ\nstrategy to function as a general quantization method for transformer-based\nSISR techniques. Our experimental findings showcase CRAFT's superiority over\ncurrent state-of-the-art methods, both in full-precision and quantization\nscenarios. These results underscore the efficacy and universality of our PTQ\nstrategy. The source code is available at:\nhttps://github.com/AVC2-UESTC/Frequency-Inspired-Optimization-for-EfficientSR.git.\n","authors":["Ao Li","Le Zhang","Yun Liu","Ce Zhu"],"pdf_url":"https://arxiv.org/pdf/2308.05022v4.pdf","comment":"Extended CRAFT, accepted by TPAMI"},{"id":"http://arxiv.org/abs/2412.02141v1","updated":"2024-12-03T03:57:24Z","published":"2024-12-03T03:57:24Z","title":"WSI-LLaVA: A Multimodal Large Language Model for Whole Slide Image","summary":" Recent advancements in computational pathology have produced patch-level\nMulti-modal Large Language Models (MLLMs), but these models are limited by\ntheir inability to analyze whole slide images (WSIs) comprehensively and their\ntendency to bypass crucial morphological features that pathologists rely on for\ndiagnosis. 
To address these challenges, we first introduce WSI-Bench, a\nlarge-scale morphology-aware benchmark containing 180k VQA pairs from 9,850\nWSIs across 30 cancer types, designed to evaluate MLLMs' understanding of\nmorphological characteristics crucial for accurate diagnosis. Building upon\nthis benchmark, we present WSI-LLaVA, a novel framework for gigapixel WSI\nunderstanding that employs a three-stage training approach: WSI-text alignment,\nfeature space alignment, and task-specific instruction tuning. To better assess\nmodel performance in pathological contexts, we develop two specialized WSI\nmetrics: WSI-Precision and WSI-Relevance. Experimental results demonstrate that\nWSI-LLaVA outperforms existing models across all capability dimensions, with a\nsignificant improvement in morphological analysis, establishing a clear\ncorrelation between morphological understanding and diagnostic accuracy.\n","authors":["Yuci Liang","Xinheng Lyu","Meidan Ding","Wenting Chen","Jipeng Zhang","Yuexiang Ren","Xiangjian He","Song Wu","Sen Yang","Xiyue Wang","Xiaohan Xing","Linlin Shen"],"pdf_url":"https://arxiv.org/pdf/2412.02141v1.pdf","comment":"38 pages, 22 figures, 35 tables"},{"id":"http://arxiv.org/abs/2409.18938v2","updated":"2024-12-03T03:56:52Z","published":"2024-09-27T17:38:36Z","title":"From Seconds to Hours: Reviewing MultiModal Large Language Models on\n Comprehensive Long Video Understanding","summary":" The integration of Large Language Models (LLMs) with visual encoders has\nrecently shown promising performance in visual understanding tasks, leveraging\ntheir inherent capability to comprehend and generate human-like text for visual\nreasoning. Given the diverse nature of visual data, MultiModal Large Language\nModels (MM-LLMs) exhibit variations in model designing and training for\nunderstanding images, short videos, and long videos. 
Our paper focuses on the\nsubstantial differences and unique challenges posed by long video understanding\ncompared to static image and short video understanding. Unlike static images,\nshort videos encompass sequential frames with both spatial and within-event\ntemporal information, while long videos consist of multiple events with\nbetween-event and long-term temporal information. In this survey, we aim to\ntrace and summarize the advancements of MM-LLMs from image understanding to\nlong video understanding. We review the differences among various visual\nunderstanding tasks and highlight the challenges in long video understanding,\nincluding more fine-grained spatiotemporal details, dynamic events, and\nlong-term dependencies. We then provide a detailed summary of the advancements\nin MM-LLMs in terms of model design and training methodologies for\nunderstanding long videos. Finally, we compare the performance of existing\nMM-LLMs on video understanding benchmarks of various lengths and discuss\npotential future directions for MM-LLMs in long video understanding.\n","authors":["Heqing Zou","Tianze Luo","Guiyang Xie"," Victor"," Zhang","Fengmao Lv","Guangcong Wang","Junyang Chen","Zhuochen Wang","Hansheng Zhang","Huaijian Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.18938v2.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2412.02140v1","updated":"2024-12-03T03:56:01Z","published":"2024-12-03T03:56:01Z","title":"SparseGrasp: Robotic Grasping via 3D Semantic Gaussian Splatting from\n Sparse Multi-View RGB Images","summary":" Language-guided robotic grasping is a rapidly advancing field where robots\nare instructed using human language to grasp specific objects. 
However,\nexisting methods often depend on dense camera views and struggle to quickly\nupdate scenes, limiting their effectiveness in changeable environments.\n In contrast, we propose SparseGrasp, a novel open-vocabulary robotic grasping\nsystem that operates efficiently with sparse-view RGB images and handles scene\nupdates fastly. Our system builds upon and significantly enhances existing\ncomputer vision modules in robotic learning. Specifically, SparseGrasp utilizes\nDUSt3R to generate a dense point cloud as the initialization for 3D Gaussian\nSplatting (3DGS), maintaining high fidelity even under sparse supervision.\nImportantly, SparseGrasp incorporates semantic awareness from recent vision\nfoundation models. To further improve processing efficiency, we repurpose\nPrincipal Component Analysis (PCA) to compress features from 2D models.\nAdditionally, we introduce a novel render-and-compare strategy that ensures\nrapid scene updates, enabling multi-turn grasping in changeable environments.\n Experimental results show that SparseGrasp significantly outperforms\nstate-of-the-art methods in terms of both speed and adaptability, providing a\nrobust solution for multi-turn grasping in changeable environment.\n","authors":["Junqiu Yu","Xinlin Ren","Yongchong Gu","Haitao Lin","Tianyu Wang","Yi Zhu","Hang Xu","Yu-Gang Jiang","Xiangyang Xue","Yanwei Fu"],"pdf_url":"https://arxiv.org/pdf/2412.02140v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02129v1","updated":"2024-12-03T03:34:18Z","published":"2024-12-03T03:34:18Z","title":"GSOT3D: Towards Generic 3D Single Object Tracking in the Wild","summary":" In this paper, we present a novel benchmark, GSOT3D, that aims at\nfacilitating development of generic 3D single object tracking (SOT) in the\nwild. Specifically, GSOT3D offers 620 sequences with 123K frames, and covers a\nwide selection of 54 object categories. 
Each sequence is offered with multiple\nmodalities, including the point cloud (PC), RGB image, and depth. This allows\nGSOT3D to support various 3D tracking tasks, such as single-modal 3D SOT on PC\nand multi-modal 3D SOT on RGB-PC or RGB-D, and thus greatly broadens research\ndirections for 3D object tracking. To provide highquality per-frame 3D\nannotations, all sequences are labeled manually with multiple rounds of\nmeticulous inspection and refinement. To our best knowledge, GSOT3D is the\nlargest benchmark dedicated to various generic 3D object tracking tasks. To\nunderstand how existing 3D trackers perform and to provide comparisons for\nfuture research on GSOT3D, we assess eight representative point cloud-based\ntracking models. Our evaluation results exhibit that these models heavily\ndegrade on GSOT3D, and more efforts are required for robust and generic 3D\nobject tracking. Besides, to encourage future research, we present a simple yet\neffective generic 3D tracker, named PROT3D, that localizes the target object\nvia a progressive spatial-temporal network and outperforms all current\nsolutions by a large margin. By releasing GSOT3D, we expect to advance further\n3D tracking in future research and applications. Our benchmark and model as\nwell as the evaluation results will be publicly released at our webpage\nhttps://github.com/ailovejinx/GSOT3D.\n","authors":["Yifan Jiao","Yunhao Li","Junhua Ding","Qing Yang","Song Fu","Heng Fan","Libo Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.02129v1.pdf","comment":"14 pages, 12 figures"},{"id":"http://arxiv.org/abs/2412.00051v2","updated":"2024-12-03T03:33:50Z","published":"2024-11-24T06:39:06Z","title":"TransFair: Transferring Fairness from Ocular Disease Classification to\n Progression Prediction","summary":" The use of artificial intelligence (AI) in automated disease classification\nsignificantly reduces healthcare costs and improves the accessibility of\nservices. 
However, this transformation has given rise to concerns about the\nfairness of AI, which disproportionately affects certain groups, particularly\npatients from underprivileged populations. Recently, a number of methods and\nlarge-scale datasets have been proposed to address group performance\ndisparities. Although these methods have shown effectiveness in disease\nclassification tasks, they may fall short in ensuring fair prediction of\ndisease progression, mainly because of limited longitudinal data with diverse\ndemographics available for training a robust and equitable prediction model. In\nthis paper, we introduce TransFair to enhance demographic fairness in\nprogression prediction for ocular diseases. TransFair aims to transfer a\nfairness-enhanced disease classification model to the task of progression\nprediction with fairness preserved. Specifically, we train a fair EfficientNet,\ntermed FairEN, equipped with a fairness-aware attention mechanism using\nextensive data for ocular disease classification. Subsequently, this fair\nclassification model is adapted to a fair progression prediction model through\nknowledge distillation, which aims to minimize the latent feature distances\nbetween the classification and progression prediction models. 
We evaluate\nFairEN and TransFair for fairness-enhanced ocular disease classification and\nprogression prediction using both two-dimensional (2D) and 3D retinal images.\nExtensive experiments and comparisons with models with and without considering\nfairness learning show that TransFair effectively enhances demographic equity\nin predicting ocular disease progression.\n","authors":["Leila Gheisi","Henry Chu","Raju Gottumukkala","Yan Luo","Xingquan Zhu","Mengyu Wang","Min Shi"],"pdf_url":"https://arxiv.org/pdf/2412.00051v2.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2412.01027v2","updated":"2024-12-03T03:32:00Z","published":"2024-12-02T01:19:21Z","title":"Unleashing In-context Learning of Autoregressive Models for Few-shot\n Image Manipulation","summary":" Text-guided image manipulation has experienced notable advancement in recent\nyears. In order to mitigate linguistic ambiguity, few-shot learning with visual\nexamples has been applied for instructions that are underrepresented in the\ntraining set, or difficult to describe purely in language. However, learning\nfrom visual prompts requires strong reasoning capability, which diffusion\nmodels are struggling with. To address this issue, we introduce a novel\nmulti-modal autoregressive model, dubbed $\\textbf{InstaManip}$, that can\n$\\textbf{insta}$ntly learn a new image $\\textbf{manip}$ulation operation from\ntextual and visual guidance via in-context learning, and apply it to new query\nimages. Specifically, we propose an innovative group self-attention mechanism\nto break down the in-context learning process into two separate stages --\nlearning and applying, which simplifies the complex problem into two easier\ntasks. We also introduce a relation regularization method to further\ndisentangle image transformation features from irrelevant contents in exemplar\nimages. 
Extensive experiments suggest that our method surpasses previous\nfew-shot image manipulation models by a notable margin ($\\geq$19% in human\nevaluation). We also find our model can be further boosted by increasing the\nnumber or diversity of exemplar images.\n","authors":["Bolin Lai","Felix Juefei-Xu","Miao Liu","Xiaoliang Dai","Nikhil Mehta","Chenguang Zhu","Zeyi Huang","James M. Rehg","Sangmin Lee","Ning Zhang","Tong Xiao"],"pdf_url":"https://arxiv.org/pdf/2412.01027v2.pdf","comment":"18 pages, 16 figures, 5 tables"}],"Artificial Intelligence":[{"id":"http://arxiv.org/abs/2412.02698v1","updated":"2024-12-03T18:59:51Z","published":"2024-12-03T18:59:51Z","title":"Scaling BERT Models for Turkish Automatic Punctuation and Capitalization\n Correction","summary":" This paper investigates the effectiveness of BERT based models for automated\npunctuation and capitalization corrections in Turkish texts across five\ndistinct model sizes. The models are designated as Tiny, Mini, Small, Medium,\nand Base. The design and capabilities of each model are tailored to address the\nspecific challenges of the Turkish language, with a focus on optimizing\nperformance while minimizing computational overhead. The study presents a\nsystematic comparison of the performance metrics precision, recall, and F1\nscore of each model, offering insights into their applicability in diverse\noperational contexts. The results demonstrate a significant improvement in text\nreadability and accuracy as model size increases, with the Base model achieving\nthe highest correction precision. 
This research provides a comprehensive guide\nfor selecting the appropriate model size based on specific user needs and\ncomputational resources, establishing a framework for deploying these models in\nreal-world applications to enhance the quality of written Turkish.\n","authors":["Abdulkader Saoud","Mahmut Alomeyr","Himmet Toprak Kesgin","Mehmet Fatih Amasyali"],"pdf_url":"https://arxiv.org/pdf/2412.02698v1.pdf","comment":"2024 Innovations in Intelligent Systems and Applications Conference\n (ASYU)"},{"id":"http://arxiv.org/abs/2412.02692v1","updated":"2024-12-03T18:59:10Z","published":"2024-12-03T18:59:10Z","title":"Taming Scalable Visual Tokenizer for Autoregressive Image Generation","summary":" Existing vector quantization (VQ) methods struggle with scalability, largely\nattributed to the instability of the codebook that undergoes partial updates\nduring training. The codebook is prone to collapse as utilization decreases,\ndue to the progressively widening distribution gap between non-activated codes\nand visual features. To solve the problem, we propose Index Backpropagation\nQuantization (IBQ), a new VQ method for the joint optimization of all codebook\nembeddings and the visual encoder. Applying a straight-through estimator on the\none-hot categorical distribution between the encoded feature and codebook, all\ncodes are differentiable and maintain a consistent latent space with the visual\nencoder. IBQ enables scalable training of visual tokenizers and, for the first\ntime, achieves a large-scale codebook ($2^{18}$) with high dimension ($256$)\nand high utilization. Experiments on the standard ImageNet benchmark\ndemonstrate the scalability and superiority of IBQ, achieving competitive\nresults on both reconstruction ($1.00$ rFID) and autoregressive visual\ngeneration ($2.05$ gFID). 
The code and models are available at\nhttps://github.com/TencentARC/SEED-Voken.\n","authors":["Fengyuan Shi","Zhuoyan Luo","Yixiao Ge","Yujiu Yang","Ying Shan","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2412.02692v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.11021v3","updated":"2024-12-03T18:58:22Z","published":"2024-03-16T21:40:27Z","title":"Towards Neuro-Symbolic Video Understanding","summary":" The unprecedented surge in video data production in recent years necessitates\nefficient tools to extract meaningful frames from videos for downstream tasks.\nLong-term temporal reasoning is a key desideratum for frame retrieval systems.\nWhile state-of-the-art foundation models, like VideoLLaMA and ViCLIP, are\nproficient in short-term semantic understanding, they surprisingly fail at\nlong-term reasoning across frames. A key reason for this failure is that they\nintertwine per-frame perception and temporal reasoning into a single deep\nnetwork. Hence, decoupling but co-designing semantic understanding and temporal\nreasoning is essential for efficient scene identification. 
We propose a system\nthat leverages vision-language models for semantic understanding of individual\nframes but effectively reasons about the long-term evolution of events using\nstate machines and temporal logic (TL) formulae that inherently capture memory.\nOur TL-based reasoning improves the F1 score of complex event identification by\n9-15% compared to benchmarks that use GPT4 for reasoning on state-of-the-art\nself-driving datasets such as Waymo and NuScenes.\n","authors":["Minkyu Choi","Harsh Goel","Mohammad Omama","Yunhao Yang","Sahil Shah","Sandeep Chinchali"],"pdf_url":"https://arxiv.org/pdf/2403.11021v3.pdf","comment":"Accepted by The European Conference on Computer Vision (ECCV) 2024"},{"id":"http://arxiv.org/abs/2411.16718v3","updated":"2024-12-03T18:56:47Z","published":"2024-11-22T23:59:12Z","title":"Neuro-Symbolic Evaluation of Text-to-Video Models using Formal\n Verification","summary":" Recent advancements in text-to-video models such as Sora, Gen-3, MovieGen,\nand CogVideoX are pushing the boundaries of synthetic video generation, with\nadoption seen in fields like robotics, autonomous driving, and entertainment.\nAs these models become prevalent, various metrics and benchmarks have emerged\nto evaluate the quality of the generated videos. However, these metrics\nemphasize visual quality and smoothness, neglecting temporal fidelity and\ntext-to-video alignment, which are crucial for safety-critical applications. To\naddress this gap, we introduce NeuS-V, a novel synthetic video evaluation\nmetric that rigorously assesses text-to-video alignment using neuro-symbolic\nformal verification techniques. Our approach first converts the prompt into a\nformally defined Temporal Logic (TL) specification and translates the generated\nvideo into an automaton representation. Then, it evaluates the text-to-video\nalignment by formally checking the video automaton against the TL\nspecification. 
Furthermore, we present a dataset of temporally extended prompts\nto evaluate state-of-the-art video generation models against our benchmark. We\nfind that NeuS-V demonstrates a higher correlation by over 5x with human\nevaluations when compared to existing metrics. Our evaluation further reveals\nthat current video generation models perform poorly on these temporally complex\nprompts, highlighting the need for future work in improving text-to-video\ngeneration capabilities.\n","authors":["S. P. Sharan","Minkyu Choi","Sahil Shah","Harsh Goel","Mohammad Omama","Sandeep Chinchali"],"pdf_url":"https://arxiv.org/pdf/2411.16718v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02685v1","updated":"2024-12-03T18:56:07Z","published":"2024-12-03T18:56:07Z","title":"T-REG: Preference Optimization with Token-Level Reward Regularization","summary":" Reinforcement learning from human feedback (RLHF) has been crucial in\naligning large language models (LLMs) with human values. Traditionally, RLHF\ninvolves generating responses to a query and using a reward model to assign a\nreward to the entire response. However, this approach faces challenges due to\nits reliance on a single, sparse reward, which makes it challenging for the\nmodel to identify which parts of the sequence contribute most significantly to\nthe final reward. Recent methods have attempted to address this limitation by\nintroducing token-level rewards. However, these methods often rely on either a\ntrained credit assignment model or AI annotators, raising concerns about the\nquality and reliability of the rewards. In this paper, we propose token-level\nreward regularization (T-REG), a novel approach that leverages both\nsequence-level and token-level rewards for preference optimization. Harnessing\nthe self-refinement capabilities of LLMs, our method uses contrastive prompting\nto enable LLMs to self-generate token-level rewards. 
These self-generated\nrewards then act as reward regularization, guiding the model to more\neffectively distribute sequence-level rewards across tokens. This facilitates\nbetter token-level credit assignment and enhances alignment performance.\nExperiments on the instruction following benchmarks, including Alpaca Eval 2\nand Arena-Hard, show that our method consistently outperforms baseline methods\nby up to 3.8% and 4.4%, respectively. We will release the code and models at\nhttps://github.com/wzhouad/T-REG.\n","authors":["Wenxuan Zhou","Shujian Zhang","Lingxiao Zhao","Tao Meng"],"pdf_url":"https://arxiv.org/pdf/2412.02685v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02684v1","updated":"2024-12-03T18:55:39Z","published":"2024-12-03T18:55:39Z","title":"AniGS: Animatable Gaussian Avatar from a Single Image with Inconsistent\n Gaussian Reconstruction","summary":" Generating animatable human avatars from a single image is essential for\nvarious digital human modeling applications. Existing 3D reconstruction methods\noften struggle to capture fine details in animatable models, while generative\napproaches for controllable animation, though avoiding explicit 3D modeling,\nsuffer from viewpoint inconsistencies in extreme poses and computational\ninefficiencies. In this paper, we address these challenges by leveraging the\npower of generative models to produce detailed multi-view canonical pose\nimages, which help resolve ambiguities in animatable human reconstruction. We\nthen propose a robust method for 3D reconstruction of inconsistent images,\nenabling real-time rendering during inference. Specifically, we adapt a\ntransformer-based video generation model to generate multi-view canonical pose\nimages and normal maps, pretraining on a large-scale video dataset to improve\ngeneralization. To handle view inconsistencies, we recast the reconstruction\nproblem as a 4D task and introduce an efficient 3D modeling approach using 4D\nGaussian Splatting. 
Experiments demonstrate that our method achieves\nphotorealistic, real-time animation of 3D human avatars from in-the-wild\nimages, showcasing its effectiveness and generalization capability.\n","authors":["Lingteng Qiu","Shenhao Zhu","Qi Zuo","Xiaodong Gu","Yuan Dong","Junfei Zhang","Chao Xu","Zhe Li","Weihao Yuan","Liefeng Bo","Guanying Chen","Zilong Dong"],"pdf_url":"https://arxiv.org/pdf/2412.02684v1.pdf","comment":"Project Page: https://lingtengqiu.github.io/2024/AniGS/"},{"id":"http://arxiv.org/abs/2412.02682v1","updated":"2024-12-03T18:54:49Z","published":"2024-12-03T18:54:49Z","title":"The Asymptotic Behavior of Attention in Transformers","summary":" A key component of transformers is the attention mechanism orchestrating how\neach token influences the propagation of every other token through a\ntransformer. In this paper we provide a rigorous, mathematical analysis of the\nasymptotic properties of attention in transformers. Although we present several\nresults based on different assumptions, all of them point to the same\nconclusion, all tokens asymptotically converge to each other, a phenomenon that\nhas been empirically reported in the literature. Our findings are carefully\ncompared with existing theoretical results and illustrated by simulations and\nexperimental studies using the GPT-2 model.\n","authors":["Álvaro Rodríguez Abella","João Pedro Silvestre","Paulo Tabuada"],"pdf_url":"https://arxiv.org/pdf/2412.02682v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.14052v2","updated":"2024-12-03T18:48:00Z","published":"2024-10-17T21:47:11Z","title":"From Isolated Conversations to Hierarchical Schemas: Dynamic Tree Memory\n Representation for LLMs","summary":" Recent advancements in large language models have significantly improved\ntheir context windows, yet challenges in effective long-term memory management\nremain. 
We introduce MemTree, an algorithm that leverages a dynamic,\ntree-structured memory representation to optimize the organization, retrieval,\nand integration of information, akin to human cognitive schemas. MemTree\norganizes memory hierarchically, with each node encapsulating aggregated\ntextual content, corresponding semantic embeddings, and varying abstraction\nlevels across the tree's depths. Our algorithm dynamically adapts this memory\nstructure by computing and comparing semantic embeddings of new and existing\ninformation to enrich the model's context-awareness. This approach allows\nMemTree to handle complex reasoning and extended interactions more effectively\nthan traditional memory augmentation methods, which often rely on flat lookup\ntables. Evaluations on benchmarks for multi-turn dialogue understanding and\ndocument question answering show that MemTree significantly enhances\nperformance in scenarios that demand structured memory management.\n","authors":["Alireza Rezazadeh","Zichao Li","Wei Wei","Yujia Bao"],"pdf_url":"https://arxiv.org/pdf/2410.14052v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.17861v2","updated":"2024-12-03T18:38:45Z","published":"2024-11-26T20:22:31Z","title":"Accelerating Proximal Policy Optimization Learning Using Task Prediction\n for Solving Environments with Delayed Rewards","summary":" In this paper, we tackle the challenging problem of delayed rewards in\nreinforcement learning (RL). While Proximal Policy Optimization (PPO) has\nemerged as a leading Policy Gradient method, its performance can degrade under\ndelayed rewards. We introduce two key enhancements to PPO: a hybrid policy\narchitecture that combines an offline policy (trained on expert demonstrations)\nwith an online PPO policy, and a reward shaping mechanism using Time Window\nTemporal Logic (TWTL). The hybrid architecture leverages offline data\nthroughout training while maintaining PPO's theoretical guarantees. 
Building on\nthe monotonic improvement framework of Trust Region Policy Optimization (TRPO),\nwe prove that our approach ensures improvement over both the offline policy and\nprevious iterations, with a bounded performance gap of\n$(2\\varsigma\\gamma\\alpha^2)/(1-\\gamma)^2$, where $\\alpha$ is the mixing\nparameter, $\\gamma$ is the discount factor, and $\\varsigma$ bounds the expected\nadvantage. Additionally, we prove that our TWTL-based reward shaping preserves\nthe optimal policy of the original problem. TWTL enables formal translation of\ntemporal objectives into immediate feedback signals that guide learning. We\ndemonstrate the effectiveness of our approach through extensive experiments on\nan inverted pendulum and a lunar lander environments, showing improvements in\nboth learning speed and final performance compared to standard PPO and\noffline-only approaches.\n","authors":["Ahmad Ahmad","Mehdi Kermanshah","Kevin Leahy","Zachary Serlin","Ho Chit Siu","Makai Mann","Cristian-Ioan Vasile","Roberto Tron","Calin Belta"],"pdf_url":"https://arxiv.org/pdf/2411.17861v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02659v1","updated":"2024-12-03T18:33:48Z","published":"2024-12-03T18:33:48Z","title":"Adaptive Informed Deep Neural Networks for Power Flow Analysis","summary":" This study introduces PINN4PF, an end-to-end deep learning architecture for\npower flow (PF) analysis that effectively captures the nonlinear dynamics of\nlarge-scale modern power systems. The proposed neural network (NN) architecture\nconsists of two important advancements in the training pipeline: (A) a\ndouble-head feed-forward NN that aligns with PF analysis, including an\nactivation function that adjusts to active and reactive power consumption\npatterns, and (B) a physics-based loss function that partially incorporates\npower system topology information. 
The effectiveness of the proposed\narchitecture is illustrated through 4-bus, 15-bus, 290-bus, and 2224-bus test\nsystems and is evaluated against two baselines: a linear regression model (LR)\nand a black-box NN (MLP). The comparison is based on (i) generalization\nability, (ii) robustness, (iii) impact of training dataset size on\ngeneralization ability, (iv) accuracy in approximating derived PF quantities\n(specifically line current, line active power, and line reactive power), and\n(v) scalability. Results demonstrate that PINN4PF outperforms both baselines\nacross all test systems by up to two orders of magnitude not only in terms of\ndirect criteria, e.g., generalization ability but also in terms of\napproximating derived physical quantities.\n","authors":["Zeynab Kaseb","Stavros Orfanoudakis","Pedro P. Vergara","Peter Palensky"],"pdf_url":"https://arxiv.org/pdf/2412.02659v1.pdf","comment":"10 pages, 7 figures, 4 tables"},{"id":"http://arxiv.org/abs/2412.02653v1","updated":"2024-12-03T18:27:40Z","published":"2024-12-03T18:27:40Z","title":"Scaffold or Crutch? Examining College Students' Use and Views of\n Generative AI Tools for STEM Education","summary":" Developing problem-solving competency is central to Science, Technology,\nEngineering, and Mathematics (STEM) education, yet translating this priority\ninto effective approaches to problem-solving instruction and assessment remain\na significant challenge. The recent proliferation of generative artificial\nintelligence (genAI) tools like ChatGPT in higher education introduces new\nconsiderations about how these tools can help or hinder students' development\nof STEM problem-solving competency. Our research examines these considerations\nby studying how and why college students use genAI tools in their STEM\ncoursework, focusing on their problem-solving support. We surveyed 40 STEM\ncollege students from diverse U.S. 
institutions and 28 STEM faculty to\nunderstand instructor perspectives on effective genAI tool use and guidance in\nSTEM courses. Our findings reveal high adoption rates and diverse applications\nof genAI tools among STEM students. The most common use cases include finding\nexplanations, exploring related topics, summarizing readings, and helping with\nproblem-set questions. The primary motivation for using genAI tools was to save\ntime. Moreover, over half of student participants reported simply inputting\nproblems for AI to generate solutions, potentially bypassing their own\nproblem-solving processes. These findings indicate that despite high adoption\nrates, students' current approaches to utilizing genAI tools often fall short\nin enhancing their own STEM problem-solving competencies. The study also\nexplored students' and STEM instructors' perceptions of the benefits and risks\nassociated with using genAI tools in STEM education. Our findings provide\ninsights into how to guide students on appropriate genAI use in STEM courses\nand how to design genAI-based tools to foster students' problem-solving\ncompetency.\n","authors":["Karen D. Wang","Zhangyang Wu","L'Nard Tufts II","Carl Wieman","Shima Salehi","Nick Haber"],"pdf_url":"https://arxiv.org/pdf/2412.02653v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02638v1","updated":"2024-12-03T18:10:31Z","published":"2024-12-03T18:10:31Z","title":"QA-TOOLBOX: Conversational Question-Answering for process task guidance\n in manufacturing","summary":" In this work we explore utilizing LLMs for data augmentation for\nmanufacturing task guidance system. The dataset consists of representative\nsamples of interactions with technicians working in an advanced manufacturing\nsetting. The purpose of this work to explore the task, data augmentation for\nthe supported tasks and evaluating the performance of the existing LLMs. 
We\nobserve that that task is complex requiring understanding from procedure\nspecification documents, actions and objects sequenced temporally. The dataset\nconsists of 200,000+ question/answer pairs that refer to the spec document and\nare grounded in narrations and/or video demonstrations. We compared the\nperformance of several popular open-sourced LLMs by developing a baseline using\neach LLM and then compared the responses in a reference-free setting using\nLLM-as-a-judge and compared the ratings with crowd-workers whilst validating\nthe ratings with experts.\n","authors":["Ramesh Manuvinakurike","Elizabeth Watkins","Celal Savur","Anthony Rhodes","Sovan Biswas","Gesem Gudino Mejia","Richard Beckwith","Saurav Sahay","Giuseppe Raffa","Lama Nachman"],"pdf_url":"https://arxiv.org/pdf/2412.02638v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02632v1","updated":"2024-12-03T18:01:45Z","published":"2024-12-03T18:01:45Z","title":"Scaling Image Tokenizers with Grouped Spherical Quantization","summary":" Vision tokenizers have gained a lot of attraction due to their scalability\nand compactness; previous works depend on old-school GAN-based hyperparameters,\nbiased comparisons, and a lack of comprehensive analysis of the scaling\nbehaviours. To tackle those issues, we introduce Grouped Spherical Quantization\n(GSQ), featuring spherical codebook initialization and lookup regularization to\nconstrain codebook latent to a spherical surface. Our empirical analysis of\nimage tokenizer training strategies demonstrates that GSQ-GAN achieves superior\nreconstruction quality over state-of-the-art methods with fewer training\niterations, providing a solid foundation for scaling studies. Building on this,\nwe systematically examine the scaling behaviours of GSQ, specifically in latent\ndimensionality, codebook size, and compression ratios, and their impact on\nmodel performance. 
Our findings reveal distinct behaviours at high and low\nspatial compression levels, underscoring challenges in representing\nhigh-dimensional latent spaces. We show that GSQ can restructure\nhigh-dimensional latent into compact, low-dimensional spaces, thus enabling\nefficient scaling with improved quality. As a result, GSQ-GAN achieves a 16x\ndown-sampling with a reconstruction FID (rFID) of 0.50.\n","authors":["Jiangtao Wang","Zhen Qin","Yifan Zhang","Vincent Tao Hu","Björn Ommer","Rania Briq","Stefan Kesselheim"],"pdf_url":"https://arxiv.org/pdf/2412.02632v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02626v1","updated":"2024-12-03T17:54:12Z","published":"2024-12-03T17:54:12Z","title":"Time-Reversal Provides Unsupervised Feedback to LLMs","summary":" Large Language Models (LLMs) are typically trained to predict in the forward\ndirection of time. However, recent works have shown that prompting these models\nto look back and critique their own generations can produce useful feedback.\nMotivated by this, we explore the question of whether LLMs can be empowered to\nthink (predict and score) backwards to provide unsupervised feedback that\ncomplements forward LLMs. Towards this, we introduce Time Reversed Language\nModels (TRLMs), which can score and generate queries when conditioned on\nresponses, effectively functioning in the reverse direction of time. Further,\nto effectively infer in the response to query direction, we pre-train and\nfine-tune a language model (TRLM-Ba) in the reverse token order from scratch.\nWe show empirically (and theoretically in a stylized setting) that\ntime-reversed models can indeed complement forward model predictions when used\nto score the query given response for re-ranking multiple forward generations.\nWe obtain up to 5\\% improvement on the widely used AlpacaEval Leaderboard over\nthe competent baseline of best-of-N re-ranking using self log-perplexity\nscores. 
We further show that TRLM scoring outperforms conventional forward\nscoring of response given query, resulting in significant gains in applications\nsuch as citation generation and passage retrieval. We next leverage the\ngenerative ability of TRLM to augment or provide unsupervised feedback to input\nsafety filters of LLMs, demonstrating a drastic reduction in false negative\nrate with negligible impact on false positive rates against several attacks\npublished on the popular JailbreakBench leaderboard.\n","authors":["Yerram Varun","Rahul Madhavan","Sravanti Addepalli","Arun Suggala","Karthikeyan Shanmugam","Prateek Jain"],"pdf_url":"https://arxiv.org/pdf/2412.02626v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02621v1","updated":"2024-12-03T17:50:19Z","published":"2024-12-03T17:50:19Z","title":"Medical Multimodal Foundation Models in Clinical Diagnosis and\n Treatment: Applications, Challenges, and Future Directions","summary":" Recent advancements in deep learning have significantly revolutionized the\nfield of clinical diagnosis and treatment, offering novel approaches to improve\ndiagnostic precision and treatment efficacy across diverse clinical domains,\nthus driving the pursuit of precision medicine. The growing availability of\nmulti-organ and multimodal datasets has accelerated the development of\nlarge-scale Medical Multimodal Foundation Models (MMFMs). These models, known\nfor their strong generalization capabilities and rich representational power,\nare increasingly being adapted to address a wide range of clinical tasks, from\nearly diagnosis to personalized treatment strategies. This review offers a\ncomprehensive analysis of recent developments in MMFMs, focusing on three key\naspects: datasets, model architectures, and clinical applications. 
We also\nexplore the challenges and opportunities in optimizing multimodal\nrepresentations and discuss how these advancements are shaping the future of\nhealthcare by enabling improved patient outcomes and more efficient clinical\nworkflows.\n","authors":["Kai Sun","Siyan Xue","Fuchun Sun","Haoran Sun","Yu Luo","Ling Wang","Siyuan Wang","Na Guo","Lei Liu","Tian Zhao","Xinzhou Wang","Lei Yang","Shuo Jin","Jun Yan","Jiahong Dong"],"pdf_url":"https://arxiv.org/pdf/2412.02621v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.00608v2","updated":"2024-12-03T17:49:02Z","published":"2024-11-30T23:11:44Z","title":"Leveraging LLM for Automated Ontology Extraction and Knowledge Graph\n Generation","summary":" Extracting relevant and structured knowledge from large, complex technical\ndocuments within the Reliability and Maintainability (RAM) domain is\nlabor-intensive and prone to errors. Our work addresses this challenge by\npresenting OntoKGen, a genuine pipeline for ontology extraction and Knowledge\nGraph (KG) generation. OntoKGen leverages Large Language Models (LLMs) through\nan interactive user interface guided by our adaptive iterative Chain of Thought\n(CoT) algorithm to ensure that the ontology extraction process and, thus, KG\ngeneration align with user-specific requirements. Although KG generation\nfollows a clear, structured path based on the confirmed ontology, there is no\nuniversally correct ontology as it is inherently based on the user's\npreferences. OntoKGen recommends an ontology grounded in best practices,\nminimizing user effort and providing valuable insights that may have been\noverlooked, all while giving the user complete control over the final ontology.\nHaving generated the KG based on the confirmed ontology, OntoKGen enables\nseamless integration into schemeless, non-relational databases like Neo4j. 
This\nintegration allows for flexible storage and retrieval of knowledge from\ndiverse, unstructured sources, facilitating advanced querying, analysis, and\ndecision-making. Moreover, the generated KG serves as a robust foundation for\nfuture integration into Retrieval Augmented Generation (RAG) systems, offering\nenhanced capabilities for developing domain-specific intelligent applications.\n","authors":["Mohammad Sadeq Abolhasani","Rong Pan"],"pdf_url":"https://arxiv.org/pdf/2412.00608v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02617v1","updated":"2024-12-03T17:44:23Z","published":"2024-12-03T17:44:23Z","title":"Improving Dynamic Object Interactions in Text-to-Video Generation with\n AI Feedback","summary":" Large text-to-video models hold immense potential for a wide range of\ndownstream applications. However, these models struggle to accurately depict\ndynamic object interactions, often resulting in unrealistic movements and\nfrequent violations of real-world physics. One solution inspired by large\nlanguage models is to align generated outputs with desired outcomes using\nexternal feedback. This enables the model to refine its responses autonomously,\neliminating extensive manual data collection. In this work, we investigate the\nuse of feedback to enhance the object dynamics in text-to-video models. We aim\nto answer a critical question: what types of feedback, paired with which\nspecific self-improvement algorithms, can most effectively improve text-video\nalignment and realistic object interactions? We begin by deriving a unified\nprobabilistic objective for offline RL finetuning of text-to-video models. This\nperspective highlights how design elements in existing algorithms like KL\nregularization and policy projection emerge as specific choices within a\nunified framework. 
We then use derived methods to optimize a set of text-video\nalignment metrics (e.g., CLIP scores, optical flow), but notice that they often\nfail to align with human perceptions of generation quality. To address this\nlimitation, we propose leveraging vision-language models to provide more\nnuanced feedback specifically tailored to object dynamics in videos. Our\nexperiments demonstrate that our method can effectively optimize a wide variety\nof rewards, with binary AI feedback driving the most significant improvements\nin video quality for dynamic interactions, as confirmed by both AI and human\nevaluations. Notably, we observe substantial gains when using reward signals\nderived from AI feedback, particularly in scenarios involving complex\ninteractions between multiple objects and realistic depictions of objects\nfalling.\n","authors":["Hiroki Furuta","Heiga Zen","Dale Schuurmans","Aleksandra Faust","Yutaka Matsuo","Percy Liang","Sherry Yang"],"pdf_url":"https://arxiv.org/pdf/2412.02617v1.pdf","comment":"Website: https://sites.google.com/view/aif-dynamic-t2v/"},{"id":"http://arxiv.org/abs/2412.02615v1","updated":"2024-12-03T17:43:28Z","published":"2024-12-03T17:43:28Z","title":"Projection Abstractions in Planning Under the Lenses of Abstractions for\n MDPs","summary":" The concept of abstraction has been independently developed both in the\ncontext of AI Planning and discounted Markov Decision Processes (MDPs).\nHowever, the way abstractions are built and used in the context of Planning and\nMDPs is different even though lots of commonalities can be highlighted. To this\nday there is no work trying to relate and unify the two fields on the matter of\nabstractions unraveling all the different assumptions and their effect on the\nway they can be used. 
Therefore, in this paper we aim to do so by looking at\nprojection abstractions in Planning through the lenses of discounted MDPs.\nStarting from a projection abstraction built according to Classical or\nProbabilistic Planning techniques, we will show how the same abstraction can be\nobtained under the abstraction frameworks available for discounted MDPs. Along\nthe way, we will focus on computational as well as representational advantages\nand disadvantages of both worlds pointing out new research directions that are\nof interest for both fields.\n","authors":["Giuseppe Canonaco","Alberto Pozanco","Daniel Borrajo"],"pdf_url":"https://arxiv.org/pdf/2412.02615v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02611v1","updated":"2024-12-03T17:41:23Z","published":"2024-12-03T17:41:23Z","title":"AV-Odyssey Bench: Can Your Multimodal LLMs Really Understand\n Audio-Visual Information?","summary":" Recently, multimodal large language models (MLLMs), such as GPT-4o, Gemini\n1.5 Pro, and Reka Core, have expanded their capabilities to include vision and\naudio modalities. While these models demonstrate impressive performance across\na wide range of audio-visual applications, our proposed DeafTest reveals that\nMLLMs often struggle with simple tasks humans find trivial: 1) determining\nwhich of two sounds is louder, and 2) determining which of two sounds has a\nhigher pitch. Motivated by these observations, we introduce AV-Odyssey Bench, a\ncomprehensive audio-visual benchmark designed to assess whether those MLLMs can\ntruly understand the audio-visual information. This benchmark encompasses 4,555\ncarefully crafted problems, each incorporating text, visual, and audio\ncomponents. To successfully infer answers, models must effectively leverage\nclues from both visual and audio inputs. 
To ensure precise and objective\nevaluation of MLLM responses, we have structured the questions as\nmultiple-choice, eliminating the need for human evaluation or LLM-assisted\nassessment. We benchmark a series of closed-source and open-source models and\nsummarize the observations. By revealing the limitations of current models, we\naim to provide useful insight for future dataset collection and model\ndevelopment.\n","authors":["Kaixiong Gong","Kaituo Feng","Bohao Li","Yibing Wang","Mofan Cheng","Shijia Yang","Jiaming Han","Benyou Wang","Yutong Bai","Zhuoran Yang","Xiangyu Yue"],"pdf_url":"https://arxiv.org/pdf/2412.02611v1.pdf","comment":"Project page: https://av-odyssey.github.io/"},{"id":"http://arxiv.org/abs/2412.02610v1","updated":"2024-12-03T17:41:08Z","published":"2024-12-03T17:41:08Z","title":"AI-Driven Resource Allocation Framework for Microservices in Hybrid\n Cloud Platforms","summary":" The increasing demand for scalable, efficient resource management in hybrid\ncloud environments has led to the exploration of AI-driven approaches for\ndynamic resource allocation. This paper presents an AI-driven framework for\nresource allocation among microservices in hybrid cloud platforms. The\nframework employs reinforcement learning (RL)-based resource utilization\noptimization to reduce costs and improve performance. The framework integrates\nAI models with cloud management tools to respond to challenges of dynamic\nscaling and cost-efficient low-latency service delivery. The reinforcement\nlearning model continuously adjusts provisioned resources as required by the\nmicroservices and predicts the future consumption trends to minimize both\nunder- and over-provisioning of resources. Preliminary simulation results\nindicate that using AI in the provision of resources related to costs can\nreduce expenditure by up to 30-40% compared to manual provisioning and\nthreshold-based auto-scaling approaches. 
It is also estimated that the\nefficiency in resource utilization is expected to improve by 20%-30% with a\ncorresponding latency cut of 15%-20% during the peak demand periods. This study\ncompares the AI-driven approach with existing static and rule-based resource\nallocation methods, demonstrating the capability of this new model to\noutperform them in terms of flexibility and real-time interests. The results\nindicate that reinforcement learning can make optimization of hybrid cloud\nplatforms even better, offering a 25-35% improvement in cost efficiency and the\npower of scaling for microservice-based applications. The proposed framework is\na strong and scalable solution to managing cloud resources in dynamic and\nperformance-critical environments.\n","authors":["Biman Barua","M. Shamim Kaiser"],"pdf_url":"https://arxiv.org/pdf/2412.02610v1.pdf","comment":"25 pages, 14 figures"},{"id":"http://arxiv.org/abs/2411.17404v2","updated":"2024-12-03T17:38:54Z","published":"2024-11-26T13:05:53Z","title":"BPP-Search: Enhancing Tree of Thought Reasoning for Mathematical\n Modeling Problem Solving","summary":" LLMs exhibit advanced reasoning capabilities, offering the potential to\ntransform natural language questions into mathematical models. However,\nexisting open-source datasets in operations research domain lack detailed\nannotations of the modeling process, such as variable definitions, focusing\nsolely on objective values, which hinders reinforcement learning applications.\nTo address this, we release the StructuredOR dataset, annotated with\ncomprehensive labels that capture the complete mathematical modeling process.\nWe further propose BPP-Search, a algorithm that integrates reinforcement\nlearning into a tree-of-thought structure using Beam search, a Process reward\nmodel, and a pairwise Preference algorithm. This approach enables efficient\nexploration of tree structures, avoiding exhaustive search while improving\naccuracy. 
Extensive experiments on StructuredOR, NL4OPT, and MAMO-ComplexLP\ndatasets show that BPP-Search significantly outperforms state-of-the-art\nmethods. In tree-based reasoning, BPP-Search excels in accuracy and efficiency,\nenabling faster retrieval of correct solutions.\n","authors":["Teng Wang","Wing-Yin Yu","Zhenqi He","Zehua Liu","Xiongwei Han","Hailei Gong","Han Wu","Wei Shi","Ruifeng She","Fangzhou Zhu","Tao Zhong"],"pdf_url":"https://arxiv.org/pdf/2411.17404v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02602v1","updated":"2024-12-03T17:32:47Z","published":"2024-12-03T17:32:47Z","title":"CEGI: Measuring the trade-off between efficiency and carbon emissions\n for SLMs and VLMs","summary":" This paper analyzes the performance of Small Language Models (SLMs) and\nVision Language Models (VLMs) and evaluates the trade-off between model\nperformance and carbon emissions across 4 essential tasks: Image Captioning,\nVisual Question Answering (VQA), Dialogue Summarization and Text-to-SQL\nconversion. Various SLMs and VLMs belonging to the Qwen and LLaMA architecture\nfamily are chosen and variants based on model size in terms of the number of\nparameters, quantization level and fine-tuning parameters are evaluated. The\nmodel variant's performance and carbon emissions are calculated. To quantify\nthe trade-off between model performance and carbon emissions, we introduce a\nnovel metric called CEGI (Carbon Efficient Gain Index). This metric represents\nthe carbon emission per unit percentage gain per million trainable parameters .\nThis metric provides a normalized measure to compare model's efficiency in\nterms of performance improvement relative to their environmental cost. The\nexperiment's outcome demonstrates that fine-tuning SLMs and VLMs can achieve\nperformance levels comparable to Large Language Models (LLMs) while producing\nsignificantly less carbon emissions. 
Our findings suggest that the marginal\ngains in accuracy from larger models do not justify the substantial increase in\ncarbon emissions. Leveraging lower-bit quantization levels, the proposed metric\nfurther enhances energy efficiency without compromising performance. This study\nhighlights balancing high performance and environmental sustainability. It\noffers a valuable metric for selecting models suitable for\nenvironmentally-friendly AI development.\n","authors":["Abhas Kumar","Kapil Pathak","Rajesh Kavuru","Prabhakar Srinivasan"],"pdf_url":"https://arxiv.org/pdf/2412.02602v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02594v1","updated":"2024-12-03T17:26:42Z","published":"2024-12-03T17:26:42Z","title":"PrefixLLM: LLM-aided Prefix Circuit Design","summary":" Prefix circuits are fundamental components in digital adders, widely used in\ndigital systems due to their efficiency in calculating carry signals.\nSynthesizing prefix circuits with minimized area and delay is crucial for\nenhancing the performance of modern computing systems. Recently, large language\nmodels (LLMs) have demonstrated a surprising ability to perform text generation\ntasks. We propose PrefixLLM, that leverages LLMs for prefix circuit synthesis.\nPrefixLLM transforms the prefix circuit synthesis task into a structured text\ngeneration problem, termed the Structured Prefix Circuit Representation (SPCR),\nand introduces an iterative framework to automatically and accurately generate\nvalid SPCRs. We further present a design space exploration (DSE) framework that\nuses LLMs to iteratively search for area and delay optimized prefix circuits.\nCompared to state-of-the-art, PrefixLLM can reduce the area by 3.70% under the\nsame delay constraint. 
This work highlights the use of LLMs in the synthesis of\narithmetic circuits, which can be transformed into the structured text\ngeneration.\n","authors":["Weihua Xiao","Venkata Sai Charan Putrevu","Raghu Vamshi Hemadri","Siddharth Garg","Ramesh Karri"],"pdf_url":"https://arxiv.org/pdf/2412.02594v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13846v4","updated":"2024-12-03T17:22:01Z","published":"2024-04-22T03:05:19Z","title":"Filtered Direct Preference Optimization","summary":" Reinforcement learning from human feedback (RLHF) plays a crucial role in\naligning language models with human preferences. While the significance of\ndataset quality is generally recognized, explicit investigations into its\nimpact within the RLHF framework, to our knowledge, have been limited. This\npaper addresses the issue of text quality within the preference dataset by\nfocusing on direct preference optimization (DPO), an increasingly adopted\nreward-model-free RLHF method. We confirm that text quality significantly\ninfluences the performance of models optimized with DPO more than those\noptimized with reward-model-based RLHF. Building on this new insight, we\npropose an extension of DPO, termed filtered direct preference optimization\n(fDPO). fDPO uses a trained reward model to monitor the quality of texts within\nthe preference dataset during DPO training. Samples of lower quality are\ndiscarded based on comparisons with texts generated by the model being\noptimized, resulting in a more accurate dataset. Experimental results\ndemonstrate that fDPO enhances the final model performance. 
Our code is\navailable at https://github.com/CyberAgentAILab/filtered-dpo.\n","authors":["Tetsuro Morimura","Mitsuki Sakamoto","Yuu Jinnai","Kenshi Abe","Kaito Ariu"],"pdf_url":"https://arxiv.org/pdf/2404.13846v4.pdf","comment":"EMNLP 2024"},{"id":"http://arxiv.org/abs/2412.02588v1","updated":"2024-12-03T17:17:27Z","published":"2024-12-03T17:17:27Z","title":"Explainable CTR Prediction via LLM Reasoning","summary":" Recommendation Systems have become integral to modern user experiences, but\nlack transparency in their decision-making processes. Existing explainable\nrecommendation methods are hindered by reliance on a post-hoc paradigm, wherein\nexplanation generators are trained independently of the underlying recommender\nmodels. This paradigm necessitates substantial human effort in data\nconstruction and raises concerns about explanation reliability. In this paper,\nwe present ExpCTR, a novel framework that integrates large language model based\nexplanation generation directly into the CTR prediction process. Inspired by\nrecent advances in reinforcement learning, we employ two carefully designed\nreward mechanisms, LC alignment, which ensures explanations reflect user\nintentions, and IC alignment, which maintains consistency with traditional\nID-based CTR models. Our approach incorporates an efficient training paradigm\nwith LoRA and a three-stage iterative process. ExpCTR circumvents the need for\nextensive explanation datasets while fostering synergy between CTR prediction\nand explanation generation. 
Experimental results demonstrate that ExpCTR\nsignificantly enhances both recommendation accuracy and interpretability across\nthree real-world datasets.\n","authors":["Xiaohan Yu","Li Zhang","Chong Chen"],"pdf_url":"https://arxiv.org/pdf/2412.02588v1.pdf","comment":"WSDM 2025"},{"id":"http://arxiv.org/abs/2412.02579v1","updated":"2024-12-03T17:04:20Z","published":"2024-12-03T17:04:20Z","title":"Factored space models: Towards causality between levels of abstraction","summary":" Causality plays an important role in understanding intelligent behavior, and\nthere is a wealth of literature on mathematical models for causality, most of\nwhich is focused on causal graphs. Causal graphs are a powerful tool for a wide\nrange of applications, in particular when the relevant variables are known and\nat the same level of abstraction. However, the given variables can also be\nunstructured data, like pixels of an image. Meanwhile, the causal variables,\nsuch as the positions of objects in the image, can be arbitrary deterministic\nfunctions of the given variables. Moreover, the causal variables may form a\nhierarchy of abstractions, in which the macro-level variables are deterministic\nfunctions of the micro-level variables. Causal graphs are limited when it comes\nto modeling this kind of situation. In the presence of deterministic\nrelationships there is generally no causal graph that satisfies both the Markov\ncondition and the faithfulness condition. We introduce factored space models as\nan alternative to causal graphs which naturally represent both probabilistic\nand deterministic relationships at all levels of abstraction. Moreover, we\nintroduce structural independence and establish that it is equivalent to\nstatistical independence in every distribution that factorizes over the\nfactored space. 
This theorem generalizes the classical soundness and\ncompleteness theorem for d-separation.\n","authors":["Scott Garrabrant","Matthias Georg Mayer","Magdalena Wache","Leon Lang","Sam Eisenstat","Holger Dell"],"pdf_url":"https://arxiv.org/pdf/2412.02579v1.pdf","comment":"29 pages"},{"id":"http://arxiv.org/abs/2409.05305v2","updated":"2024-12-03T17:03:57Z","published":"2024-09-09T03:26:07Z","title":"Closed-Form Interpretation of Neural Network Latent Spaces with Symbolic\n Gradients","summary":" It has been demonstrated in many scientific fields that artificial neural\nnetworks like autoencoders or Siamese networks encode meaningful concepts in\ntheir latent spaces. However, there does not exist a comprehensive framework\nfor retrieving this information in a human-readable form without prior\nknowledge. In order to extract these concepts, we introduce a framework for\nfinding closed-form interpretations of neurons in latent spaces of artificial\nneural networks. The interpretation framework is based on embedding trained\nneural networks into an equivalence class of functions that encode the same\nconcept. We interpret these neural networks by finding an intersection between\nthe equivalence class and human-readable equations defined by a symbolic search\nspace. The approach is demonstrated by retrieving invariants of matrices and\nconserved quantities of dynamical systems from latent spaces of Siamese neural\nnetworks.\n","authors":["Zakaria Patel","Sebastian J. 
Wetzel"],"pdf_url":"https://arxiv.org/pdf/2409.05305v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02574v1","updated":"2024-12-03T16:59:30Z","published":"2024-12-03T16:59:30Z","title":"Generating Critical Scenarios for Testing Automated Driving Systems","summary":" Autonomous vehicles (AVs) have demonstrated significant potential in\nrevolutionizing transportation, yet ensuring their safety and reliability\nremains a critical challenge, especially when exposed to dynamic and\nunpredictable environments. Real-world testing of an Autonomous Driving System\n(ADS) is both expensive and risky, making simulation-based testing a preferred\napproach. In this paper, we propose AVASTRA, a Reinforcement Learning\n(RL)-based approach to generate realistic critical scenarios for testing ADSs\nin simulation environments. To capture the complexity of driving scenarios,\nAVASTRA comprehensively represents the environment by both the internal states\nof an ADS under-test (e.g., the status of the ADS's core components, speed, or\nacceleration) and the external states of the surrounding factors in the\nsimulation environment (e.g., weather, traffic flow, or road condition).\nAVASTRA trains the RL agent to effectively configure the simulation environment\nthat places the AV in dangerous situations and potentially leads it to\ncollisions. We introduce a diverse set of actions that allows the RL agent to\nsystematically configure both environmental conditions and traffic\nparticipants. Additionally, based on established safety requirements, we\nenforce heuristic constraints to ensure the realism and relevance of the\ngenerated test scenarios. AVASTRA is evaluated on two popular simulation maps\nwith four different road configurations. Our results show AVASTRA's ability to\noutperform the state-of-the-art approach by generating 30% to 115% more\ncollision scenarios. Compared to the baseline based on Random Search, AVASTRA\nachieves up to 275% better performance. 
These results highlight the\neffectiveness of AVASTRA in enhancing the safety testing of AVs through\nrealistic comprehensive critical scenario generation.\n","authors":["Trung-Hieu Nguyen","Truong-Giang Vuong","Hong-Nam Duong","Son Nguyen","Hieu Dinh Vo","Toshiaki Aoki","Thu-Trang Nguyen"],"pdf_url":"https://arxiv.org/pdf/2412.02574v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02570v1","updated":"2024-12-03T16:55:27Z","published":"2024-12-03T16:55:27Z","title":"TAB-Fields: A Maximum Entropy Framework for Mission-Aware Adversarial\n Planning","summary":" Autonomous agents operating in adversarial scenarios face a fundamental\nchallenge: while they may know their adversaries' high-level objectives, such\nas reaching specific destinations within time constraints, the exact policies\nthese adversaries will employ remain unknown. Traditional approaches address\nthis challenge by treating the adversary's state as a partially observable\nelement, leading to a formulation as a Partially Observable Markov Decision\nProcess (POMDP). However, the induced belief-space dynamics in a POMDP require\nknowledge of the system's transition dynamics, which, in this case, depend on\nthe adversary's unknown policy. Our key observation is that while an\nadversary's exact policy is unknown, their behavior is necessarily constrained\nby their mission objectives and the physical environment, allowing us to\ncharacterize the space of possible behaviors without assuming specific\npolicies. In this paper, we develop Task-Aware Behavior Fields (TAB-Fields), a\nrepresentation that captures adversary state distributions over time by\ncomputing the most unbiased probability distribution consistent with known\nconstraints. We construct TAB-Fields by solving a constrained optimization\nproblem that minimizes additional assumptions about adversary behavior beyond\nmission and environmental requirements. 
We integrate TAB-Fields with standard\nplanning algorithms by introducing TAB-conditioned POMCP, an adaptation of\nPartially Observable Monte Carlo Planning. Through experiments in simulation\nwith underwater robots and hardware implementations with ground robots, we\ndemonstrate that our approach achieves superior performance compared to\nbaselines that either assume specific adversary policies or neglect mission\nconstraints altogether. Evaluation videos and code are available at\nhttps://tab-fields.github.io.\n","authors":["Gokul Puthumanaillam","Jae Hyuk Song","Nurzhan Yesmagambet","Shinkyu Park","Melkior Ornik"],"pdf_url":"https://arxiv.org/pdf/2412.02570v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.00535v2","updated":"2024-12-03T16:55:24Z","published":"2024-11-30T16:58:42Z","title":"FullStack Bench: Evaluating LLMs as Full Stack Coders","summary":" As the capabilities of code large language models (LLMs) continue to expand,\ntheir applications across diverse code intelligence domains are rapidly\nincreasing. However, most existing datasets only evaluate limited application\ndomains. To address this gap, we have developed a comprehensive code evaluation\ndataset FullStack Bench focusing on full-stack programming, which encompasses a\nwide range of application domains (e.g., basic programming, data analysis,\nsoftware engineering, mathematics, and machine learning). Besides, to assess\nmultilingual programming capabilities, in FullStack Bench, we design real-world\ninstructions and corresponding unit test cases from 16 widely-used programming\nlanguages to reflect real-world usage scenarios rather than simple\ntranslations. 
Moreover, we also release an effective code sandbox execution\ntool (i.e., SandboxFusion) supporting various programming languages and\npackages to evaluate the performance of our FullStack Bench efficiently.\nComprehensive experimental results on our FullStack Bench demonstrate the\nnecessity and effectiveness of our FullStack Bench and SandboxFusion.\n","authors":["Siyao Liu","He Zhu","Jerry Liu","Shulin Xin","Aoyan Li","Rui Long","Li Chen","Jack Yang","Jinxiang Xia","Z. Y. Peng","Shukai Liu","Zhaoxiang Zhang","Jing Mai","Ge Zhang","Wenhao Huang","Kai Shen","Liang Xiang"],"pdf_url":"https://arxiv.org/pdf/2412.00535v2.pdf","comment":"26 pages"},{"id":"http://arxiv.org/abs/2412.02568v1","updated":"2024-12-03T16:54:46Z","published":"2024-12-03T16:54:46Z","title":"Segmentation of Coronary Artery Stenosis in X-ray Angiography using\n Mamba Models","summary":" Coronary artery disease stands as one of the primary contributors to global\nmortality rates. The automated identification of coronary artery stenosis from\nX-ray images plays a critical role in the diagnostic process for coronary heart\ndisease. This task is challenging due to the complex structure of coronary\narteries, intrinsic noise in X-ray images, and the fact that stenotic coronary\narteries appear narrow and blurred in X-ray angiographies. This study employs\nfive different variants of the Mamba-based model and one variant of the Swin\nTransformer-based model, primarily based on the U-Net architecture, for the\nlocalization of stenosis in Coronary artery disease. 
Our best results showed an\nF1 score of 68.79% for the U-Mamba BOT model, representing an 11.8% improvement\nover the semi-supervised approach.\n","authors":["Ali Rostami","Fatemeh Fouladi","Hedieh Sajedi"],"pdf_url":"https://arxiv.org/pdf/2412.02568v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02563v1","updated":"2024-12-03T16:52:06Z","published":"2024-12-03T16:52:06Z","title":"Semantic Tokens in Retrieval Augmented Generation","summary":" Retrieval-Augmented Generation (RAG) architectures have recently garnered\nsignificant attention for their ability to improve truth grounding and\ncoherence in natural language processing tasks. However, the reliability of RAG\nsystems in producing accurate answers diminishes as the volume of data they\naccess increases. Even with smaller datasets, these systems occasionally fail\nto address simple queries. This issue arises from their dependence on\nstate-of-the-art large language models (LLMs), which can introduce uncertainty\ninto the system's outputs. In this work, I propose a novel Comparative RAG\nsystem that introduces an evaluator module to bridge the gap between\nprobabilistic RAG systems and deterministically verifiable responses. The\nevaluator compares external recommendations with the retrieved document chunks,\nadding a decision-making layer that enhances the system's reliability. This\napproach ensures that the chunks retrieved are both semantically relevant and\nlogically consistent with deterministic insights, thereby improving the\naccuracy and overall efficiency of RAG systems. 
This framework paves the way\nfor more reliable and scalable question-answering applications in domains\nrequiring high precision and verifiability.\n","authors":["Joel Suro"],"pdf_url":"https://arxiv.org/pdf/2412.02563v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02539v1","updated":"2024-12-03T16:32:57Z","published":"2024-12-03T16:32:57Z","title":"Graph-Powered Defense: Controller Area Network Intrusion Detection for\n Unmanned Aerial Vehicles","summary":" The network of services, including delivery, farming, and environmental\nmonitoring, has experienced exponential expansion in the past decade with\nUnmanned Aerial Vehicles (UAVs). Yet, UAVs are not robust enough against\ncyberattacks, especially on the Controller Area Network (CAN) bus. The CAN bus\nis a general-purpose vehicle-bus standard to enable microcontrollers and\nin-vehicle computers to interact, primarily connecting different Electronic\nControl Units (ECUs). In this study, we focus on solving some of the most\ncritical security weaknesses in UAVs by developing a novel graph-based\nintrusion detection system (IDS) leveraging the Uncomplicated Application-level\nVehicular Communication and Networking (UAVCAN) protocol. First, we decode CAN\nmessages based on UAVCAN protocol specification; second, we present a\ncomprehensive method of transforming tabular UAVCAN messages into graph\nstructures. Lastly, we apply various graph-based machine learning models for\ndetecting cyber-attacks on the CAN bus, including graph convolutional neural\nnetworks (GCNNs), graph attention networks (GATs), Graph Sample and Aggregate\nNetworks (GraphSAGE), and graph structure-based transformers. 
Our findings show\nthat inductive models such as GATs, GraphSAGE, and graph-based transformers can\nachieve competitive and even better accuracy than transductive models like\nGCNNs in detecting various types of intrusions, with minimum information on\nprotocol specification, thus providing a generic robust solution for CAN bus\nsecurity for the UAVs. We also compared our results with baseline single-layer\nLong Short-Term Memory (LSTM) and found that all our graph-based models perform\nbetter without using any decoded features based on the UAVCAN protocol,\nhighlighting higher detection performance with protocol-independent capability.\n","authors":["Reek Majumder","Gurcan Comert","David Werth","Adrian Gale","Mashrur Chowdhury","M Sabbir Salek"],"pdf_url":"https://arxiv.org/pdf/2412.02539v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.10578v5","updated":"2024-12-03T16:26:09Z","published":"2024-10-14T14:52:23Z","title":"Burning RED: Unlocking Subtask-Driven Reinforcement Learning and\n Risk-Awareness in Average-Reward Markov Decision Processes","summary":" Average-reward Markov decision processes (MDPs) provide a foundational\nframework for sequential decision-making under uncertainty. However,\naverage-reward MDPs have remained largely unexplored in reinforcement learning\n(RL) settings, with the majority of RL-based efforts having been allocated to\nepisodic and discounted MDPs. In this work, we study a unique structural\nproperty of average-reward MDPs and utilize it to introduce Reward-Extended\nDifferential (or RED) reinforcement learning: a novel RL framework that can be\nused to effectively and efficiently solve various subtasks simultaneously in\nthe average-reward setting. We introduce a family of RED learning algorithms\nfor prediction and control, including proven-convergent algorithms for the\ntabular case. 
We then showcase the power of these algorithms by demonstrating\nhow they can be used to learn a policy that optimizes, for the first time, the\nwell-known conditional value-at-risk (CVaR) risk measure in a fully-online\nmanner, without the use of an explicit bi-level optimization scheme or an\naugmented state-space.\n","authors":["Juan Sebastian Rojas","Chi-Guhn Lee"],"pdf_url":"https://arxiv.org/pdf/2410.10578v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02530v1","updated":"2024-12-03T16:23:02Z","published":"2024-12-03T16:23:02Z","title":"WEM-GAN: Wavelet transform based facial expression manipulation","summary":" Facial expression manipulation aims to change human facial expressions\nwithout affecting face recognition. In order to transform the facial\nexpressions to target expressions, previous methods relied on expression labels\nto guide the manipulation process. However, these methods failed to preserve\nthe details of facial features, which causes the weakening or the loss of\nidentity information in the output image. In our work, we propose WEM-GAN, in\nshort for wavelet-based expression manipulation GAN, which puts more efforts on\npreserving the details of the original image in the editing process. Firstly,\nwe take advantage of the wavelet transform technique and combine it with our\ngenerator with a U-net autoencoder backbone, in order to improve the\ngenerator's ability to preserve more details of facial features. Secondly, we\nalso implement the high-frequency component discriminator, and use\nhigh-frequency domain adversarial loss to further constrain the optimization of\nour model, providing the generated face image with more abundant details.\nAdditionally, in order to narrow the gap between generated facial expressions\nand target expressions, we use residual connections between encoder and\ndecoder, while also using relative action units (AUs) several times. 
Extensive\nqualitative and quantitative experiments have demonstrated that our model\nperforms better in preserving identity features, editing capability, and image\ngeneration quality on the AffectNet dataset. It also shows superior performance\nin metrics such as Average Content Distance (ACD) and Expression Distance (ED).\n","authors":["Dongya Sun","Yunfei Hu","Xianzhe Zhang","Yingsong Hu"],"pdf_url":"https://arxiv.org/pdf/2412.02530v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02528v1","updated":"2024-12-03T16:21:37Z","published":"2024-12-03T16:21:37Z","title":"Bias Analysis of AI Models for Undergraduate Student Admissions","summary":" Bias detection and mitigation is an active area of research in machine\nlearning. This work extends previous research done by the authors to provide a\nrigorous and more complete analysis of the bias found in AI predictive models.\nAdmissions data spanning six years was used to create an AI model to determine\nwhether a given student would be directly admitted into the School of Science\nunder various scenarios at a large urban research university. During this time,\nsubmission of standardized test scores as part of an application became\noptional which led to interesting questions about the impact of standardized\ntest scores on admission decisions. We developed and analyzed AI models to\nunderstand which variables are important in admissions decisions, and how the\ndecision to exclude test scores affects the demographics of the students who\nare admitted. We then evaluated the predictive models to detect and analyze\nbiases these models may carry with respect to three variables chosen to\nrepresent sensitive populations: gender, race, and whether a student was the\nfirst in his or her family to attend college. We also extended our analysis to\nshow that the biases detected were persistent. 
Finally, we included several\nfairness metrics in our analysis and discussed the uses and limitations of\nthese metrics.\n","authors":["Kelly Van Busum","Shiaofen Fang"],"pdf_url":"https://arxiv.org/pdf/2412.02528v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.07712v3","updated":"2024-12-03T16:17:32Z","published":"2024-08-13T23:08:06Z","title":"Introduction to Reinforcement Learning","summary":" Reinforcement Learning (RL), a subfield of Artificial Intelligence (AI),\nfocuses on training agents to make decisions by interacting with their\nenvironment to maximize cumulative rewards. This paper provides an overview of\nRL, covering its core concepts, methodologies, and resources for further\nlearning. It offers a thorough explanation of fundamental components such as\nstates, actions, policies, and reward signals, ensuring readers develop a solid\nfoundational understanding. Additionally, the paper presents a variety of RL\nalgorithms, categorized based on the key factors such as model-free,\nmodel-based, value-based, policy-based, and other key factors. Resources for\nlearning and implementing RL, such as books, courses, and online communities\nare also provided. By offering a clear, structured introduction, this paper\naims to simplify the complexities of RL for beginners, providing a\nstraightforward pathway to understanding.\n","authors":["Majid Ghasemi","Dariush Ebrahimi"],"pdf_url":"https://arxiv.org/pdf/2408.07712v3.pdf","comment":"19 pages"},{"id":"http://arxiv.org/abs/2412.02520v1","updated":"2024-12-03T16:13:42Z","published":"2024-12-03T16:13:42Z","title":"Cooperative Cruising: Reinforcement Learning based Time-Headway Control\n for Increased Traffic Efficiency","summary":" The proliferation of Connected Automated Vehicles represents an unprecedented\nopportunity for improving driving efficiency and alleviating traffic\ncongestion. 
However, existing research fails to address realistic multi-lane\nhighway scenarios without assuming connectivity, perception, and control\ncapabilities that are typically unavailable in current vehicles. This paper\nproposes a novel AI system that is the first to improve highway traffic\nefficiency compared with human-like traffic in realistic, simulated multi-lane\nscenarios, while relying on existing connectivity, perception, and control\ncapabilities. At the core of our approach is a reinforcement learning based\ncontroller that dynamically communicates time-headways to automated vehicles\nnear bottlenecks based on real-time traffic conditions. These desired\ntime-headways are then used by Adaptive Cruise Control (ACC) systems to adjust\ntheir following distance. By (i) integrating existing traffic estimation\ntechnology and low-bandwidth vehicle-to-infrastructure connectivity, (ii)\nleveraging safety-certified ACC systems, and (iii) targeting localized\nbottleneck challenges that can be addressed independently in different\nlocations, we propose a practical, safe, and scalable system that can\npositively impact numerous road users.\n","authors":["Yaron Veksler","Sharon Hornstein","Han Wang","Maria Laura Delle Monache","Daniel Urieli"],"pdf_url":"https://arxiv.org/pdf/2412.02520v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.00876v2","updated":"2024-12-03T16:12:09Z","published":"2024-12-01T16:32:31Z","title":"Dynamic-LLaVA: Efficient Multimodal Large Language Models via Dynamic\n Vision-language Context Sparsification","summary":" Multimodal Large Language Models (MLLMs) have achieved remarkable success in\nvision understanding, reasoning, and interaction. However, the inference\ncomputation and memory increase progressively with the generation of output\ntokens during decoding, directly affecting the efficacy of MLLMs. Existing\nmethods attempt to reduce the vision context redundancy to achieve efficient\nMLLMs. 
Unfortunately, the efficiency benefits of the vision context reduction\nin the prefill stage gradually diminish during the decoding stage. To address\nthis problem, we proposed a dynamic vision-language context sparsification\nframework Dynamic-LLaVA, which dynamically reduces the redundancy of vision\ncontext in the prefill stage and decreases the memory and computation overhead\nof the generated language context during decoding. Dynamic-LLaVA designs a\ntailored sparsification inference scheme for different inference modes, i.e.,\nprefill, decoding with and without KV cache, to achieve efficient inference of\nMLLMs. In practice, Dynamic-LLaVA can reduce computation consumption by\n$\\sim$75\\% in the prefill stage. Meanwhile, throughout the entire generation\nprocess of MLLMs, Dynamic-LLaVA reduces the $\\sim$50\\% computation consumption\nunder decoding without KV cache, while saving $\\sim$50\\% GPU memory overhead\nwhen decoding with KV cache, due to the vision-language context sparsification.\nExtensive experiments also demonstrate that Dynamic-LLaVA achieves efficient\ninference for MLLMs with negligible understanding and generation ability\ndegradation or even performance gains compared to the full-context inference\nbaselines. Code is available at https://github.com/Osilly/dynamic_llava .\n","authors":["Wenxuan Huang","Zijie Zhai","Yunhang Shen","Shaoshen Cao","Fei Zhao","Xiangfeng Xu","Zheyu Ye","Shaohui Lin"],"pdf_url":"https://arxiv.org/pdf/2412.00876v2.pdf","comment":"Code is available at https://github.com/Osilly/dynamic_llava"},{"id":"http://arxiv.org/abs/2412.01491v2","updated":"2024-12-03T16:01:54Z","published":"2024-12-02T13:42:36Z","title":"Understanding complex crowd dynamics with generative neural simulators","summary":" Understanding the dynamics of pedestrian crowds is an outstanding challenge\ncrucial for designing efficient urban infrastructure and ensuring safe crowd\nmanagement. 
To this end, both small-scale laboratory and large-scale real-world\nmeasurements have been used. However, these approaches respectively lack\nstatistical resolution and parametric controllability, both essential to\ndiscovering physical relationships underlying the complex stochastic dynamics\nof crowds. Here, we establish an investigation paradigm that offers\nlaboratory-like controllability, while ensuring the statistical resolution of\nlarge-scale real-world datasets. Using our data-driven Neural Crowd Simulator\n(NeCS), which we train on large-scale data and validate against key statistical\nfeatures of crowd dynamics, we show that we can perform effective surrogate\ncrowd dynamics experiments without training on specific scenarios. We not only\nreproduce known experimental results on pairwise avoidance, but also uncover\nthe vision-guided and topological nature of N-body interactions. These findings\nshow how virtual experiments based on neural simulation enable data-driven\nscientific discovery.\n","authors":["Koen Minartz","Fleur Hendriks","Simon Martinus Koop","Alessandro Corbetta","Vlado Menkovski"],"pdf_url":"https://arxiv.org/pdf/2412.01491v2.pdf","comment":"26 pages, 6 figures"},{"id":"http://arxiv.org/abs/2405.13220v2","updated":"2024-12-03T16:00:40Z","published":"2024-05-21T22:00:34Z","title":"Paired Autoencoders for Likelihood-free Estimation in Inverse Problems","summary":" We consider the solution of nonlinear inverse problems where the forward\nproblem is a discretization of a partial differential equation. Such problems\nare notoriously difficult to solve in practice and require minimizing a\ncombination of a data-fit term and a regularization term. The main\ncomputational bottleneck of typical algorithms is the direct estimation of the\ndata misfit. Therefore, likelihood-free approaches have become appealing\nalternatives. 
Nonetheless, difficulties in generalization and limitations in\naccuracy have hindered their broader utility and applicability. In this work,\nwe use a paired autoencoder framework as a likelihood-free estimator for\ninverse problems. We show that the use of such an architecture allows us to\nconstruct a solution efficiently and to overcome some known open problems when\nusing likelihood-free estimators. In particular, our framework can assess the\nquality of the solution and improve on it if needed. We demonstrate the\nviability of our approach using examples from full waveform inversion and\ninverse electromagnetic imaging.\n","authors":["Matthias Chung","Emma Hart","Julianne Chung","Bas Peters","Eldad Haber"],"pdf_url":"https://arxiv.org/pdf/2405.13220v2.pdf","comment":"18 pages, 6 figures"},{"id":"http://arxiv.org/abs/2402.12317v2","updated":"2024-12-03T15:56:26Z","published":"2024-02-19T17:37:28Z","title":"EVOR: Evolving Retrieval for Code Generation","summary":" Recently the retrieval-augmented generation (RAG) has been successfully\napplied in code generation. However, existing pipelines for retrieval-augmented\ncode generation (RACG) employ static knowledge bases with a single source,\nlimiting the adaptation capabilities of Large Language Models (LLMs) to domains\nthey have insufficient knowledge of. In this work, we develop a novel pipeline,\nEVOR, that employs the synchronous evolution of both queries and diverse\nknowledge bases. On two realistic settings where the external knowledge is\nrequired to solve code generation tasks, we compile four new datasets\nassociated with frequently updated libraries and long-tail programming\nlanguages, named EVOR-BENCH. Extensive experiments demonstrate that EVOR\nachieves two to four times of execution accuracy compared to other methods such\nas Reflexion (Shinn et al., 2024), DocPrompting (Zhou et al., 2023), etc. 
We\ndemonstrate that EVOR is flexible and can be easily combined with them to\nachieve further improvement. Further analysis reveals that EVOR benefits from\nthe synchronous evolution of queries and documents and the diverse information\nsources in the knowledge base. We hope that our studies will inspire more\ninsights into the design of advanced RACG pipelines in future research. Our\nmodel, code, and data are available at https://arks-codegen.github.io.\n","authors":["Hongjin Su","Shuyang Jiang","Yuhang Lai","Haoyuan Wu","Boao Shi","Che Liu","Qian Liu","Tao Yu"],"pdf_url":"https://arxiv.org/pdf/2402.12317v2.pdf","comment":"Retrieval-augmented code generation"},{"id":"http://arxiv.org/abs/2412.02509v1","updated":"2024-12-03T15:48:33Z","published":"2024-12-03T15:48:33Z","title":"FCL-ViT: Task-Aware Attention Tuning for Continual Learning","summary":" Continual Learning (CL) involves adapting the prior Deep Neural Network (DNN)\nknowledge to new tasks, without forgetting the old ones. However, modern CL\ntechniques focus on provisioning memory capabilities to existing DNN models\nrather than designing new ones that are able to adapt according to the task at\nhand. This paper presents the novel Feedback Continual Learning Vision\nTransformer (FCL-ViT) that uses a feedback mechanism to generate real-time\ndynamic attention features tailored to the current task. The FCL-ViT operates\nin two Phases. In phase 1, the generic image features are produced and\ndetermine where the Transformer should attend on the current image. In phase 2,\ntask-specific image features are generated that leverage dynamic attention. To\nthis end, Tunable self-Attention Blocks (TABs) and Task Specific Blocks (TSBs)\nare introduced that operate in both phases and are responsible for tuning the\nTABs attention, respectively. 
The FCL-ViT surpasses state-of-the-art\nperformance on Continual Learning compared to benchmark methods, while\nretaining a small number of trainable DNN parameters.\n","authors":["Anestis Kaimakamidis","Ioannis Pitas"],"pdf_url":"https://arxiv.org/pdf/2412.02509v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.00430v2","updated":"2024-12-03T15:43:49Z","published":"2024-11-30T10:56:30Z","title":"Predictive Models in Sequential Recommendations: Bridging Performance\n Laws with Data Quality Insights","summary":" Sequential Recommendation (SR) plays a critical role in predicting users'\nsequential preferences. Despite its growing prominence in various industries,\nthe increasing scale of SR models incurs substantial computational costs and\nunpredictability, challenging developers to manage resources efficiently. Under\nthis predicament, Scaling Laws have achieved significant success by examining\nthe loss as models scale up. However, there remains a disparity between loss\nand model performance, which is of greater concern in practical applications.\nMoreover, as data continues to expand, it incorporates repetitive and\ninefficient data. In response, we introduce the Performance Law for SR models,\nwhich aims to theoretically investigate and model the relationship between\nmodel performance and data quality. Specifically, we first fit the HR and NDCG\nmetrics to transformer-based SR models. Subsequently, we propose Approximate\nEntropy (ApEn) to assess data quality, presenting a more nuanced approach\ncompared to traditional data quantity metrics. 
Our method enables accurate\npredictions across various dataset scales and model sizes, demonstrating a\nstrong correlation in large SR models and offering insights into achieving\noptimal performance for any given model configuration.\n","authors":["Tingjia Shen","Hao Wang","Chuhan Wu","Jin Yao Chin","Wei Guo","Yong Liu","Huifeng Guo","Defu Lian","Ruiming Tang","Enhong Chen"],"pdf_url":"https://arxiv.org/pdf/2412.00430v2.pdf","comment":"12 pages, 5 figures"},{"id":"http://arxiv.org/abs/2412.02508v1","updated":"2024-12-03T15:39:05Z","published":"2024-12-03T15:39:05Z","title":"Towards Rich Emotions in 3D Avatars: A Text-to-3D Avatar Generation\n Benchmark","summary":" Producing emotionally dynamic 3D facial avatars with text derived from spoken\nwords (Emo3D) has been a pivotal research topic in 3D avatar generation. While\nprogress has been made in general-purpose 3D avatar generation, the exploration\nof generating emotional 3D avatars remains scarce, primarily due to the\ncomplexities of identifying and rendering rich emotions from spoken words. This\npaper reexamines Emo3D generation and draws inspiration from human processes,\nbreaking down Emo3D into two cascading steps: Text-to-3D Expression Mapping\n(T3DEM) and 3D Avatar Rendering (3DAR). T3DEM is the most crucial step in\ndetermining the quality of Emo3D generation and encompasses three key\nchallenges: Expression Diversity, Emotion-Content Consistency, and Expression\nFluidity. To address these challenges, we introduce a novel benchmark to\nadvance research in Emo3D generation. First, we present EmoAva, a large-scale,\nhigh-quality dataset for T3DEM, comprising 15,000 text-to-3D expression\nmappings that characterize the aforementioned three challenges in Emo3D\ngeneration. Furthermore, we develop various metrics to effectively evaluate\nmodels against these identified challenges. 
Next, to effectively model the\nconsistency, diversity, and fluidity of human expressions in the T3DEM step, we\npropose the Continuous Text-to-Expression Generator, which employs an\nautoregressive Conditional Variational Autoencoder for expression code\ngeneration, enhanced with Latent Temporal Attention and Expression-wise\nAttention mechanisms. Finally, to further enhance the 3DAR step on rendering\nhigher-quality subtle expressions, we present the Globally-informed Gaussian\nAvatar (GiGA) model. GiGA incorporates a global information mechanism into 3D\nGaussian representations, enabling the capture of subtle micro-expressions and\nseamless transitions between emotional states.\n","authors":["Haidong Xu","Meishan Zhang","Hao Ju","Zhedong Zheng","Hongyuan Zhu","Erik Cambria","Min Zhang","Hao Fei"],"pdf_url":"https://arxiv.org/pdf/2412.02508v1.pdf","comment":"18 pages, 14 figures. Project website:\n https://github.com/WalkerMitty/EmoAva"},{"id":"http://arxiv.org/abs/2311.18644v2","updated":"2024-12-03T15:22:38Z","published":"2023-11-30T15:53:02Z","title":"Exploring the hierarchical structure of human plans via program\n generation","summary":" Human behavior is often assumed to be hierarchically structured, made up of\nabstract actions that can be decomposed into concrete actions. However,\nbehavior is typically measured as a sequence of actions, which makes it\ndifficult to infer its hierarchical structure. In this paper, we explore how\npeople form hierarchically structured plans, using an experimental paradigm\nwith observable hierarchical representations: participants create programs that\nproduce sequences of actions in a language with explicit hierarchical\nstructure. This task lets us test two well-established principles of human\nbehavior: utility maximization (i.e. using fewer actions) and minimum\ndescription length (MDL; i.e. having a shorter program). 
We find that humans\nare sensitive to both metrics, but that both accounts fail to predict a\nqualitative feature of human-created programs, namely that people prefer\nprograms with reuse over and above the predictions of MDL. We formalize this\npreference for reuse by extending the MDL account into a generative model over\nprograms, modeling hierarchy choice as the induction of a grammar over actions.\nOur account can explain the preference for reuse and provides better\npredictions of human behavior, going beyond simple accounts of compressibility\nto highlight a principle that guides hierarchical planning.\n","authors":["Carlos G. Correa","Sophia Sanborn","Mark K. Ho","Frederick Callaway","Nathaniel D. Daw","Thomas L. Griffiths"],"pdf_url":"https://arxiv.org/pdf/2311.18644v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02479v1","updated":"2024-12-03T14:42:31Z","published":"2024-12-03T14:42:31Z","title":"OODFace: Benchmarking Robustness of Face Recognition under Common\n Corruptions and Appearance Variations","summary":" With the rise of deep learning, facial recognition technology has seen\nextensive research and rapid development. Although facial recognition is\nconsidered a mature technology, we find that existing open-source models and\ncommercial algorithms lack robustness in certain real-world Out-of-Distribution\n(OOD) scenarios, raising concerns about the reliability of these systems. In\nthis paper, we introduce OODFace, which explores the OOD challenges faced by\nfacial recognition models from two perspectives: common corruptions and\nappearance variations. We systematically design 30 OOD scenarios across 9 major\ncategories tailored for facial recognition. By simulating these challenges on\npublic datasets, we establish three robustness benchmarks: LFW-C/V, CFP-FP-C/V,\nand YTF-C/V. 
We then conduct extensive experiments on 19 different facial\nrecognition models and 3 commercial APIs, along with extended experiments on\nface masks, Vision-Language Models (VLMs), and defense strategies to assess\ntheir robustness. Based on the results, we draw several key insights,\nhighlighting the vulnerability of facial recognition systems to OOD data and\nsuggesting possible solutions. Additionally, we offer a unified toolkit that\nincludes all corruption and variation types, easily extendable to other\ndatasets. We hope that our benchmarks and findings can provide guidance for\nfuture improvements in facial recognition model robustness.\n","authors":["Caixin Kang","Yubo Chen","Shouwei Ruan","Shiji Zhao","Ruochen Zhang","Jiayi Wang","Shan Fu","Xingxing Wei"],"pdf_url":"https://arxiv.org/pdf/2412.02479v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02474v1","updated":"2024-12-03T14:36:24Z","published":"2024-12-03T14:36:24Z","title":"F-SE-LSTM: A Time Series Anomaly Detection Method with Frequency Domain\n Information","summary":" With the development of society, time series anomaly detection plays an\nimportant role in network and IoT services. However, most existing anomaly\ndetection methods directly analyze time series in the time domain and cannot\ndistinguish some relatively hidden anomaly sequences. We attempt to analyze the\nimpact of frequency on time series from a frequency domain perspective, thus\nproposing a new time series anomaly detection method called F-SE-LSTM. This\nmethod utilizes two sliding windows and fast Fourier transform (FFT) to\nconstruct a frequency matrix. Simultaneously, Squeeze-and-Excitation Networks\n(SENet) and Long Short-Term Memory (LSTM) are employed to extract\nfrequency-related features within and between periods. 
Through comparative\nexperiments on multiple datasets such as Yahoo Webscope S5 and Numenta Anomaly\nBenchmark, the results demonstrate that the frequency matrix constructed by\nF-SE-LSTM exhibits better discriminative ability than ordinary time domain and\nfrequency domain data. Furthermore, F-SE-LSTM outperforms existing\nstate-of-the-art deep learning anomaly detection methods in terms of anomaly\ndetection capability and execution efficiency.\n","authors":["Yi-Xiang Lu","Xiao-Bo Jin","Jian Chen","Dong-Jie Liu","Guang-Gang Geng"],"pdf_url":"https://arxiv.org/pdf/2412.02474v1.pdf","comment":"14 pages, 7 figures"},{"id":"http://arxiv.org/abs/2410.03523v4","updated":"2024-12-03T14:31:41Z","published":"2024-10-04T15:44:23Z","title":"A Probabilistic Perspective on Unlearning and Alignment for Large\n Language Models","summary":" Comprehensive evaluation of Large Language Models (LLMs) is an open research\nproblem. Existing evaluations rely on deterministic point estimates generated\nvia greedy decoding. However, we find that deterministic evaluations fail to\ncapture the whole output distribution of a model, yielding inaccurate\nestimations of model capabilities. This is particularly problematic in critical\ncontexts such as unlearning and alignment, where precise model evaluations are\ncrucial. To remedy this, we introduce the first formal probabilistic evaluation\nframework in LLMs. Namely, we derive novel metrics with high-probability\nguarantees concerning the output distribution of a model. Our metrics are\napplication-independent and allow practitioners to make more reliable estimates\nabout model capabilities before deployment. Through a case study focused on\nunlearning, we reveal that deterministic evaluations falsely indicate\nsuccessful unlearning, whereas our probabilistic evaluations demonstrate that\nmost if not all of the supposedly unlearned information remains accessible in\nthese models. 
Additionally, we propose a novel unlearning loss based on entropy\noptimization and adaptive temperature scaling, which significantly improves\nunlearning in probabilistic settings on recent benchmarks. Our proposed shift\nfrom point estimates to probabilistic evaluations of output distributions\nrepresents an important step toward comprehensive evaluations of LLMs. Code\navailable at https://github.com/yascho/probabilistic-unlearning.\n","authors":["Yan Scholten","Stephan Günnemann","Leo Schwinn"],"pdf_url":"https://arxiv.org/pdf/2410.03523v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.16300v2","updated":"2024-12-03T14:17:41Z","published":"2024-11-25T11:35:08Z","title":"BayLing 2: A Multilingual Large Language Model with Efficient Language\n Alignment","summary":" Large language models (LLMs), with their powerful generative capabilities and\nvast knowledge, empower various tasks in everyday life. However, these\nabilities are primarily concentrated in high-resource languages, leaving\nlow-resource languages with weaker generative capabilities and relatively\nlimited knowledge. Enhancing the multilingual capabilities of LLMs is therefore\ncrucial for serving over 100 linguistic communities worldwide. An intuitive\napproach to enhance the multilingual capabilities would be to construct\ninstruction data for various languages, but constructing instruction data for\nover 100 languages is prohibitively costly. 
In this paper, we introduce BayLing\n2, which efficiently transfers generative capabilities and knowledge from\nhigh-resource languages to low-resource languages through language alignment.\nTo achieve this, we constructed a dataset of 3.2 million instructions,\ncomprising high-resource language instructions (Chinese and English) and\ncross-lingual instructions for 100+ languages and performed instruction tuning\nbased on the dataset to facilitate the capability transfer between languages.\nUsing Llama as the foundation model, we developed BayLing-2-7B, BayLing-2-13B,\nand BayLing-2-8B, and conducted a comprehensive evaluation of BayLing. For\nmultilingual translation across 100+ languages, BayLing shows superior\nperformance compared to open-source models of similar scale. For multilingual\nknowledge and understanding benchmarks, BayLing achieves significant\nimprovements across over 20 low-resource languages, demonstrating its\ncapability of effective knowledge transfer from high-resource to low-resource\nlanguages. Furthermore, results on English benchmarks indicate that BayLing\nmaintains high performance in highresource languages while enhancing the\nperformance in low-resource languages. Demo, homepage, code and models of\nBayLing are available.\n","authors":["Shaolei Zhang","Kehao Zhang","Qingkai Fang","Shoutao Guo","Yan Zhou","Xiaodong Liu","Yang Feng"],"pdf_url":"https://arxiv.org/pdf/2411.16300v2.pdf","comment":"BayLing 2's online demo: http://nlp.ict.ac.cn/bayling/demo. BayLing\n 2's code and models: https://github.com/ictnlp/BayLing"},{"id":"http://arxiv.org/abs/2310.09401v4","updated":"2024-12-03T14:10:01Z","published":"2023-10-13T20:53:50Z","title":"A Novel Approach to Comprehending Users' Preferences for Accurate\n Personalized News Recommendation","summary":" Personalized news recommendation aims to assist users in finding news\narticles that align with their interests, which plays a pivotal role in\nmitigating users' information overload problem. 
Although many recent works have\nbeen studied for better personalized news recommendation, the following\nchallenges should be explored more: (C1) Comprehending manifold intents coupled\nwithin a news article, (C2) Differentiating varying post-read preferences of\nnews articles, and (C3) Addressing the cold-start user problem. To tackle the\naforementioned challenges together, in this paper, we propose a novel\npersonalized news recommendation framework (CROWN) that employs (1)\ncategory-guided intent disentanglement for (C1), (2) consistency-based news\nrepresentation for (C2), and (3) GNN-enhanced hybrid user representation for\n(C3). Furthermore, we incorporate a category prediction into the training\nprocess of CROWN as an auxiliary task, which provides supplementary supervisory\nsignals to enhance intent disentanglement. Extensive experiments on two\nreal-world datasets reveal that (1) CROWN provides consistent performance\nimprovements over ten state-of-the-art news recommendation methods and (2) the\nproposed strategies significantly improve the accuracy of CROWN.\n","authors":["Yunyong Ko","Seongeun Ryu","Sang-Wook Kim"],"pdf_url":"https://arxiv.org/pdf/2310.09401v4.pdf","comment":"10 pages, 6 figures, 8 tables"},{"id":"http://arxiv.org/abs/2412.02454v1","updated":"2024-12-03T13:43:36Z","published":"2024-12-03T13:43:36Z","title":"Gracefully Filtering Backdoor Samples for Generative Large Language\n Models without Retraining","summary":" Backdoor attacks remain significant security threats to generative large\nlanguage models (LLMs). Since generative LLMs output sequences of\nhigh-dimensional token logits instead of low-dimensional classification logits,\nmost existing backdoor defense methods designed for discriminative models like\nBERT are ineffective for generative LLMs. 
Inspired by the observed differences\nin learning behavior between backdoor and clean mapping in the frequency space,\nwe transform gradients of each training sample, directly influencing parameter\nupdates, into the frequency space. Our findings reveal a distinct separation\nbetween the gradients of backdoor and clean samples in the frequency space.\nBased on this phenomenon, we propose Gradient Clustering in the Frequency Space\nfor Backdoor Sample Filtering (GraCeFul), which leverages sample-wise gradients\nin the frequency space to effectively identify backdoor samples without\nrequiring retraining LLMs. Experimental results show that GraCeFul outperforms\nbaselines significantly. Notably, GraCeFul exhibits remarkable computational\nefficiency, achieving nearly 100% recall and F1 scores in identifying backdoor\nsamples, reducing the average success rate of various backdoor attacks to 0%\nwith negligible drops in clean accuracy across multiple free-style question\nanswering datasets. Additionally, GraCeFul generalizes to Llama-2 and Vicuna.\nThe codes are publicly available at https://github.com/ZrW00/GraceFul.\n","authors":["Zongru Wu","Pengzhou Cheng","Lingyong Fang","Zhuosheng Zhang","Gongshen Liu"],"pdf_url":"https://arxiv.org/pdf/2412.02454v1.pdf","comment":"Accepted at COLING 2025"},{"id":"http://arxiv.org/abs/2412.02449v1","updated":"2024-12-03T13:34:42Z","published":"2024-12-03T13:34:42Z","title":"BYE: Build Your Encoder with One Sequence of Exploration Data for\n Long-Term Dynamic Scene Understanding","summary":" Dynamic scene understanding remains a persistent challenge in robotic\napplications. Early dynamic mapping methods focused on mitigating the negative\ninfluence of short-term dynamic objects on camera motion estimation by masking\nor tracking specific categories, which often fall short in adapting to\nlong-term scene changes. 
Recent efforts address object association in long-term\ndynamic environments using neural networks trained on synthetic datasets, but\nthey still rely on predefined object shapes and categories. Other methods\nincorporate visual, geometric, or semantic heuristics for the association but\noften lack robustness. In this work, we introduce BYE, a class-agnostic,\nper-scene point cloud encoder that removes the need for predefined categories,\nshape priors, or extensive association datasets. Trained on only a single\nsequence of exploration data, BYE can efficiently perform object association in\ndynamically changing scenes. We further propose an ensembling scheme combining\nthe semantic strengths of Vision Language Models (VLMs) with the scene-specific\nexpertise of BYE, achieving a 7% improvement and a 95% success rate in object\nassociation tasks. Code and dataset are available at\nhttps://byencoder.github.io.\n","authors":["Chenguang Huang","Shengchao Yan","Wolfram Burgard"],"pdf_url":"https://arxiv.org/pdf/2412.02449v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02441v1","updated":"2024-12-03T13:25:18Z","published":"2024-12-03T13:25:18Z","title":"Artificial Expert Intelligence through PAC-reasoning","summary":" Artificial Expert Intelligence (AEI) seeks to transcend the limitations of\nboth Artificial General Intelligence (AGI) and narrow AI by integrating\ndomain-specific expertise with critical, precise reasoning capabilities akin to\nthose of top human experts. Existing AI systems often excel at predefined tasks\nbut struggle with adaptability and precision in novel problem-solving. To\novercome this, AEI introduces a framework for ``Probably Approximately Correct\n(PAC) Reasoning\". This paradigm provides robust theoretical guarantees for\nreliably decomposing complex problems, with a practical mechanism for\ncontrolling reasoning precision. 
In reference to the division of human thought\ninto System 1 for intuitive thinking and System 2 for reflective\nreasoning~\\citep{tversky1974judgment}, we refer to this new type of reasoning\nas System 3 for precise reasoning, inspired by the rigor of the scientific\nmethod. AEI thus establishes a foundation for error-bounded, inference-time\nlearning.\n","authors":["Shai Shalev-Shwartz","Amnon Shashua","Gal Beniamini","Yoav Levine","Or Sharir","Noam Wies","Ido Ben-Shaul","Tomer Nussbaum","Shir Granot Peled"],"pdf_url":"https://arxiv.org/pdf/2412.02441v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02427v1","updated":"2024-12-03T12:46:06Z","published":"2024-12-03T12:46:06Z","title":"GerPS-Compare: Comparing NER methods for legal norm analysis","summary":" We apply NER to a particular sub-genre of legal texts in German: the genre of\nlegal norms regulating administrative processes in public service\nadministration. The analysis of such texts involves identifying stretches of\ntext that instantiate one of ten classes identified by public service\nadministration professionals. We investigate and compare three methods for\nperforming Named Entity Recognition (NER) to detect these classes: a Rule-based\nsystem, deep discriminative models, and a deep generative model. Our results\nshow that Deep Discriminative models outperform both the Rule-based system as\nwell as the Deep Generative model, the latter two roughly performing equally\nwell, outperforming each other in different classes. The main cause for this\nsomewhat surprising result is arguably the fact that the classes used in the\nanalysis are semantically and syntactically heterogeneous, in contrast to the\nclasses used in more standard NER tasks. Deep Discriminative models appear to\nbe better equipped for dealing with this heterogenerity than both generic LLMs\nand human linguists designing rule-based NER systems.\n","authors":["Sarah T. 
Bachinger","Christoph Unger","Robin Erd","Leila Feddoul","Clara Lachenmaier","Sina Zarrieß","Birgitta König-Ries"],"pdf_url":"https://arxiv.org/pdf/2412.02427v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.16200v2","updated":"2024-12-03T12:39:11Z","published":"2024-08-29T01:42:38Z","title":"PolarBEVDet: Exploring Polar Representation for Multi-View 3D Object\n Detection in Bird's-Eye-View","summary":" Recently, LSS-based multi-view 3D object detection provides an economical and\ndeployment-friendly solution for autonomous driving. However, all the existing\nLSS-based methods transform multi-view image features into a Cartesian\nBird's-Eye-View(BEV) representation, which does not take into account the\nnon-uniform image information distribution and hardly exploits the view\nsymmetry. In this paper, in order to adapt the image information distribution\nand preserve the view symmetry by regular convolution, we propose to employ the\npolar BEV representation to substitute the Cartesian BEV representation. To\nachieve this, we elaborately tailor three modules: a polar view transformer to\ngenerate the polar BEV representation, a polar temporal fusion module for\nfusing historical polar BEV features and a polar detection head to predict the\npolar-parameterized representation of the object. In addition, we design a 2D\nauxiliary detection head and a spatial attention enhancement module to improve\nthe quality of feature extraction in perspective view and BEV, respectively.\nFinally, we integrate the above improvements into a novel multi-view 3D object\ndetector, PolarBEVDet. Experiments on nuScenes show that PolarBEVDet achieves\nthe superior performance. The code is available at\nhttps://github.com/Yzichen/PolarBEVDet.git.\n","authors":["Zichen Yu","Quanli Liu","Wei Wang","Liyong Zhang","Xiaoguang Zhao"],"pdf_url":"https://arxiv.org/pdf/2408.16200v2.pdf","comment":"11 pages, 6 figures. 
This work has been submitted to the IEEE for\n possible publication"},{"id":"http://arxiv.org/abs/2412.02415v1","updated":"2024-12-03T12:20:56Z","published":"2024-12-03T12:20:56Z","title":"Knowledge-Enhanced Conversational Recommendation via Transformer-based\n Sequential Modelling","summary":" In conversational recommender systems (CRSs), conversations usually involve a\nset of items and item-related entities or attributes, e.g., director is a\nrelated entity of a movie. These items and item-related entities are often\nmentioned along the development of a dialog, leading to potential sequential\ndependencies among them. However, most of existing CRSs neglect these potential\nsequential dependencies. In this article, we first propose a Transformer-based\nsequential conversational recommendation method, named TSCR, to model the\nsequential dependencies in the conversations to improve CRS. In TSCR, we\nrepresent conversations by items and the item-related entities, and construct\nuser sequences to discover user preferences by considering both the mentioned\nitems and item-related entities. Based on the constructed sequences, we deploy\na Cloze task to predict the recommended items along a sequence. Meanwhile, in\ncertain domains, knowledge graphs formed by the items and their related\nentities are readily available, which provide various different kinds of\nassociations among them. Given that TSCR does not benefit from such knowledge\ngraphs, we then propose a knowledge graph enhanced version of TSCR, called\nTSCRKG. In specific, we leverage the knowledge graph to offline initialize our\nmodel TSCRKG, and augment the user sequence of conversations (i.e., sequence of\nthe mentioned items and item-related entities in the conversation) with\nmulti-hop paths in the knowledge graph. 
Experimental results demonstrate that\nour TSCR model significantly outperforms state-of-the-art baselines, and the\nenhanced version TSCRKG further improves recommendation performance on top of\nTSCR.\n","authors":["Jie Zou","Aixin Sun","Cheng Long","Evangelos Kanoulas"],"pdf_url":"https://arxiv.org/pdf/2412.02415v1.pdf","comment":"Accepted by ACM TOIS"},{"id":"http://arxiv.org/abs/2412.02412v1","updated":"2024-12-03T12:12:03Z","published":"2024-12-03T12:12:03Z","title":"VISTA: A Panoramic View of Neural Representations","summary":" We present VISTA (Visualization of Internal States and Their Associations), a\nnovel pipeline for visually exploring and interpreting neural network\nrepresentations. VISTA addresses the challenge of analyzing vast\nmultidimensional spaces in modern machine learning models by mapping\nrepresentations into a semantic 2D space. The resulting collages visually\nreveal patterns and relationships within internal representations. We\ndemonstrate VISTA's utility by applying it to sparse autoencoder latents\nuncovering new properties and interpretations. We review the VISTA methodology,\npresent findings from our case study ( https://got.drib.net/latents/ ), and\ndiscuss implications for neural network interpretability across various domains\nof machine learning.\n","authors":["Tom White"],"pdf_url":"https://arxiv.org/pdf/2412.02412v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02410v1","updated":"2024-12-03T12:05:56Z","published":"2024-12-03T12:05:56Z","title":"A Multi-Agent Framework for Extensible Structured Text Generation in\n PLCs","summary":" Programmable Logic Controllers (PLCs) are microcomputers essential for\nautomating factory operations. Structured Text (ST), a high-level language\nadhering to the IEC 61131-3 standard, is pivotal for PLCs due to its ability to\nexpress logic succinctly and to seamlessly integrate with other languages\nwithin the same standard. 
However, vendors develop their own customized\nversions of ST, and the lack of comprehensive and standardized documentation\nfor the full semantics of ST has contributed to inconsistencies in how the\nlanguage is implemented. Consequently, the steep learning curve associated with\nST, combined with ever-evolving industrial requirements, presents significant\nchallenges for developers. In response to these issues, we present AutoPLC, an\nLLM-based approach designed to automate the generation of vendor-specific ST\ncode. To facilitate effective code generation, we first built a comprehensive\nknowledge base, including Rq2ST Case Library (requirements and corresponding\nimplementations) and Instruction libraries. Then we developed a retrieval\nmodule to incorporate the domain-specific knowledge by identifying pertinent\ncases and instructions, guiding the LLM to generate code that meets the\nrequirements. In order to verify and improve the quality of the generated code,\nwe designed an adaptable code checker. If errors are detected, we initiate an\niterative self-improvement process to instruct the LLM to revise the generated\ncode. We evaluate AutoPLC's performance against seven state-of-the-art\nbaselines using three benchmarks, one for open-source basic ST and two for\ncommercial Structured Control Language (SCL) from Siemens. The results show\nthat our approach consistently achieves superior performance across all\nbenchmarks. Ablation study emphasizes the significance of our modules. 
Further\nmanual analysis confirm the practical utility of the ST code generated by\nAutoPLC.\n","authors":["Donghao Yang","Aolang Wu","Tianyi Zhang","Li Zhang","Fang Liu","Xiaoli Lian","Yuming Ren","Jiaji Tian"],"pdf_url":"https://arxiv.org/pdf/2412.02410v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02399v1","updated":"2024-12-03T11:49:01Z","published":"2024-12-03T11:49:01Z","title":"OMENN: One Matrix to Explain Neural Networks","summary":" Deep Learning (DL) models are often black boxes, making their decision-making\nprocesses difficult to interpret. This lack of transparency has driven\nadvancements in eXplainable Artificial Intelligence (XAI), a field dedicated to\nclarifying the reasoning behind DL model predictions. Among these,\nattribution-based methods such as LRP and GradCAM are widely used, though they\nrely on approximations that can be imprecise.\n To address these limitations, we introduce One Matrix to Explain Neural\nNetworks (OMENN), a novel post-hoc method that represents a neural network as a\nsingle, interpretable matrix for each specific input. This matrix is\nconstructed through a series of linear transformations that represent the\nprocessing of the input by each successive layer in the neural network. As a\nresult, OMENN provides locally precise, attribution-based explanations of the\ninput across various modern models, including ViTs and CNNs. 
We present a\ntheoretical analysis of OMENN based on dynamic linearity property and validate\nits effectiveness with extensive tests on two XAI benchmarks, demonstrating\nthat OMENN is competitive with state-of-the-art methods.\n","authors":["Adam Wróbel","Mikołaj Janusz","Bartosz Zieliński","Dawid Rymarczyk"],"pdf_url":"https://arxiv.org/pdf/2412.02399v1.pdf","comment":"Under review, code will be released after acceptance"},{"id":"http://arxiv.org/abs/2408.08488v2","updated":"2024-12-03T11:06:03Z","published":"2024-08-16T02:17:21Z","title":"PITN: Physics-Informed Temporal Networks for Cuffless Blood Pressure\n Estimation","summary":" Monitoring blood pressure with non-invasive sensors has gained popularity for\nproviding comfortable user experiences, one of which is a significant function\nof smart wearables. Although providing a comfortable user experience, such\nmethods are suffering from the demand for a significant amount of realistic\ndata to train an individual model for each subject, especially considering the\ninvasive or obtrusive BP ground-truth measurements. To tackle this challenge,\nwe introduce a novel physics-informed temporal network~(PITN) with adversarial\ncontrastive learning to enable precise BP estimation with very limited data.\nSpecifically, we first enhance the physics-informed neural network~(PINN) with\nthe temporal block for investigating BP dynamics' multi-periodicity for\npersonal cardiovascular cycle modeling and temporal variation. We then employ\nadversarial training to generate extra physiological time series data,\nimproving PITN's robustness in the face of sparse subject-specific training\ndata. Furthermore, we utilize contrastive learning to capture the\ndiscriminative variations of cardiovascular physiologic phenomena. This\napproach aggregates physiological signals with similar blood pressure values in\nlatent space while separating clusters of samples with dissimilar blood\npressure values. 
Experiments on three widely-adopted datasets with different\nmodailties (\\emph{i.e.,} bioimpedance, PPG, millimeter-wave) demonstrate the\nsuperiority and effectiveness of the proposed methods over previous\nstate-of-the-art approaches. The code is available\nat~\\url{https://github.com/Zest86/ACL-PITN}.\n","authors":["Rui Wang","Mengshi Qi","Yingxia Shao","Anfu Zhou","Huadong Ma"],"pdf_url":"https://arxiv.org/pdf/2408.08488v2.pdf","comment":"12 pages, 6 figures"},{"id":"http://arxiv.org/abs/2306.03048v3","updated":"2024-12-03T11:05:48Z","published":"2023-06-05T17:21:05Z","title":"From Robustness to Explainability and Back Again","summary":" Formal explainability guarantees the rigor of computed explanations, and so\nit is paramount in domains where rigor is critical, including those deemed\nhigh-risk. Unfortunately, since its inception formal explainability has been\nhampered by poor scalability. At present, this limitation still holds true for\nsome families of classifiers, the most significant being deep neural networks.\nThis paper addresses the poor scalability of formal explainability and proposes\nnovel efficient algorithms for computing formal explanations. The novel\nalgorithm computes explanations by answering instead a number of robustness\nqueries, and such that the number of such queries is at most linear on the\nnumber of features. Consequently, the proposed algorithm establishes a direct\nrelationship between the practical complexity of formal explainability and that\nof robustness. To achieve the proposed goals, the paper generalizes the\ndefinition of formal explanations, thereby allowing the use of robustness tools\nthat are based on different distance norms, and also by reasoning in terms of\nsome target degree of robustness. 
Preliminary experiments validate the\npractical efficiency of the proposed approach.\n","authors":["Xuanxiang Huang","Joao Marques-Silva"],"pdf_url":"https://arxiv.org/pdf/2306.03048v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02372v1","updated":"2024-12-03T10:58:34Z","published":"2024-12-03T10:58:34Z","title":"HERO: Hint-Based Efficient and Reliable Query Optimizer","summary":" We propose a novel model for learned query optimization which provides query\nhints leading to better execution plans. The model addresses the three key\nchallenges in learned hint-based query optimization: reliable hint\nrecommendation (ensuring non-degradation of query latency), efficient hint\nexploration, and fast inference. We provide an in-depth analysis of existing\nNN-based approaches to hint-based optimization and experimentally confirm the\nnamed challenges for them. Our alternative solution consists of a new inference\nschema based on an ensemble of context-aware models and a graph storage for\nreliable hint suggestion and fast inference, and a budget-controlled training\nprocedure with a local search algorithm that solves the issue of exponential\nsearch space exploration. In experiments on standard benchmarks, our model\ndemonstrates optimization capability close to the best achievable with\ncoarse-grained hints. Controlling the degree of parallelism (query dop) in\naddition to operator-related hints enables our model to achieve 3x latency\nimprovement on JOB benchmark which sets a new standard for optimization. 
Our\nmodel is interpretable and easy to debug, which is particularly important for\ndeployment in production.\n","authors":["Sergey Zinchenko","Sergey Iazov"],"pdf_url":"https://arxiv.org/pdf/2412.02372v1.pdf","comment":"Submitted to VLDB 2025; 13 pages; 13 figures"},{"id":"http://arxiv.org/abs/2412.02368v1","updated":"2024-12-03T10:52:06Z","published":"2024-12-03T10:52:06Z","title":"ScImage: How Good Are Multimodal Large Language Models at Scientific\n Text-to-Image Generation?","summary":" Multimodal large language models (LLMs) have demonstrated impressive\ncapabilities in generating high-quality images from textual instructions.\nHowever, their performance in generating scientific images--a critical\napplication for accelerating scientific progress--remains underexplored. In\nthis work, we address this gap by introducing ScImage, a benchmark designed to\nevaluate the multimodal capabilities of LLMs in generating scientific images\nfrom textual descriptions. ScImage assesses three key dimensions of\nunderstanding: spatial, numeric, and attribute comprehension, as well as their\ncombinations, focusing on the relationships between scientific objects (e.g.,\nsquares, circles). We evaluate five models, GPT-4o, Llama, AutomaTikZ, Dall-E,\nand StableDiffusion, using two modes of output generation: code-based outputs\n(Python, TikZ) and direct raster image generation. Additionally, we examine\nfour different input languages: English, German, Farsi, and Chinese. 
Our\nevaluation, conducted with 11 scientists across three criteria (correctness,\nrelevance, and scientific accuracy), reveals that while GPT-4o produces outputs\nof decent quality for simpler prompts involving individual dimensions such as\nspatial, numeric, or attribute understanding in isolation, all models face\nchallenges in this task, especially for more complex prompts.\n","authors":["Leixin Zhang","Steffen Eger","Yinjie Cheng","Weihe Zhai","Jonas Belouadi","Christoph Leiter","Simone Paolo Ponzetto","Fahimeh Moafian","Zhixue Zhao"],"pdf_url":"https://arxiv.org/pdf/2412.02368v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02357v1","updated":"2024-12-03T10:27:04Z","published":"2024-12-03T10:27:04Z","title":"Dynamic Prompt Middleware: Contextual Prompt Refinement Controls for\n Comprehension Tasks","summary":" Effective prompting of generative AI is challenging for many users,\nparticularly in expressing context for comprehension tasks such as explaining\nspreadsheet formulas, Python code, and text passages. Prompt middleware aims to\naddress this barrier by assisting in prompt construction, but barriers remain\nfor users in expressing adequate control so that they can receive AI-responses\nthat match their preferences.\n We conduct a formative survey (n=38) investigating user needs for control\nover AI-generated explanations in comprehension tasks, which uncovers a\ntrade-off between standardized but predictable support for prompting, and\nadaptive but unpredictable support tailored to the user and task. To explore\nthis trade-off, we implement two prompt middleware approaches: Dynamic Prompt\nRefinement Control (Dynamic PRC) and Static Prompt Refinement Control (Static\nPRC). 
The Dynamic PRC approach generates context-specific UI elements that\nprovide prompt refinements based on the user's prompt and user needs from the\nAI, while the Static PRC approach offers a preset list of generally applicable\nrefinements.\n We evaluate these two approaches with a controlled user study (n=16) to\nassess the impact of these approaches on user control of AI responses for\ncrafting better explanations. Results show a preference for the Dynamic PRC\napproach as it afforded more control, lowered barriers to providing context,\nand encouraged exploration and reflection of the tasks, but that reasoning\nabout the effects of different generated controls on the final output remains\nchallenging. Drawing on participant feedback, we discuss design implications\nfor future Dynamic PRC systems that enhance user control of AI responses. Our\nfindings suggest that dynamic prompt middleware can improve the user experience\nof generative AI workflows by affording greater control and guide users to a\nbetter AI response.\n","authors":["Ian Drosos","Jack Williams","Advait Sarkar","Nicholas Wilson"],"pdf_url":"https://arxiv.org/pdf/2412.02357v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.16082v6","updated":"2024-12-03T10:18:20Z","published":"2023-07-29T21:37:55Z","title":"EnrichEvent: Enriching Social Data with Contextual Information for\n Emerging Event Extraction","summary":" Social platforms have emerged as crucial platforms for disseminating\ninformation and discussing real-life social events, offering researchers an\nexcellent opportunity to design and implement novel event detection frameworks.\nHowever, most existing approaches only exploit keyword burstiness or network\nstructures to detect unspecified events. 
Thus, they often need help identifying\nunknown events regarding the challenging nature of events and social data.\nSocial data, e.g., tweets, is characterized by misspellings, incompleteness,\nword sense ambiguation, irregular language, and variation in aspects of\nopinions. Moreover, extracting discriminative features and patterns for\nevolving events by exploiting the limited structural knowledge is almost\ninfeasible. To address these challenges, in this paper, we propose a novel\nframework, namely EnrichEvent, that leverages the linguistic and contextual\nrepresentations of streaming social data. In particular, we leverage contextual\nand linguistic knowledge to detect semantically related tweets and enhance the\neffectiveness of the event detection approaches. Eventually, our proposed\nframework produces cluster chains for each event to show the evolving variation\nof the event through time. We conducted extensive experiments to evaluate our\nframework, validating its high performance and effectiveness in detecting and\ndistinguishing unspecified social events.\n","authors":["Mohammadali Sefidi Esfahani","Mohammad Akbari"],"pdf_url":"https://arxiv.org/pdf/2307.16082v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02334v1","updated":"2024-12-03T09:53:32Z","published":"2024-12-03T09:53:32Z","title":"Reinforcement learning to learn quantum states for Heisenberg scaling\n accuracy","summary":" Learning quantum states is a crucial task for realizing the potential of\nquantum information technology. Recently, neural approaches have emerged as\npromising methods for learning quantum states. We propose a meta-learning model\nthat employs reinforcement learning (RL) to optimize the process of learning\nquantum states. For learning quantum states, our scheme trains a Hardware\nefficient ansatz with a blackbox optimization algorithm, called evolution\nstrategy (ES). To enhance the efficiency of ES, a RL agent dynamically adjusts\nthe hyperparameters of ES. 
To facilitate the RL training, we introduce an\naction repetition strategy inspired by curriculum learning. The RL agent\nsignificantly improves the sample efficiency of learning random quantum states,\nand achieves infidelity scaling close to the Heisenberg limit. We showcase that\nthe RL agent trained using 3-qubit states can be generalized to learning up to\n5-qubit states. These results highlight the utility of RL-driven meta-learning\nto enhance the efficiency and generalizability of learning quantum states. Our\napproach can be applicable to improve quantum control, quantum optimization,\nand quantum machine learning.\n","authors":["Jeongwoo Jae","Jeonghoon Hong","Jinho Choo","Yeong-Dae Kwon"],"pdf_url":"https://arxiv.org/pdf/2412.02334v1.pdf","comment":"14 pages, 6 figures"},{"id":"http://arxiv.org/abs/2406.04710v2","updated":"2024-12-03T09:51:23Z","published":"2024-06-07T07:38:33Z","title":"Morescient GAI for Software Engineering (Extended Version)","summary":" The ability of Generative AI (GAI) technology to automatically check,\nsynthesize and modify software engineering artifacts promises to revolutionize\nall aspects of software engineering. Using GAI for software engineering tasks\nis consequently one of the most rapidly expanding fields of software\nengineering research, with over a hundred LLM-based code models having been\npublished since 2021. However, the overwhelming majority of existing code\nmodels share a major weakness - they are exclusively trained on the syntactic\nfacet of software, significantly lowering their trustworthiness in tasks\ndependent on software semantics. To address this problem, a new class of\n\"Morescient\" GAI is needed that is \"aware\" of (i.e., trained on) both the\nsemantic and static facets of software. This, in turn, will require a new\ngeneration of software observation platforms capable of generating large\nquantities of execution observations in a structured and readily analyzable\nway. 
In this paper, we present a vision and roadmap for how such \"Morescient\"\nGAI models can be engineered, evolved and disseminated according to the\nprinciples of open science.\n","authors":["Marcus Kessel","Colin Atkinson"],"pdf_url":"https://arxiv.org/pdf/2406.04710v2.pdf","comment":"To appear in ACM Transactions on Software Engineering and\n Methodology, Special Issue \"2030 Roadmap Software Engineering\""},{"id":"http://arxiv.org/abs/2406.06644v3","updated":"2024-12-03T09:49:07Z","published":"2024-06-09T23:39:31Z","title":"Latent Diffusion Model-Enabled Low-Latency Semantic Communication in the\n Presence of Semantic Ambiguities and Wireless Channel Noises","summary":" Deep learning (DL)-based Semantic Communications (SemCom) is becoming\ncritical to maximize overall efficiency of communication networks.\nNevertheless, SemCom is sensitive to wireless channel uncertainties, source\noutliers, and suffer from poor generalization bottlenecks. To address the\nmentioned challenges, this paper develops a latent diffusion model-enabled\nSemCom system with three key contributions, i.e., i) to handle potential\noutliers in the source data, semantic errors obtained by projected gradient\ndescent based on the vulnerabilities of DL models, are utilized to update the\nparameters and obtain an outlier-robust encoder, ii) a lightweight single-layer\nlatent space transformation adapter completes one-shot learning at the\ntransmitter and is placed before the decoder at the receiver, enabling\nadaptation for out-of-distribution data and enhancing human-perceptual quality,\nand iii) an end-to-end consistency distillation (EECD) strategy is used to\ndistill the diffusion models trained in latent space, enabling deterministic\nsingle or few-step low-latency denoising in various noisy channels while\nmaintaining high semantic quality. 
Extensive numerical experiments across\ndifferent datasets demonstrate the superiority of the proposed SemCom system,\nconsistently proving its robustness to outliers, the capability to transmit\ndata with unknown distributions, and the ability to perform real-time channel\ndenoising tasks while preserving high human perceptual quality, outperforming\nthe existing denoising approaches in semantic metrics like learned perceptual\nimage path similarity (LPIPS).\n","authors":["Jianhua Pei","Cheng Feng","Ping Wang","Hina Tabassum","Dongyuan Shi"],"pdf_url":"https://arxiv.org/pdf/2406.06644v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02331v1","updated":"2024-12-03T09:48:28Z","published":"2024-12-03T09:48:28Z","title":"Sample Efficient Robot Learning in Supervised Effect Prediction Tasks","summary":" In self-supervised robot learning, robots actively explore their environments\nand generate data by acting on entities in the environment. Therefore, an\nexploration policy is desired that ensures sample efficiency to minimize robot\nexecution costs while still providing accurate learning. For this purpose, the\nrobotic community has adopted Intrinsic Motivation (IM)-based approaches such\nas Learning Progress (LP). On the machine learning front, Active Learning (AL)\nhas been used successfully, especially for classification tasks. In this work,\nwe develop a novel AL framework geared towards robotics regression tasks, such\nas action-effect prediction and, more generally, for world model learning,\nwhich we call MUSEL - Model Uncertainty for Sample Efficient Learning. MUSEL\naims to extract model uncertainty from the total uncertainty estimate given by\na suitable learning engine by making use of earning progress and input\ndiversity and use it to improve sample efficiency beyond the state-of-the-art\naction-effect prediction methods. 
We demonstrate the feasibility of our model\nby using a Stochastic Variational Gaussian Process (SVGP) as the learning\nengine and testing the system on a set of robotic experiments in simulation.\nThe efficacy of MUSEL is demonstrated by comparing its performance to standard\nmethods used in robot action-effect learning. In a robotic tabletop environment\nin which a robot manipulator is tasked with learning the effect of its actions,\nthe experiments show that MUSEL facilitates higher accuracy in learning action\neffects while ensuring sample efficiency.\n","authors":["Mehmet Arda Eren","Erhan Oztop"],"pdf_url":"https://arxiv.org/pdf/2412.02331v1.pdf","comment":"18 pages, 18 figures"},{"id":"http://arxiv.org/abs/2405.11284v2","updated":"2024-12-03T09:41:22Z","published":"2024-05-18T13:09:33Z","title":"The Logic of Counterfactuals and the Epistemology of Causal Inference","summary":" The 2021 Nobel Prize in Economics recognizes a type of causal model known as\nthe Rubin causal model, or potential outcome framework, which deserves far more\nattention from philosophers than it currently receives. To spark philosophers'\ninterest, I develop a dialectic connecting the Rubin causal model to the\nLewis-Stalnaker debate on a logical principle of counterfactuals: Conditional\nExcluded Middle (CEM). I begin by playing good cop for CEM, developing a new\nargument in its favor -- a Quine-Putnam-style indispensability argument. This\nargument is based on the observation that CEM seems to be indispensable to the\nRubin causal model, which underpins our best scientific theory of causal\ninference in health and social sciences -- a Nobel Prize-winning theory.\nIndeed, CEM has long remained a core assumption of the Rubin causal model,\ndespite challenges from within the statistics and economics communities over\ntwenty years ago. 
I then switch sides to play bad cop for CEM, undermining the\nindispensability argument by developing a new theory of causal inference that\ndispenses with CEM while preserving the successes of the original theory\n(thanks to a new theorem proved here). The key, somewhat surprisingly, is to\nintegrate two approaches to causal modeling: the Rubin causal model, more\nfamiliar in health and social sciences, and the causal Bayes net, more familiar\nin philosophy. The good cop/bad cop dialectic is concluded with a connection to\nbroader philosophical issues, including intertheory relations, the revisability\nof logic, and the role of background assumptions in justifying scientific\ninference.\n","authors":["Hanti Lin"],"pdf_url":"https://arxiv.org/pdf/2405.11284v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02327v1","updated":"2024-12-03T09:40:59Z","published":"2024-12-03T09:40:59Z","title":"Switchable deep beamformer for high-quality and real-time passive\n acoustic mapping","summary":" Passive acoustic mapping (PAM) is a promising tool for monitoring acoustic\ncavitation activities in the applications of ultrasound therapy. Data-adaptive\nbeamformers for PAM have better image quality compared to the time exposure\nacoustics (TEA) algorithms. However, the computational cost of data-adaptive\nbeamformers is considerably expensive. In this work, we develop a deep\nbeamformer based on a generative adversarial network, which can switch between\ndifferent transducer arrays and reconstruct high-quality PAM images directly\nfrom radio frequency ultrasound signals with low computational cost. The deep\nbeamformer was trained on the dataset consisting of simulated and experimental\ncavitation signals of single and multiple microbubble clouds measured by\ndifferent (linear and phased) arrays covering 1-15 MHz. We compared the\nperformance of the deep beamformer to TEA and three different data-adaptive\nbeamformers using the simulated and experimental test dataset. 
Compared with\nTEA, the deep beamformer reduced the energy spread area by 18.9%-65.0% and\nimproved the image signal-to-noise ratio by 9.3-22.9 dB in average for the\ndifferent arrays in our data. Compared to the data-adaptive beamformers, the\ndeep beamformer reduced the computational cost by three orders of magnitude\nachieving 10.5 ms image reconstruction speed in our data, while the image\nquality was as good as that of the data-adaptive beamformers. These results\ndemonstrated the potential of the deep beamformer for high-resolution\nmonitoring of microbubble cavitation activities for ultrasound therapy.\n","authors":["Yi Zeng","Jinwei Li","Hui Zhu","Shukuan Lu","Jianfeng Li","Xiran Cai"],"pdf_url":"https://arxiv.org/pdf/2412.02327v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.08802v3","updated":"2024-12-03T09:39:57Z","published":"2024-02-05T14:20:19Z","title":"Governance of Generative Artificial Intelligence for Companies","summary":" Generative Artificial Intelligence (GenAI), specifically large language\nmodels like ChatGPT, has swiftly entered organizations without adequate\ngovernance, posing both opportunities and risks. Despite extensive debates on\nGenAI's transformative nature and regulatory measures, limited research\naddresses organizational governance, encompassing technical and business\nperspectives. Although numerous frameworks for governance of AI exist, it is\nnot clear to what extent they apply to GenAI. Our review paper fills this gap\nby surveying recent works with the purpose of better understanding fundamental\ncharacteristics of GenAI and adjusting prior frameworks specifically towards\nGenAI governance within companies. To do so, it extends Nickerson's framework\ndevelopment processes to include prior conceptualizations. Our framework\noutlines the scope, objectives, and governance mechanisms tailored to harness\nbusiness opportunities as well as mitigate risks associated with GenAI\nintegration. 
Our research contributes a focused approach to GenAI governance,\noffering practical insights for companies navigating the challenges of GenAI\nadoption and highlighting research gaps.\n","authors":["Johannes Schneider","Pauline Kuss","Rene Abraham","Christian Meske"],"pdf_url":"https://arxiv.org/pdf/2403.08802v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.18506v2","updated":"2024-12-03T09:25:11Z","published":"2024-11-27T16:48:24Z","title":"LLM-ABBA: Understanding time series via symbolic approximation","summary":" The success of large language models (LLMs) for time series has been\ndemonstrated in previous work. Utilizing a symbolic time series representation,\none can efficiently bridge the gap between LLMs and time series. However, the\nremaining challenge is to exploit the semantic information hidden in time\nseries by using symbols or existing tokens of LLMs, while aligning the\nembedding space of LLMs according to the hidden information of time series. The\nsymbolic time series approximation (STSA) method called adaptive Brownian\nbridge-based symbolic aggregation (ABBA) shows outstanding efficacy in\npreserving salient time series features by modeling time series patterns in\nterms of amplitude and period while using existing tokens of LLMs.\n In this paper, we introduce a method, called LLM-ABBA, that integrates ABBA\ninto large language models for various downstream time series tasks. By\nsymbolizing time series, LLM-ABBA compares favorably to the recent\nstate-of-the-art (SOTA) in UCR and three medical time series classification\ntasks. Meanwhile, a fixed-polygonal chain trick in ABBA is introduced to\n\\kc{avoid obvious drifting} during prediction tasks by significantly mitigating\nthe effects of cumulative error arising from misused symbols during the\ntransition from symbols to numerical values. In time series regression tasks,\nLLM-ABBA achieves the new SOTA on Time Series Extrinsic Regression (TSER)\nbenchmarks. 
LLM-ABBA also shows competitive prediction capability compared to\nrecent SOTA time series prediction results. We believe this framework can also\nseamlessly extend to other time series tasks.\n","authors":["Erin Carson","Xinye Chen","Cheng Kang"],"pdf_url":"https://arxiv.org/pdf/2411.18506v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02302v1","updated":"2024-12-03T09:16:13Z","published":"2024-12-03T09:16:13Z","title":"Enhanced Photovoltaic Power Forecasting: An iTransformer and LSTM-Based\n Model Integrating Temporal and Covariate Interactions","summary":" Accurate photovoltaic (PV) power forecasting is critical for integrating\nrenewable energy sources into the grid, optimizing real-time energy management,\nand ensuring energy reliability amidst increasing demand. However, existing\nmodels often struggle with effectively capturing the complex relationships\nbetween target variables and covariates, as well as the interactions between\ntemporal dynamics and multivariate data, leading to suboptimal forecasting\naccuracy. To address these challenges, we propose a novel model architecture\nthat leverages the iTransformer for feature extraction from target variables\nand employs long short-term memory (LSTM) to extract features from covariates.\nA cross-attention mechanism is integrated to fuse the outputs of both models,\nfollowed by a Kolmogorov-Arnold network (KAN) mapping for enhanced\nrepresentation. The effectiveness of the proposed model is validated using\npublicly available datasets from Australia, with experiments conducted across\nfour seasons. 
Results demonstrate that the proposed model effectively capture\nseasonal variations in PV power generation and improve forecasting accuracy.\n","authors":["Guang Wu","Yun Wang","Qian Zhou","Ziyang Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.02302v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02301v1","updated":"2024-12-03T09:13:52Z","published":"2024-12-03T09:13:52Z","title":"Large Multimodal Agents for Accurate Phishing Detection with Enhanced\n Token Optimization and Cost Reduction","summary":" With the rise of sophisticated phishing attacks, there is a growing need for\neffective and economical detection solutions. This paper explores the use of\nlarge multimodal agents, specifically Gemini 1.5 Flash and GPT-4o mini, to\nanalyze both URLs and webpage screenshots via APIs, thus avoiding the\ncomplexities of training and maintaining AI systems. Our findings indicate that\nintegrating these two data types substantially enhances detection performance\nover using either type alone. However, API usage incurs costs per query that\ndepend on the number of input and output tokens. To address this, we propose a\ntwo-tiered agentic approach: initially, one agent assesses the URL, and if\ninconclusive, a second agent evaluates both the URL and the screenshot. This\nmethod not only maintains robust detection performance but also significantly\nreduces API costs by minimizing unnecessary multi-input queries. Cost analysis\nshows that with the agentic approach, GPT-4o mini can process about 4.2 times\nas many websites per $100 compared to the multimodal approach (107,440 vs.\n25,626), and Gemini 1.5 Flash can process about 2.6 times more websites\n(2,232,142 vs. 862,068). 
These findings underscore the significant economic\nbenefits of the agentic approach over the multimodal method, providing a viable\nsolution for organizations aiming to leverage advanced AI for phishing\ndetection while controlling expenses.\n","authors":["Fouad Trad","Ali Chehab"],"pdf_url":"https://arxiv.org/pdf/2412.02301v1.pdf","comment":"Accepted in the 2nd International Conference on Foundation and Large\n Language Models (FLLM2024)"},{"id":"http://arxiv.org/abs/2412.02295v1","updated":"2024-12-03T09:09:52Z","published":"2024-12-03T09:09:52Z","title":"CADMR: Cross-Attention and Disentangled Learning for Multimodal\n Recommender Systems","summary":" The increasing availability and diversity of multimodal data in recommender\nsystems offer new avenues for enhancing recommendation accuracy and user\nsatisfaction. However, these systems must contend with high-dimensional, sparse\nuser-item rating matrices, where reconstructing the matrix with only small\nsubsets of preferred items for each user poses a significant challenge. To\naddress this, we propose CADMR, a novel autoencoder-based multimodal\nrecommender system framework. CADMR leverages multi-head cross-attention\nmechanisms and Disentangled Learning to effectively integrate and utilize\nheterogeneous multimodal data in reconstructing the rating matrix. Our approach\nfirst disentangles modality-specific features while preserving their\ninterdependence, thereby learning a joint latent representation. The multi-head\ncross-attention mechanism is then applied to enhance user-item interaction\nrepresentations with respect to the learned multimodal item latent\nrepresentations. 
We evaluate CADMR on three benchmark datasets, demonstrating\nsignificant performance improvements over state-of-the-art methods.\n","authors":["Yasser Khalafaoui","Martino Lovisetto","Basarab Matei","Nistor Grozavu"],"pdf_url":"https://arxiv.org/pdf/2412.02295v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02294v1","updated":"2024-12-03T09:08:38Z","published":"2024-12-03T09:08:38Z","title":"Initial Study On Improving Segmentation By Combining Preoperative CT And\n Intraoperative CBCT Using Synthetic Data","summary":" Computer-Assisted Interventions enable clinicians to perform precise,\nminimally invasive procedures, often relying on advanced imaging methods.\nCone-beam computed tomography (CBCT) can be used to facilitate\ncomputer-assisted interventions, despite often suffering from artifacts that\npose challenges for accurate interpretation. While the degraded image quality\ncan affect image analysis, the availability of high quality, preoperative scans\noffers potential for improvements. Here we consider a setting where\npreoperative CT and intraoperative CBCT scans are available, however, the\nalignment (registration) between the scans is imperfect to simulate a real\nworld scenario. We propose a multimodal learning method that fuses roughly\naligned CBCT and CT scans and investigate the effect on segmentation\nperformance. For this experiment we use synthetically generated data containing\nreal CT and synthetic CBCT volumes with corresponding voxel annotations. We\nshow that this fusion setup improves segmentation performance in $18$ out of\n$20$ investigated setups.\n","authors":["Maximilian E. Tschuchnig","Philipp Steininger","Michael Gadermayr"],"pdf_url":"https://arxiv.org/pdf/2412.02294v1.pdf","comment":"Accepted at BVM 2025. 
arXiv admin note: text overlap with\n arXiv:2406.11650"},{"id":"http://arxiv.org/abs/2412.02292v1","updated":"2024-12-03T09:08:27Z","published":"2024-12-03T09:08:27Z","title":"Deep Matrix Factorization with Adaptive Weights for Multi-View\n Clustering","summary":" Recently, deep matrix factorization has been established as a powerful model\nfor unsupervised tasks, achieving promising results, especially for multi-view\nclustering. However, existing methods often lack effective feature selection\nmechanisms and rely on empirical hyperparameter selection. To address these\nissues, we introduce a novel Deep Matrix Factorization with Adaptive Weights\nfor Multi-View Clustering (DMFAW). Our method simultaneously incorporates\nfeature selection and generates local partitions, enhancing clustering results.\nNotably, the features weights are controlled and adjusted by a parameter that\nis dynamically updated using Control Theory inspired mechanism, which not only\nimproves the model's stability and adaptability to diverse datasets but also\naccelerates convergence. A late fusion approach is then proposed to align the\nweighted local partitions with the consensus partition. Finally, the\noptimization problem is solved via an alternating optimization algorithm with\ntheoretically guaranteed convergence. Extensive experiments on benchmark\ndatasets highlight that DMFAW outperforms state-of-the-art methods in terms of\nclustering performance.\n","authors":["Yasser Khalafaoui","Basarab Matei","Martino Lovisetto","Nistor Grozavu"],"pdf_url":"https://arxiv.org/pdf/2412.02292v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02291v1","updated":"2024-12-03T09:07:31Z","published":"2024-12-03T09:07:31Z","title":"Conformal Symplectic Optimization for Stable Reinforcement Learning","summary":" Training deep reinforcement learning (RL) agents necessitates overcoming the\nhighly unstable nonconvex stochastic optimization inherent in the\ntrial-and-error mechanism. 
To tackle this challenge, we propose a\nphysics-inspired optimization algorithm called relativistic adaptive gradient\ndescent (RAD), which enhances long-term training stability. By conceptualizing\nneural network (NN) training as the evolution of a conformal Hamiltonian\nsystem, we present a universal framework for transferring long-term stability\nfrom conformal symplectic integrators to iterative NN updating rules, where the\nchoice of kinetic energy governs the dynamical properties of resulting\noptimization algorithms. By utilizing relativistic kinetic energy, RAD\nincorporates principles from special relativity and limits parameter updates\nbelow a finite speed, effectively mitigating abnormal gradient influences.\nAdditionally, RAD models NN optimization as the evolution of a multi-particle\nsystem where each trainable parameter acts as an independent particle with an\nindividual adaptive learning rate. We prove RAD's sublinear convergence under\ngeneral nonconvex settings, where smaller gradient variance and larger batch\nsizes contribute to tighter convergence. Notably, RAD degrades to the\nwell-known adaptive moment estimation (ADAM) algorithm when its speed\ncoefficient is chosen as one and symplectic factor as a small positive value.\nExperimental results show RAD outperforming nine baseline optimizers with five\nRL algorithms across twelve environments, including standard benchmarks and\nchallenging scenarios. 
Notably, RAD achieves up to a 155.1% performance\nimprovement over ADAM in Atari games, showcasing its efficacy in stabilizing\nand accelerating RL training.\n","authors":["Yao Lyu","Xiangteng Zhang","Shengbo Eben Li","Jingliang Duan","Letian Tao","Qing Xu","Lei He","Keqiang Li"],"pdf_url":"https://arxiv.org/pdf/2412.02291v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02285v1","updated":"2024-12-03T09:03:04Z","published":"2024-12-03T09:03:04Z","title":"GQWformer: A Quantum-based Transformer for Graph Representation Learning","summary":" Graph Transformers (GTs) have demonstrated significant advantages in graph\nrepresentation learning through their global attention mechanisms. However, the\nself-attention mechanism in GTs tends to neglect the inductive biases inherent\nin graph structures, making it chanllenging to effectively capture essential\nstructural information. To address this issue, we propose a novel approach that\nintegrate graph inductive bias into self-attention mechanisms by leveraging\nquantum technology for structural encoding. In this paper, we introduce the\nGraph Quantum Walk Transformer (GQWformer), a groundbreaking GNN framework that\nutilizes quantum walks on attributed graphs to generate node quantum states.\nThese quantum states encapsulate rich structural attributes and serve as\ninductive biases for the transformer, thereby enabling the generation of more\nmeaningful attention scores. By subsequently incorporating a recurrent neural\nnetwork, our design amplifies the model's ability to focus on both local and\nglobal information. We conducted comprehensive experiments across five publicly\navailable datasets to evaluate the effectiveness of our model. These results\nclearly indicate that GQWformer outperforms existing state-of-the-art graph\nclassification algorithms. 
These findings highlight the significant potential\nof integrating quantum computing methodologies with traditional GNNs to advance\nthe field of graph representation learning, providing a promising direction for\nfuture research and applications.\n","authors":["Lei Yu","Hongyang Chen","Jingsong Lv","Linyao Yang"],"pdf_url":"https://arxiv.org/pdf/2412.02285v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02283v1","updated":"2024-12-03T08:59:12Z","published":"2024-12-03T08:59:12Z","title":"VR Based Emotion Recognition Using Deep Multimodal Fusion With\n Biosignals Across Multiple Anatomical Domains","summary":" Emotion recognition is significantly enhanced by integrating multimodal\nbiosignals and IMU data from multiple domains. In this paper, we introduce a\nnovel multi-scale attention-based LSTM architecture, combined with\nSqueeze-and-Excitation (SE) blocks, by leveraging multi-domain signals from the\nhead (Meta Quest Pro VR headset), trunk (Equivital Vest), and peripheral\n(Empatica Embrace Plus) during affect elicitation via visual stimuli. Signals\nfrom 23 participants were recorded, alongside self-assessed valence and arousal\nratings after each stimulus. LSTM layers extract features from each modality,\nwhile multi-scale attention captures fine-grained temporal dependencies, and SE\nblocks recalibrate feature importance prior to classification. We assess which\ndomain's signals carry the most distinctive emotional information during VR\nexperiences, identifying key biosignals contributing to emotion detection. The\nproposed architecture, validated in a user study, demonstrates superior\nperformance in classifying valance and arousal level (high / low), showcasing\nthe efficacy of multi-domain and multi-modal fusion with biosignals (e.g.,\nTEMP, EDA) with IMU data (e.g., accelerometer) for emotion recognition in\nreal-world applications.\n","authors":["Pubudu L. 
Indrasiri","Bipasha Kashyap","Chandima Kolambahewage","Bahareh Nakisa","Kiran Ijaz","Pubudu N. Pathirana"],"pdf_url":"https://arxiv.org/pdf/2412.02283v1.pdf","comment":"14 pages, 6 figures"},{"id":"http://arxiv.org/abs/2412.02280v1","updated":"2024-12-03T08:55:10Z","published":"2024-12-03T08:55:10Z","title":"AH-OCDA: Amplitude-based Curriculum Learning and Hopfield Segmentation\n Model for Open Compound Domain Adaptation","summary":" Open compound domain adaptation (OCDA) is a practical domain adaptation\nproblem that consists of a source domain, target compound domain, and unseen\nopen domain. In this problem, the absence of domain labels and pixel-level\nsegmentation labels for both compound and open domains poses challenges to the\ndirect application of existing domain adaptation and generalization methods. To\naddress this issue, we propose Amplitude-based curriculum learning and a\nHopfield segmentation model for Open Compound Domain Adaptation (AH-OCDA). Our\nmethod comprises two complementary components: 1) amplitude-based curriculum\nlearning and 2) Hopfield segmentation model. Without prior knowledge of target\ndomains within the compound domains, amplitude-based curriculum learning\ngradually induces the semantic segmentation model to adapt from the near-source\ncompound domain to the far-source compound domain by ranking unlabeled compound\ndomain images through Fast Fourier Transform (FFT). Additionally, the Hopfield\nsegmentation model maps segmentation feature distributions from arbitrary\ndomains to the feature distributions of the source domain. 
AH-OCDA achieves\nstate-of-the-art performance on two OCDA benchmarks and extended open domains,\ndemonstrating its adaptability to continuously changing compound domains and\nunseen open domains.\n","authors":["Jaehyun Choi","Junwon Ko","Dong-Jae Lee","Junmo Kim"],"pdf_url":"https://arxiv.org/pdf/2412.02280v1.pdf","comment":"WACV 2025"},{"id":"http://arxiv.org/abs/2406.03848v3","updated":"2024-12-03T08:54:30Z","published":"2024-06-06T08:29:29Z","title":"OceanCastNet: A Deep Learning Ocean Wave Model with Energy Conservation","summary":" Traditional wave forecasting models, although based on energy conservation\nequations, are computationally expensive. On the other hand, existing deep\nlearning geophysical fluid models, while computationally efficient, often\nsuffer from issues such as energy dissipation in long-term forecasts. This\npaper proposes a novel energy-balanced deep learning wave forecasting model\ncalled OceanCastNet (OCN). By incorporating wind fields at the current,\nprevious, and future time steps, as well as wave fields at the current and\nprevious time steps as input variables, OCN maintains energy balance within the\nmodel. Furthermore, the model employs adaptive Fourier operators as its core\ncomponents and designs a masked loss function to better handle the impact of\nland-sea boundaries. A series of experiments on the ERA5 dataset demonstrate\nthat OCN can achieve short-term forecast accuracy comparable to traditional\nmodels while exhibiting an understanding of the wave generation process. 
In\ncomparative experiments under both normal and extreme conditions, OCN\nconsistently outperforms the widely used WaveWatch III model in the industry.\nEven after long-term forecasting, OCN maintains a stable and energy-rich state.\nBy further constructing a simple meteorological model, OCN-wind, which\nconsiders energy balance, this paper confirms the importance of energy\nconstraints for improving the long-term forecast performance of deep learning\nmeteorological models. This finding provides new ideas for future research on\ndeep learning geophysical fluid models.\n","authors":["Ziliang Zhang","Huaming Yu","Danqin Ren"],"pdf_url":"https://arxiv.org/pdf/2406.03848v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02279v1","updated":"2024-12-03T08:54:17Z","published":"2024-12-03T08:54:17Z","title":"A Comprehensive Evaluation of Large Language Models on Aspect-Based\n Sentiment Analysis","summary":" Recently, Large Language Models (LLMs) have garnered increasing attention in\nthe field of natural language processing, revolutionizing numerous downstream\ntasks with powerful reasoning and generation abilities. For example, In-Context\nLearning (ICL) introduces a fine-tuning-free paradigm, allowing out-of-the-box\nLLMs to execute downstream tasks by analogy learning without any fine-tuning.\nBesides, in a fine-tuning-dependent paradigm where substantial training data\nexists, Parameter-Efficient Fine-Tuning (PEFT), as the cost-effective methods,\nenable LLMs to achieve excellent performance comparable to full fine-tuning.\n However, these fascinating techniques employed by LLMs have not been fully\nexploited in the ABSA field. Previous works probe LLMs in ABSA by merely using\nrandomly selected input-output pairs as demonstrations in ICL, resulting in an\nincomplete and superficial evaluation. In this paper, we shed light on a\ncomprehensive evaluation of LLMs in the ABSA field, involving 13 datasets, 8\nABSA subtasks, and 6 LLMs. 
Specifically, we design a unified task formulation\nto unify ``multiple LLMs for multiple ABSA subtasks in multiple paradigms.''\nFor the fine-tuning-dependent paradigm, we efficiently fine-tune LLMs using\ninstruction-based multi-task learning. For the fine-tuning-free paradigm, we\npropose 3 demonstration selection strategies to stimulate the few-shot\nabilities of LLMs. Our extensive experiments demonstrate that LLMs achieve a\nnew state-of-the-art performance compared to fine-tuned Small Language Models\n(SLMs) in the fine-tuning-dependent paradigm. More importantly, in the\nfine-tuning-free paradigm where SLMs are ineffective, LLMs with ICL still\nshowcase impressive potential and even compete with fine-tuned SLMs on some\nABSA subtasks.\n","authors":["Changzhi Zhou","Dandan Song","Yuhang Tian","Zhijing Wu","Hao Wang","Xinyu Zhang","Jun Yang","Ziyi Yang","Shuhao Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.02279v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.10929v5","updated":"2024-12-03T08:48:21Z","published":"2024-10-14T16:35:27Z","title":"ASTM :Autonomous Smart Traffic Management System Using Artificial\n Intelligence CNN and LSTM","summary":" In the modern world, the development of Artificial Intelligence (AI) has\ncontributed to improvements in various areas, including automation, computer\nvision, fraud detection, and more. AI can be leveraged to enhance the\nefficiency of Autonomous Smart Traffic Management (ASTM) systems and reduce\ntraffic congestion rates. This paper presents an Autonomous Smart Traffic\nManagement (STM) system that uses AI to improve traffic flow rates. The system\nemploys the YOLO V5 Convolutional Neural Network to detect vehicles in traffic\nmanagement images. Additionally, it predicts the number of vehicles for the\nnext 12 hours using a Recurrent Neural Network with Long Short-Term Memory\n(RNN-LSTM). 
The Smart Traffic Management Cycle Length Analysis manages the\ntraffic cycle length based on these vehicle predictions, aided by AI. From the\nresults of the RNN-LSTM model for predicting vehicle numbers over the next 12\nhours, we observe that the model predicts traffic with a Mean Squared Error\n(MSE) of 4.521 vehicles and a Root Mean Squared Error (RMSE) of 2.232 vehicles.\nAfter simulating the STM system in the CARLA simulation environment, we found\nthat the Traffic Management Congestion Flow Rate with ASTM (21 vehicles per\nminute) is 50\\% higher than the rate without STM (around 15 vehicles per\nminute). Additionally, the Traffic Management Vehicle Pass Delay with STM (5\nseconds per vehicle) is 70\\% lower than without STM (around 12 seconds per\nvehicle). These results demonstrate that the STM system using AI can increase\ntraffic flow by 50\\% and reduce vehicle pass delays by 70\\%.\n","authors":["Christofel Rio Goenawan"],"pdf_url":"https://arxiv.org/pdf/2410.10929v5.pdf","comment":"In process to IEEE Intelligent Vehicle Symposium 2025"},{"id":"http://arxiv.org/abs/2412.02270v1","updated":"2024-12-03T08:41:11Z","published":"2024-12-03T08:41:11Z","title":"Sustainable Self-evolution Adversarial Training","summary":" With the wide application of deep neural network models in various computer\nvision tasks, there has been a proliferation of adversarial example generation\nstrategies aimed at deeply exploring model security. However, existing\nadversarial training defense models, which rely on single or limited types of\nattacks under a one-time learning process, struggle to adapt to the dynamic and\nevolving nature of attack methods. 
Therefore, to achieve defense performance\nimprovements for models in long-term applications, we propose a novel\nSustainable Self-Evolution Adversarial Training (SSEAT) framework.\nSpecifically, we introduce a continual adversarial defense pipeline to realize\nlearning from various kinds of adversarial examples across multiple stages.\nAdditionally, to address the issue of model catastrophic forgetting caused by\ncontinual learning from ongoing novel attacks, we propose an adversarial data\nreplay module to better select more diverse and key relearning data.\nFurthermore, we design a consistency regularization strategy to encourage\ncurrent defense models to learn more from previously trained ones, guiding them\nto retain more past knowledge and maintain accuracy on clean samples. Extensive\nexperiments have been conducted to verify the efficacy of the proposed SSEAT\ndefense method, which demonstrates superior defense performance and\nclassification accuracy compared to competitors.\n","authors":["Wenxuan Wang","Chenglei Wang","Huihui Qi","Menghao Ye","Xuelin Qian","Peng Wang","Yanning Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.02270v1.pdf","comment":"Accepted to ACMMM 2024"},{"id":"http://arxiv.org/abs/2412.02263v1","updated":"2024-12-03T08:35:51Z","published":"2024-12-03T08:35:51Z","title":"Connecting Large Language Models with Blockchain: Advancing the\n Evolution of Smart Contracts from Automation to Intelligence","summary":" Blockchain smart contracts have catalyzed the development of decentralized\napplications across various domains, including decentralized finance. However,\ndue to constraints in computational resources and the prevalence of data silos,\ncurrent smart contracts face significant challenges in fully leveraging the\npowerful capabilities of Large Language Models (LLMs) for tasks such as\nintelligent analysis and reasoning. 
To address this gap, this paper proposes\nand implements a universal framework for integrating LLMs with blockchain data,\n{\\sysname}, effectively overcoming the interoperability barriers between\nblockchain and LLMs. By combining semantic relatedness with truth discovery\nmethods, we introduce an innovative data aggregation approach, {\\funcname},\nwhich significantly enhances the accuracy and trustworthiness of data generated\nby LLMs. To validate the framework's effectiveness, we construct a dataset\nconsisting of three types of questions, capturing Q\\&A interactions between 10\noracle nodes and 5 LLM models. Experimental results demonstrate that, even with\n40\\% malicious nodes, the proposed solution improves data accuracy by an\naverage of 17.74\\% compared to the optimal baseline. This research not only\nprovides an innovative solution for the intelligent enhancement of smart\ncontracts but also highlights the potential for deep integration between LLMs\nand blockchain technology, paving the way for more intelligent and complex\napplications of smart contracts in the future.\n","authors":["Youquan Xian","Xueying Zeng","Duancheng Xuan","Danping Yang","Chunpei Li","Peng Fan","Peng Liu"],"pdf_url":"https://arxiv.org/pdf/2412.02263v1.pdf","comment":"10 pages, 8 figures"},{"id":"http://arxiv.org/abs/2412.02259v1","updated":"2024-12-03T08:33:50Z","published":"2024-12-03T08:33:50Z","title":"VideoGen-of-Thought: A Collaborative Framework for Multi-Shot Video\n Generation","summary":" Current video generation models excel at generating short clips but still\nstruggle with creating multi-shot, movie-like videos. Existing models trained\non large-scale data on the back of rich computational resources are\nunsurprisingly inadequate for maintaining a logical storyline and visual\nconsistency across multiple shots of a cohesive script since they are often\ntrained with a single-shot objective. 
To this end, we propose\nVideoGen-of-Thought (VGoT), a collaborative and training-free architecture\ndesigned specifically for multi-shot video generation. VGoT is designed with\nthree goals in mind as follows. Multi-Shot Video Generation: We divide the\nvideo generation process into a structured, modular sequence, including (1)\nScript Generation, which translates a curt story into detailed prompts for each\nshot; (2) Keyframe Generation, responsible for creating visually consistent\nkeyframes faithful to character portrayals; and (3) Shot-Level Video\nGeneration, which transforms information from scripts and keyframes into shots;\n(4) Smoothing Mechanism that ensures a consistent multi-shot output. Reasonable\nNarrative Design: Inspired by cinematic scriptwriting, our prompt generation\napproach spans five key domains, ensuring logical consistency, character\ndevelopment, and narrative flow across the entire video. Cross-Shot\nConsistency: We ensure temporal and identity consistency by leveraging\nidentity-preserving (IP) embeddings across shots, which are automatically\ncreated from the narrative. Additionally, we incorporate a cross-shot smoothing\nmechanism, which integrates a reset boundary that effectively combines latent\nfeatures from adjacent shots, resulting in smooth transitions and maintaining\nvisual coherence throughout the video. 
Our experiments demonstrate that VGoT\nsurpasses existing video generation methods in producing high-quality,\ncoherent, multi-shot videos.\n","authors":["Mingzhe Zheng","Yongqi Xu","Haojian Huang","Xuran Ma","Yexin Liu","Wenjie Shu","Yatian Pang","Feilong Tang","Qifeng Chen","Harry Yang","Ser-Nam Lim"],"pdf_url":"https://arxiv.org/pdf/2412.02259v1.pdf","comment":"Webpage: https://cheliosoops.github.io/VGoT"},{"id":"http://arxiv.org/abs/2410.13637v2","updated":"2024-12-03T08:29:54Z","published":"2024-10-17T15:07:56Z","title":"Normalizing self-supervised learning for provably reliable Change Point\n Detection","summary":" Change point detection (CPD) methods aim to identify abrupt shifts in the\ndistribution of input data streams. Accurate estimators for this task are\ncrucial across various real-world scenarios. Yet, traditional unsupervised CPD\ntechniques face significant limitations, often relying on strong assumptions or\nsuffering from low expressive power due to inherent model simplicity. In\ncontrast, representation learning methods overcome these drawbacks by offering\nflexibility and the ability to capture the full complexity of the data without\nimposing restrictive assumptions. However, these approaches are still emerging\nin the CPD field and lack robust theoretical foundations to ensure their\nreliability. Our work addresses this gap by integrating the expressive power of\nrepresentation learning with the groundedness of traditional CPD techniques. We\nadopt spectral normalization (SN) for deep representation learning in CPD tasks\nand prove that the embeddings after SN are highly informative for CPD. 
Our\nmethod significantly outperforms current state-of-the-art methods during the\ncomprehensive evaluation via three standard CPD datasets.\n","authors":["Alexandra Bazarova","Evgenia Romanenkova","Alexey Zaytsev"],"pdf_url":"https://arxiv.org/pdf/2410.13637v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02251v1","updated":"2024-12-03T08:28:47Z","published":"2024-12-03T08:28:47Z","title":"Selective Reviews of Bandit Problems in AI via a Statistical View","summary":" Reinforcement Learning (RL) is a widely researched area in artificial\nintelligence that focuses on teaching agents decision-making through\ninteractions with their environment. A key subset includes stochastic\nmulti-armed bandit (MAB) and continuum-armed bandit (SCAB) problems, which\nmodel sequential decision-making under uncertainty. This review outlines the\nfoundational models and assumptions of bandit problems, explores non-asymptotic\ntheoretical tools like concentration inequalities and minimax regret bounds,\nand compares frequentist and Bayesian algorithms for managing\nexploration-exploitation trade-offs. We also extend the discussion to $K$-armed\ncontextual bandits and SCAB, examining their methodologies, regret analyses,\nand discussing the relation between the SCAB problems and the functional data\nanalysis. Finally, we highlight recent advances and ongoing challenges in the\nfield.\n","authors":["Pengjie Zhou","Haoyu Wei","Huiming Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.02251v1.pdf","comment":"46 pages, 5 figures,"},{"id":"http://arxiv.org/abs/2412.02242v1","updated":"2024-12-03T08:11:06Z","published":"2024-12-03T08:11:06Z","title":"U-Net in Medical Image Segmentation: A Review of Its Applications Across\n Modalities","summary":" Medical imaging is essential in healthcare to provide key insights into\npatient anatomy and pathology, aiding in diagnosis and treatment. 
Non-invasive\ntechniques such as X-ray, Magnetic Resonance Imaging (MRI), Computed Tomography\n(CT), and Ultrasound (US), capture detailed images of organs, tissues, and\nabnormalities. Effective analysis of these images requires precise segmentation\nto delineate regions of interest (ROI), such as organs or lesions. Traditional\nsegmentation methods, relying on manual feature-extraction, are labor-intensive\nand vary across experts. Recent advancements in Artificial Intelligence (AI)\nand Deep Learning (DL), particularly convolutional models such as U-Net and its\nvariants (U-Net++ and U-Net 3+), have transformed medical image segmentation\n(MIS) by automating the process and enhancing accuracy. These models enable\nefficient, precise pixel-wise classification across various imaging modalities,\novercoming the limitations of manual segmentation. This review explores various\nmedical imaging techniques, examines the U-Net architectures and their\nadaptations, and discusses their application across different modalities. It\nalso identifies common challenges in MIS and proposes potential solutions.\n","authors":["Fnu Neha","Deepshikha Bhati","Deepak Kumar Shukla","Sonavi Makarand Dalvi","Nikolaos Mantzou","Safa Shubbar"],"pdf_url":"https://arxiv.org/pdf/2412.02242v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.06703v3","updated":"2024-12-03T08:09:01Z","published":"2024-10-09T09:13:38Z","title":"ST-WebAgentBench: A Benchmark for Evaluating Safety and Trustworthiness\n in Web Agents","summary":" Recent advancements in Web agents have introduced novel architectures and\nbenchmarks showcasing progress in autonomous web navigation and interaction.\nHowever, most existing benchmarks prioritize effectiveness and accuracy,\noverlooking factors like safety and trustworthiness which are essential for\ndeploying web agents in enterprise settings. 
We present STWebAgentBench, a\nbenchmark designed to evaluate web agents safety and trustworthiness across six\ncritical dimensions, essential for reliability in enterprise applications. This\nbenchmark is grounded in a detailed framework that defines safe and trustworthy\n(ST) agent behavior. Our work extends WebArena with safety templates and\nevaluation functions to assess safety policy compliance rigorously. We\nintroduce the Completion Under Policy to measure task success while adhering to\npolicies, alongside the Risk Ratio, which quantifies policy violations across\ndimensions, providing actionable insights to address safety gaps. Our\nevaluation reveals that current SOTA agents struggle with policy adherence and\ncannot yet be relied upon for critical business applications. We open-source\nthis benchmark and invite the community to contribute, with the goal of\nfostering a new generation of safer, more trustworthy AI agents. All code,\ndata, environment reproduction resources, and video demonstrations are\navailable at https://sites.google.com/view/st-webagentbench/home.\n","authors":["Ido Levy","Ben Wiesel","Sami Marreed","Alon Oved","Avi Yaeli","Segev Shlomov"],"pdf_url":"https://arxiv.org/pdf/2410.06703v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02237v1","updated":"2024-12-03T08:05:56Z","published":"2024-12-03T08:05:56Z","title":"Cross-Attention Head Position Patterns Can Align with Human Visual\n Concepts in Text-to-Image Generative Models","summary":" Recent text-to-image diffusion models leverage cross-attention layers, which\nhave been effectively utilized to enhance a range of visual generative tasks.\nHowever, our understanding of cross-attention layers remains somewhat limited.\nIn this study, we present a method for constructing Head Relevance Vectors\n(HRVs) that align with useful visual concepts. 
An HRV for a given visual\nconcept is a vector with a length equal to the total number of cross-attention\nheads, where each element represents the importance of the corresponding head\nfor the given visual concept. We develop and employ an ordered weakening\nanalysis to demonstrate the effectiveness of HRVs as interpretable features. To\ndemonstrate the utility of HRVs, we propose concept strengthening and concept\nadjusting methods and apply them to enhance three visual generative tasks. We\nshow that misinterpretations of polysemous words in image generation can be\ncorrected in most cases, five challenging attributes in image editing can be\nsuccessfully modified, and catastrophic neglect in multi-concept generation can\nbe mitigated. Overall, our work provides an advancement in understanding\ncross-attention layers and introduces new approaches for fine-controlling these\nlayers at the head level.\n","authors":["Jungwon Park","Jungmin Ko","Dongnam Byun","Jangwon Suh","Wonjong Rhee"],"pdf_url":"https://arxiv.org/pdf/2412.02237v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.01408v2","updated":"2024-12-03T07:52:35Z","published":"2024-12-02T11:51:19Z","title":"Towards Cross-Lingual Audio Abuse Detection in Low-Resource Settings\n with Few-Shot Learning","summary":" Online abusive content detection, particularly in low-resource settings and\nwithin the audio modality, remains underexplored. We investigate the potential\nof pre-trained audio representations for detecting abusive language in\nlow-resource languages, in this case, in Indian languages using Few Shot\nLearning (FSL). Leveraging powerful representations from models such as Wav2Vec\nand Whisper, we explore cross-lingual abuse detection using the ADIMA dataset\nwith FSL. Our approach integrates these representations within the\nModel-Agnostic Meta-Learning (MAML) framework to classify abusive language in\n10 languages. 
We experiment with various shot sizes (50-200) evaluating the\nimpact of limited data on performance. Additionally, a feature visualization\nstudy was conducted to better understand model behaviour. This study highlights\nthe generalization ability of pre-trained models in low-resource scenarios and\noffers valuable insights into detecting abusive language in multilingual\ncontexts.\n","authors":["Aditya Narayan Sankaran","Reza Farahbakhsh","Noel Crespi"],"pdf_url":"https://arxiv.org/pdf/2412.01408v2.pdf","comment":"Accepted as part of the proceedings of COLING 2025"},{"id":"http://arxiv.org/abs/2412.02228v1","updated":"2024-12-03T07:51:14Z","published":"2024-12-03T07:51:14Z","title":"BANER: Boundary-Aware LLMs for Few-Shot Named Entity Recognition","summary":" Despite the recent success of two-stage prototypical networks in few-shot\nnamed entity recognition (NER), challenges such as over/under-detected false\nspans in the span detection stage and unaligned entity prototypes in the type\nclassification stage persist. Additionally, LLMs have not proven to be\neffective few-shot information extractors in general. In this paper, we propose\nan approach called Boundary-Aware LLMs for Few-Shot Named Entity Recognition to\naddress these issues. We introduce a boundary-aware contrastive learning\nstrategy to enhance the LLM's ability to perceive entity boundaries for\ngeneralized entity spans. Additionally, we utilize LoRAHub to align information\nfrom the target domain to the source domain, thereby enhancing adaptive\ncross-domain classification capabilities. Extensive experiments across various\nbenchmarks demonstrate that our framework outperforms prior methods, validating\nits effectiveness. In particular, the proposed strategies demonstrate\neffectiveness across a range of LLM architectures. 
The code and data are\nreleased on https://github.com/UESTC-GQJ/BANER.\n","authors":["Quanjiang Guo","Yihong Dong","Ling Tian","Zhao Kang","Yu Zhang","Sijie Wang"],"pdf_url":"https://arxiv.org/pdf/2412.02228v1.pdf","comment":"Appear on COLING 2025"},{"id":"http://arxiv.org/abs/2306.06272v2","updated":"2024-12-03T07:34:36Z","published":"2023-06-09T21:54:13Z","title":"A Domain-Independent Agent Architecture for Adaptive Operation in\n Evolving Open Worlds","summary":" Model-based reasoning agents are ill-equipped to act in novel situations in\nwhich their model of the environment no longer sufficiently represents the\nworld. We propose HYDRA - a framework for designing model-based agents\noperating in mixed discrete-continuous worlds, that can autonomously detect\nwhen the environment has evolved from its canonical setup, understand how it\nhas evolved, and adapt the agents' models to perform effectively. HYDRA is\nbased upon PDDL+, a rich modeling language for planning in mixed,\ndiscrete-continuous environments. It augments the planning module with visual\nreasoning, task selection, and action execution modules for closed-loop\ninteraction with complex environments. HYDRA implements a novel meta-reasoning\nprocess that enables the agent to monitor its own behavior from a variety of\naspects. The process employs a diverse set of computational methods to maintain\nexpectations about the agent's own behavior in an environment. Divergences from\nthose expectations are useful in detecting when the environment has evolved and\nidentifying opportunities to adapt the underlying models. HYDRA builds upon\nideas from diagnosis and repair and uses a heuristics-guided search over model\nchanges such that they become competent in novel conditions. 
The HYDRA\nframework has been used to implement novelty-aware agents for three diverse\ndomains - CartPole++ (a higher dimension variant of a classic control problem),\nScience Birds (an IJCAI competition problem), and PogoStick (a specific problem\ndomain in Minecraft). We report empirical observations from these domains to\ndemonstrate the efficacy of various components in the novelty meta-reasoning\nprocess.\n","authors":["Shiwali Mohan","Wiktor Piotrowski","Roni Stern","Sachin Grover","Sookyung Kim","Jacob Le","Johan De Kleer"],"pdf_url":"https://arxiv.org/pdf/2306.06272v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02222v1","updated":"2024-12-03T07:28:52Z","published":"2024-12-03T07:28:52Z","title":"Deep learning approach for predicting the replicator equation in\n evolutionary game theory","summary":" This paper presents a physics-informed deep learning approach for predicting\nthe replicator equation, allowing accurate forecasting of population dynamics.\nThis methodological innovation allows us to derive governing differential or\ndifference equations for systems that lack explicit mathematical models. We\nused the SINDy model first introduced by Fasel, Kaiser, Kutz, Brunton, and\nBrunt 2016a to get the replicator equation, which will significantly advance\nour understanding of evolutionary biology, economic systems, and social\ndynamics. 
By refining predictive models across multiple disciplines, including\necology, social structures, and moral behaviours, our work offers new insights\ninto the complex interplay of variables shaping evolutionary outcomes in\ndynamic systems\n","authors":["Advait Chandorkar"],"pdf_url":"https://arxiv.org/pdf/2412.02222v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02220v1","updated":"2024-12-03T07:25:30Z","published":"2024-12-03T07:25:30Z","title":"Unlocking Tuning-Free Few-Shot Adaptability in Visual Foundation Models\n by Recycling Pre-Tuned LoRAs","summary":" Large Language Models (LLMs) such as ChatGPT demonstrate strong few-shot\nadaptability without requiring fine-tuning, positioning them ideal for\ndata-limited and real-time applications. However, this adaptability has not yet\nbeen replicated in current Visual Foundation Models (VFMs), which require\nexplicit fine-tuning with sufficient tuning data. Besides, the\npretraining-finetuning paradigm has led to the surge of numerous task-specific\nmodular components, such as Low-Rank Adaptation (LoRA). For the first time, we\nexplore the potential of reusing diverse pre-tuned LoRAs without accessing\ntheir original training data, to achieve tuning-free few-shot adaptation in\nVFMs. Our framework, LoRA Recycle, distills a meta-LoRA from diverse pre-tuned\nLoRAs with a meta-learning objective, using surrogate data generated inversely\nfrom pre-tuned LoRAs themselves. The VFM, once equipped with the meta-LoRA, is\nempowered to solve new few-shot tasks in a single forward pass, akin to the\nin-context learning of LLMs. 
Additionally, we incorporate a double-efficient\nmechanism tailored to our framework, significantly accelerating the\nmeta-training process while maintaining or even improving performance.\nExtensive experiments across various few-shot classification benchmarks across\nboth in- and cross-domain scenarios demonstrate the superiority of our\nframework.\n","authors":["Zixuan Hu","Yongxian Wei","Li Shen","Chun Yuan","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2412.02220v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.08830v2","updated":"2024-12-03T07:23:25Z","published":"2024-06-13T05:49:29Z","title":"Center-Sensitive Kernel Optimization for Efficient On-Device Incremental\n Learning","summary":" To facilitate the evolution of edge intelligence in ever-changing\nenvironments, we study on-device incremental learning constrained in limited\ncomputation resource in this paper. Current on-device training methods just\nfocus on efficient training without considering the catastrophic forgetting,\npreventing the model getting stronger when continually exploring the world. To\nsolve this problem, a direct solution is to involve the existing incremental\nlearning mechanisms into the on-device training framework. Unfortunately, such\na manner cannot work well as those mechanisms usually introduce large\nadditional computational cost to the network optimization process, which would\ninevitably exceed the memory capacity of the edge devices. To address this\nissue, this paper makes an early effort to propose a simple but effective\nedge-friendly incremental learning framework. Based on an empirical study on\nthe knowledge intensity of the kernel elements of the neural network, we find\nthat the center kernel is the key for maximizing the knowledge intensity for\nlearning new data, while freezing the other kernel elements would get a good\nbalance on the model's capacity for overcoming catastrophic forgetting. 
Upon\nthis finding, we further design a center-sensitive kernel optimization\nframework to largely alleviate the cost of the gradient computation and\nback-propagation. Besides, a dynamic channel element selection strategy is also\nproposed to facilitate a sparse orthogonal gradient projection for further\nreducing the optimization complexity, upon the knowledge explored from the new\ntask data. Extensive experiments validate our method is efficient and\neffective, e.g., our method achieves average accuracy boost of 38.08% with even\nless memory and approximate computation compared to existing on-device training\nmethods, indicating its significant potential for on-device incremental\nlearning.\n","authors":["Dingwen Zhang","Yan Li","De Cheng","Nannan Wang","Junwei Han"],"pdf_url":"https://arxiv.org/pdf/2406.08830v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.00156v2","updated":"2024-12-03T07:18:25Z","published":"2024-11-29T08:10:49Z","title":"VISION-XL: High Definition Video Inverse Problem Solver using Latent\n Image Diffusion Models","summary":" In this paper, we propose a novel framework for solving high-definition video\ninverse problems using latent image diffusion models. Building on recent\nadvancements in spatio-temporal optimization for video inverse problems using\nimage diffusion models, our approach leverages latent-space diffusion models to\nachieve enhanced video quality and resolution. To address the high\ncomputational demands of processing high-resolution frames, we introduce a\npseudo-batch consistent sampling strategy, allowing efficient operation on a\nsingle GPU. Additionally, to improve temporal consistency, we present\nbatch-consistent inversion, an initialization technique that incorporates\ninformative latents from the measurement frame. 
By integrating with SDXL, our\nframework achieves state-of-the-art video reconstruction across a wide range of\nspatio-temporal inverse problems, including complex combinations of frame\naveraging and various spatial degradations, such as deblurring,\nsuper-resolution, and inpainting. Unlike previous methods, our approach\nsupports multiple aspect ratios (landscape, vertical, and square) and delivers\nHD-resolution reconstructions (exceeding 1280x720) in under 2.5 minutes on a\nsingle NVIDIA 4090 GPU.\n","authors":["Taesung Kwon","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2412.00156v2.pdf","comment":"Project page: https://vision-xl.github.io/"},{"id":"http://arxiv.org/abs/2412.02215v1","updated":"2024-12-03T07:11:21Z","published":"2024-12-03T07:11:21Z","title":"Recovering implicit physics model under real-world constraints","summary":" Recovering a physics-driven model, i.e. a governing set of equations of the\nunderlying dynamical systems, from the real-world data has been of recent\ninterest. Most existing methods either operate on simulation data with\nunrealistically high sampling rates or require explicit measurements of all\nsystem variables, which is not amenable in real-world deployments. Moreover,\nthey assume the timestamps of external perturbations to the physical system are\nknown a priori, without uncertainty, implicitly discounting any sensor\ntime-synchronization or human reporting errors. In this paper, we propose a\nnovel liquid time constant neural network (LTC-NN) based architecture to\nrecover underlying model of physical dynamics from real-world data. 
The\nautomatic differentiation property of LTC-NN nodes overcomes problems\nassociated with low sampling rates, the input dependent time constant in the\nforward pass of the hidden layer of LTC-NN nodes creates a massive search space\nof implicit physical dynamics, the physics model solver based data\nreconstruction loss guides the search for the correct set of implicit dynamics,\nand the use of the dropout regularization in the dense layer ensures extraction\nof the sparsest model. Further, to account for the perturbation timing error,\nwe utilize dense layer nodes to search through input shifts that results in the\nlowest reconstruction loss. Experiments on four benchmark dynamical systems,\nthree with simulation data and one with the real-world data show that the\nLTC-NN architecture is more accurate in recovering implicit physics model\ncoefficients than the state-of-the-art sparse model recovery approaches. We\nalso introduce four additional case studies (total eight) on real-life medical\nexamples in simulation and with real-world clinical data to show effectiveness\nof our approach in recovering underlying model in practice.\n","authors":["Ayan Banerjee","Sandeep K. S. Gupta"],"pdf_url":"https://arxiv.org/pdf/2412.02215v1.pdf","comment":"This paper is published in ECAI 2024,\n https://ebooks.iospress.nl/volumearticle/69651"},{"id":"http://arxiv.org/abs/2412.01572v2","updated":"2024-12-03T06:58:06Z","published":"2024-12-02T14:55:02Z","title":"MBA-RAG: a Bandit Approach for Adaptive Retrieval-Augmented Generation\n through Question Complexity","summary":" Retrieval Augmented Generation (RAG) has proven to be highly effective in\nboosting the generative performance of language model in knowledge-intensive\ntasks. However, existing RAG framework either indiscriminately perform\nretrieval or rely on rigid single-class classifiers to select retrieval\nmethods, leading to inefficiencies and suboptimal performance across queries of\nvarying complexity. 
To address these challenges, we propose a reinforcement\nlearning-based framework that dynamically selects the most suitable retrieval\nstrategy based on query complexity. % our solution Our approach leverages a\nmulti-armed bandit algorithm, which treats each retrieval method as a distinct\n``arm'' and adapts the selection process by balancing exploration and\nexploitation. Additionally, we introduce a dynamic reward function that\nbalances accuracy and efficiency, penalizing methods that require more\nretrieval steps, even if they lead to a correct result. Our method achieves new\nstate of the art results on multiple single-hop and multi-hop datasets while\nreducing retrieval costs. Our code are available at\nhttps://github.com/FUTUREEEEEE/MBA .\n","authors":["Xiaqiang Tang","Qiang Gao","Jian Li","Nan Du","Qi Li","Sihong Xie"],"pdf_url":"https://arxiv.org/pdf/2412.01572v2.pdf","comment":"COLING 2025"},{"id":"http://arxiv.org/abs/2410.13973v2","updated":"2024-12-03T06:55:36Z","published":"2024-10-17T18:57:15Z","title":"MarineFormer: A Spatio-Temporal Attention Model for USV Navigation in\n Dynamic Marine Environments","summary":" Navigating autonomously in marine environments including dynamic and static\nobstacles, and strong flow disturbances, such as in high-flow rivers, poses\nsignificant challenges for USVs. To address these challenges, we propose a\nnovel methodology that leverages two types of attention: spatial attention,\nwhich learns to integrate diverse environmental factors and sensory information\ninto navigation decisions, and temporal attention within a transformer\nframework to account for the dynamic, continuously changing nature of the\nenvironment. We devise MarineFormer, a Trans{\\bf former}-based navigation\npolicy for dynamic {\\bf Marine} environments, trained end-to-end through\nreinforcement learning (RL). 
At its core, MarineFormer uses graph attention to\ncapture spatial information and a transformer architecture to process temporal\nsequences in an environment that simulates a 2D turbulent marine condition\ninvolving multiple static and dynamic obstacles. We extensively evaluate the\nperformance of the proposed method versus the state-of-the-art methods, as well\nas other classical planners. Our approach outperforms the state-of-the-art by\nnearly $20\\%$ in episode completion success rate and additionally enhances the\nUSV's path length efficiency.\n","authors":["Ehsan Kazemi","Iman Soltani"],"pdf_url":"https://arxiv.org/pdf/2410.13973v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.17355v3","updated":"2024-12-03T06:53:58Z","published":"2024-08-30T15:39:34Z","title":"Bidirectional Decoding: Improving Action Chunking via Closed-Loop\n Resampling","summary":" Predicting and executing a sequence of actions without intermediate\nreplanning, known as action chunking, is increasingly used in robot learning\nfrom human demonstrations. Yet, its reported effects on the learned policy are\ninconsistent: some studies find it crucial for achieving strong results, while\nothers observe decreased performance. In this paper, we first dissect how\naction chunking impacts the divergence between a learner and a demonstrator. We\nfind that action chunking allows the learner to better capture the temporal\ndependencies in demonstrations but at the cost of reduced reactivity in\nstochastic environments. To address this tradeoff, we propose Bidirectional\nDecoding (BID), a test-time inference algorithm that bridges action chunking\nwith closed-loop operations. BID samples multiple predictions at each time step\nand searches for the optimal one based on two criteria: (i) backward coherence,\nwhich favors samples that align with previous decisions; (ii) forward contrast,\nwhich seeks samples of high likelihood for future plans. 
By coupling decisions\nwithin and across action chunks, BID promotes consistency over time while\nmaintaining reactivity to unexpected changes. Experimental results show that\nBID boosts the performance of two state-of-the-art generative policies across\nseven simulation benchmarks and two real-world tasks. Code and videos are\navailable at https://bid-robot.github.io.\n","authors":["Yuejiang Liu","Jubayer Ibn Hamid","Annie Xie","Yoonho Lee","Maximilian Du","Chelsea Finn"],"pdf_url":"https://arxiv.org/pdf/2408.17355v3.pdf","comment":"Project website: https://bid-robot.github.io/"},{"id":"http://arxiv.org/abs/2402.10946v3","updated":"2024-12-03T06:52:34Z","published":"2024-02-09T04:02:43Z","title":"CultureLLM: Incorporating Cultural Differences into Large Language\n Models","summary":" Large language models (LLMs) are reported to be partial to certain cultures\nowing to the training data dominance from the English corpora. Since\nmultilingual cultural data are often expensive to collect, existing efforts\nhandle this by prompt engineering or culture-specific pre-training. However,\nthey might overlook the knowledge deficiency of low-resource culture and\nrequire extensive computing resources. In this paper, we propose CultureLLM, a\ncost-effective solution to incorporate cultural differences into LLMs.\nCultureLLM adopts World Value Survey (WVS) as seed data and generates\nsemantically equivalent training data via the proposed semantic data\naugmentation. Using only 50 seed samples from WVS with augmented data, we\nfine-tune culture-specific LLMs and one unified model (CultureLLM-One) for 9\ncultures covering rich and low-resource languages. Extensive experiments on 60\nculture-related datasets demonstrate that CultureLLM significantly outperforms\nvarious counterparts such as GPT-3.5 (by 8.1%) and Gemini Pro (by 9.5%) with\ncomparable performance to GPT-4 or even better. 
Our human study shows that the\ngenerated samples are semantically equivalent to the original samples,\nproviding an effective solution for LLMs augmentation. Code is released at\nhttps://github.com/Scarelette/CultureLLM.\n","authors":["Cheng Li","Mengzhou Chen","Jindong Wang","Sunayana Sitaram","Xing Xie"],"pdf_url":"https://arxiv.org/pdf/2402.10946v3.pdf","comment":"NeurIPS 2024; Code is at https://github.com/Scarelette/CultureLLM"},{"id":"http://arxiv.org/abs/2409.18169v5","updated":"2024-12-03T06:52:11Z","published":"2024-09-26T17:55:22Z","title":"Harmful Fine-tuning Attacks and Defenses for Large Language Models: A\n Survey","summary":" Recent research demonstrates that the nascent fine-tuning-as-a-service\nbusiness model exposes serious safety concerns -- fine-tuning over a few\nharmful data uploaded by the users can compromise the safety alignment of the\nmodel. The attack, known as harmful fine-tuning attack, has raised a broad\nresearch interest among the community. However, as the attack is still new,\n\\textbf{we observe that there are general misunderstandings within the research\ncommunity.} To clear up concern, this paper provide a comprehensive overview to\nthree aspects of harmful fine-tuning: attacks setting, defense design and\nevaluation methodology. Specifically, we first present the threat model of the\nproblem, and introduce the harmful fine-tuning attack and its variants. Then we\nsystematically survey the existing literature on attacks/defenses/mechanical\nanalysis of the problem. Finally, we introduce the evaluation methodology and\noutline future research directions that might contribute to the development of\nthe field. Additionally, we present a list of questions of interest, which\nmight be useful to refer to when reviewers in the peer review process question\nthe realism of the experiment/attack/defense setting. 
A curated list of\nrelevant papers is maintained and made accessible at:\nhttps://github.com/git-disl/awesome_LLM-harmful-fine-tuning-papers.\n","authors":["Tiansheng Huang","Sihao Hu","Fatih Ilhan","Selim Furkan Tekin","Ling Liu"],"pdf_url":"https://arxiv.org/pdf/2409.18169v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02205v1","updated":"2024-12-03T06:47:15Z","published":"2024-12-03T06:47:15Z","title":"DataLab: A Unifed Platform for LLM-Powered Business Intelligence","summary":" Business intelligence (BI) transforms large volumes of data within modern\norganizations into actionable insights for informed decision-making. Recently,\nlarge language model (LLM)-based agents have streamlined the BI workflow by\nautomatically performing task planning, reasoning, and actions in executable\nenvironments based on natural language (NL) queries. However, existing\napproaches primarily focus on individual BI tasks such as NL2SQL and NL2VIS.\nThe fragmentation of tasks across different data roles and tools lead to\ninefficiencies and potential errors due to the iterative and collaborative\nnature of BI. In this paper, we introduce DataLab, a unified BI platform that\nintegrates a one-stop LLM-based agent framework with an augmented computational\nnotebook interface. DataLab supports a wide range of BI tasks for different\ndata roles by seamlessly combining LLM assistance with user customization\nwithin a single environment. To achieve this unification, we design a domain\nknowledge incorporation module tailored for enterprise-specific BI tasks, an\ninter-agent communication mechanism to facilitate information sharing across\nthe BI workflow, and a cell-based context management strategy to enhance\ncontext utilization efficiency in BI notebooks. Extensive experiments\ndemonstrate that DataLab achieves state-of-the-art performance on various BI\ntasks across popular research benchmarks. 
Moreover, DataLab maintains high\neffectiveness and efficiency on real-world datasets from Tencent, achieving up\nto a 58.58% increase in accuracy and a 61.65% reduction in token cost on\nenterprise-specific BI tasks.\n","authors":["Luoxuan Weng","Yinghao Tang","Yingchaojie Feng","Zhuo Chang","Peng Chen","Ruiqin Chen","Haozhe Feng","Chen Hou","Danqing Huang","Yang Li","Huaming Rao","Haonan Wang","Canshi Wei","Xiaofeng Yang","Yuhui Zhang","Yifeng Zheng","Xiuqi Huang","Minfeng Zhu","Yuxin Ma","Bin Cui","Wei Chen"],"pdf_url":"https://arxiv.org/pdf/2412.02205v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.15143v3","updated":"2024-12-03T06:43:39Z","published":"2024-05-24T01:45:27Z","title":"Intelligent Go-Explore: Standing on the Shoulders of Giant Foundation\n Models","summary":" Go-Explore is a powerful family of algorithms designed to solve\nhard-exploration problems built on the principle of archiving discovered\nstates, and iteratively returning to and exploring from the most promising\nstates. This approach has led to superhuman performance across a wide variety\nof challenging problems including Atari games and robotic control, but requires\nmanually designing heuristics to guide exploration (i.e., determine which\nstates to save and explore from, and what actions to consider next), which is\ntime-consuming and infeasible in general. To resolve this, we propose\nIntelligent Go-Explore (IGE) which greatly extends the scope of the original\nGo-Explore by replacing these handcrafted heuristics with the intelligence and\ninternalized human notions of interestingness captured by giant pretrained\nfoundation models (FMs). This provides IGE with a human-like ability to\ninstinctively identify how interesting or promising any new state is (e.g.,\ndiscovering new objects, locations, or behaviors), even in complex environments\nwhere heuristics are hard to define. 
Moreover, IGE offers the exciting\nopportunity to recognize and capitalize on serendipitous discoveries-states\nencountered during exploration that are valuable in terms of exploration, yet\nwhere what makes them interesting was not anticipated by the human user. We\nevaluate our algorithm on a diverse range of language and vision-based tasks\nthat require search and exploration. Across these tasks, IGE strongly exceeds\nclassic reinforcement learning and graph search baselines, and also succeeds\nwhere prior state-of-the-art FM agents like Reflexion completely fail. Overall,\nIntelligent Go-Explore combines the tremendous strengths of FMs and the\npowerful Go-Explore algorithm, opening up a new frontier of research into\ncreating more generally capable agents with impressive exploration\ncapabilities.\n","authors":["Cong Lu","Shengran Hu","Jeff Clune"],"pdf_url":"https://arxiv.org/pdf/2405.15143v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.01003v4","updated":"2024-12-03T06:43:25Z","published":"2024-07-01T06:35:53Z","title":"Embedded Prompt Tuning: Towards Enhanced Calibration of Pretrained\n Models for Medical Images","summary":" Foundation models pre-trained on large-scale data have been widely witnessed\nto achieve success in various natural imaging downstream tasks.\nParameter-efficient fine-tuning (PEFT) methods aim to adapt foundation models\nto new domains by updating only a small portion of parameters in order to\nreduce computational overhead. However, the effectiveness of these PEFT\nmethods, especially in cross-domain few-shot scenarios, e.g., medical image\nanalysis, has not been fully explored. In this work, we facilitate the study of\nthe performance of PEFT when adapting foundation models to medical image\nclassification tasks. 
Furthermore, to alleviate the limitations of prompt\nintroducing ways and approximation capabilities on Transformer architectures of\nmainstream prompt tuning methods, we propose the Embedded Prompt Tuning (EPT)\nmethod by embedding prompt tokens into the expanded channels. We also find that\nthere are anomalies in the feature space distribution of foundation models\nduring pre-training process, and prompt tuning can help mitigate this negative\nimpact. To explain this phenomenon, we also introduce a novel perspective to\nunderstand prompt tuning: Prompt tuning is a distribution calibrator. And we\nsupport it by analyzing patch-wise scaling and feature separation operations\ncontained in EPT. Our experiments show that EPT outperforms several\nstate-of-the-art fine-tuning methods by a significant margin on few-shot\nmedical image classification tasks, and completes the fine-tuning process\nwithin highly competitive time, indicating EPT is an effective PEFT method. The\nsource code is available at github.com/zuwenqiang/EPT.\n","authors":["Wenqiang Zu","Shenghao Xie","Qing Zhao","Guoqi Li","Lei Ma"],"pdf_url":"https://arxiv.org/pdf/2407.01003v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02193v1","updated":"2024-12-03T06:15:04Z","published":"2024-12-03T06:15:04Z","title":"LayoutVLM: Differentiable Optimization of 3D Layout via Vision-Language\n Models","summary":" Open-universe 3D layout generation arranges unlabeled 3D assets conditioned\non language instruction. Large language models (LLMs) struggle with generating\nphysically plausible 3D scenes and adherence to input instructions,\nparticularly in cluttered scenes. We introduce LayoutVLM, a framework and scene\nlayout representation that exploits the semantic knowledge of Vision-Language\nModels (VLMs) and supports differentiable optimization to ensure physical\nplausibility. 
LayoutVLM employs VLMs to generate two mutually reinforcing\nrepresentations from visually marked images, and a self-consistent decoding\nprocess to improve VLMs spatial planning. Our experiments show that LayoutVLM\naddresses the limitations of existing LLM and constraint-based approaches,\nproducing physically plausible 3D layouts better aligned with the semantic\nintent of input language instructions. We also demonstrate that fine-tuning\nVLMs with the proposed scene layout representation extracted from existing\nscene datasets can improve performance.\n","authors":["Fan-Yun Sun","Weiyu Liu","Siyi Gu","Dylan Lim","Goutam Bhat","Federico Tombari","Manling Li","Nick Haber","Jiajun Wu"],"pdf_url":"https://arxiv.org/pdf/2412.02193v1.pdf","comment":"project website: https://ai.stanford.edu/~sunfanyun/layoutvlm/"},{"id":"http://arxiv.org/abs/2412.02189v1","updated":"2024-12-03T06:02:47Z","published":"2024-12-03T06:02:47Z","title":"Comparative Performance of Machine Learning Algorithms for Early Genetic\n Disorder and Subclass Classification","summary":" A great deal of effort has been devoted to discovering a particular genetic\ndisorder, but its classification across a broad spectrum of disorder classes\nand types remains elusive. Early diagnosis of genetic disorders enables timely\ninterventions and improves outcomes. This study implements machine learning\nmodels using basic clinical indicators measurable at birth or infancy to enable\ndiagnosis in preliminary life stages. Supervised learning algorithms were\nimplemented on a dataset of 22083 instances with 42 features like family\nhistory, newborn metrics, and basic lab tests. Extensive hyperparameter tuning,\nfeature engineering, and selection were undertaken. 
Two multi-class classifiers\nwere developed: one for predicting disorder classes (mitochondrial,\nmultifactorial, and single-gene) and one for subtypes (9 disorders).\nPerformance was evaluated using accuracy, precision, recall, and the F1-score.\nThe CatBoost classifier achieved the highest accuracy of 77% for predicting\ngenetic disorder classes. For subtypes, SVM attained a maximum accuracy of 80%.\nThe study demonstrates the feasibility of using basic clinical data in machine\nlearning models for early categorization and diagnosis across various genetic\ndisorders. Applying ML with basic clinical indicators can enable timely\ninterventions once validated on larger datasets. It is necessary to conduct\nfurther studies to improve model performance on this dataset.\n","authors":["Abu Bakar Siddik","Faisal R. Badal","Afroza Islam"],"pdf_url":"https://arxiv.org/pdf/2412.02189v1.pdf","comment":"16 pages, 11 figures, 9 tables"},{"id":"http://arxiv.org/abs/2410.15876v3","updated":"2024-12-03T05:59:09Z","published":"2024-10-21T10:57:45Z","title":"FlickerFusion: Intra-trajectory Domain Generalizing Multi-Agent RL","summary":" Multi-agent reinforcement learning has demonstrated significant potential in\naddressing complex cooperative tasks across various real-world applications.\nHowever, existing MARL approaches often rely on the restrictive assumption that\nthe number of entities (e.g., agents, obstacles) remains constant between\ntraining and inference. This overlooks scenarios where entities are dynamically\nremoved or added during the inference trajectory -- a common occurrence in\nreal-world environments like search and rescue missions and dynamic combat\nsituations. In this paper, we tackle the challenge of intra-trajectory dynamic\nentity composition under zero-shot out-of-domain (OOD) generalization, where\nsuch dynamic changes cannot be anticipated beforehand. 
Our empirical studies\nreveal that existing MARL methods suffer significant performance degradation\nand increased uncertainty in these scenarios. In response, we propose\nFlickerFusion, a novel OOD generalization method that acts as a universally\napplicable augmentation technique for MARL backbone methods. FlickerFusion\nstochastically drops out parts of the observation space, emulating being\nin-domain when inferenced OOD. The results show that FlickerFusion not only\nachieves superior inference rewards but also uniquely reduces uncertainty\nvis-\\`a-vis the backbone, compared to existing methods. Benchmarks,\nimplementations, and model weights are organized and open-sourced at\nflickerfusion305.github.io, accompanied by ample demo video renderings.\n","authors":["Woosung Koh","Wonbeen Oh","Siyeol Kim","Suhin Shin","Hyeongjin Kim","Jaein Jang","Junghyun Lee","Se-Young Yun"],"pdf_url":"https://arxiv.org/pdf/2410.15876v3.pdf","comment":"NeurIPS '24 Open-World Agents Workshop"},{"id":"http://arxiv.org/abs/2412.02186v1","updated":"2024-12-03T05:54:43Z","published":"2024-12-03T05:54:43Z","title":"VideoICL: Confidence-based Iterative In-context Learning for\n Out-of-Distribution Video Understanding","summary":" Recent advancements in video large multimodal models (LMMs) have\nsignificantly improved their video understanding and reasoning capabilities.\nHowever, their performance drops on out-of-distribution (OOD) tasks that are\nunderrepresented in training data. Traditional methods like fine-tuning on OOD\ndatasets are impractical due to high computational costs. While In-context\nlearning (ICL) with demonstration examples has shown promising generalization\nperformance in language tasks and image-language tasks without fine-tuning,\napplying ICL to video-language tasks faces challenges due to the limited\ncontext length in Video LMMs, as videos require longer token lengths. 
To\naddress these issues, we propose VideoICL, a novel video in-context learning\nframework for OOD tasks that introduces a similarity-based relevant example\nselection strategy and a confidence-based iterative inference approach. This\nallows to select the most relevant examples and rank them based on similarity,\nto be used for inference. If the generated response has low confidence, our\nframework selects new examples and performs inference again, iteratively\nrefining the results until a high-confidence response is obtained. This\napproach improves OOD video understanding performance by extending effective\ncontext length without incurring high costs. The experimental results on\nmultiple benchmarks demonstrate significant performance gains, especially in\ndomain-specific scenarios, laying the groundwork for broader video\ncomprehension applications. Code will be released at\nhttps://github.com/KangsanKim07/VideoICL\n","authors":["Kangsan Kim","Geon Park","Youngwan Lee","Woongyeong Yeo","Sung Ju Hwang"],"pdf_url":"https://arxiv.org/pdf/2412.02186v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.01650v2","updated":"2024-12-03T05:46:35Z","published":"2024-12-02T15:59:35Z","title":"Privacy-Preserving Federated Learning via Homomorphic Adversarial\n Networks","summary":" Privacy-preserving federated learning (PPFL) aims to train a global model for\nmultiple clients while maintaining their data privacy. However, current PPFL\nprotocols exhibit one or more of the following insufficiencies: considerable\ndegradation in accuracy, the requirement for sharing keys, and cooperation\nduring the key generation or decryption processes. As a mitigation, we develop\nthe first protocol that utilizes neural networks to implement PPFL, as well as\nincorporating an Aggregatable Hybrid Encryption scheme tailored to the needs of\nPPFL. 
We name these networks as Homomorphic Adversarial Networks (HANs) which\ndemonstrate that neural networks are capable of performing tasks similar to\nmulti-key homomorphic encryption (MK-HE) while solving the problems of key\ndistribution and collaborative decryption. Our experiments show that HANs are\nrobust against privacy attacks. Compared with non-private federated learning,\nexperiments conducted on multiple datasets demonstrate that HANs exhibit a\nnegligible accuracy loss (at most 1.35%). Compared to traditional MK-HE\nschemes, HANs increase encryption aggregation speed by 6,075 times while\nincurring a 29.2 times increase in communication overhead.\n","authors":["Wenhan Dong","Chao Lin","Xinlei He","Xinyi Huang","Shengmin Xu"],"pdf_url":"https://arxiv.org/pdf/2412.01650v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02181v1","updated":"2024-12-03T05:35:44Z","published":"2024-12-03T05:35:44Z","title":"Generalizing Weisfeiler-Lehman Kernels to Subgraphs","summary":" Subgraph representation learning has been effective in solving various\nreal-world problems. However, current graph neural networks (GNNs) produce\nsuboptimal results for subgraph-level tasks due to their inability to capture\ncomplex interactions within and between subgraphs. To provide a more expressive\nand efficient alternative, we propose WLKS, a Weisfeiler-Lehman (WL) kernel\ngeneralized for subgraphs by applying the WL algorithm on induced $k$-hop\nneighborhoods. We combine kernels across different $k$-hop levels to capture\nricher structural information that is not fully encoded in existing models. Our\napproach can balance expressiveness and efficiency by eliminating the need for\nneighborhood sampling. 
In experiments on eight real-world and synthetic\nbenchmarks, WLKS significantly outperforms leading approaches on five datasets\nwhile reducing training time, ranging from 0.01x to 0.25x compared to the\nstate-of-the-art.\n","authors":["Dongkwan Kim","Alice Oh"],"pdf_url":"https://arxiv.org/pdf/2412.02181v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2412.02177v1","updated":"2024-12-03T05:21:42Z","published":"2024-12-03T05:21:42Z","title":"Anatomically-Grounded Fact Checking of Automated Chest X-ray Reports","summary":" With the emergence of large-scale vision-language models, realistic radiology\nreports may be generated using only medical images as input guided by simple\nprompts. However, their practical utility has been limited due to the factual\nerrors in their description of findings. In this paper, we propose a novel\nmodel for explainable fact-checking that identifies errors in findings and\ntheir locations indicated through the reports. Specifically, we analyze the\ntypes of errors made by automated reporting methods and derive a new synthetic\ndataset of images paired with real and fake descriptions of findings and their\nlocations from a ground truth dataset. A new multi-label cross-modal\ncontrastive regression network is then trained on this datsaset. We evaluate\nthe resulting fact-checking model and its utility in correcting reports\ngenerated by several SOTA automated reporting tools on a variety of benchmark\ndatasets with results pointing to over 40\\% improvement in report quality\nthrough such error detection and correction.\n","authors":["R. Mahmood","K. C. L. Wong","D. M. Reyes","N. D'Souza","L. Shi","J. Wu","P. Kaviani","M. Kalra","G. Wang","P. Yan","T. 
Syeda-Mahmood"],"pdf_url":"https://arxiv.org/pdf/2412.02177v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02176v1","updated":"2024-12-03T05:20:29Z","published":"2024-12-03T05:20:29Z","title":"Self-Supervised Learning-Based Path Planning and Obstacle Avoidance\n Using PPO and B-Splines in Unknown Environments","summary":" This paper introduces SmartBSP, an advanced self-supervised learning\nframework for real-time path planning and obstacle avoidance in autonomous\nrobotics navigating through complex environments. The proposed system\nintegrates Proximal Policy Optimization (PPO) with Convolutional Neural\nNetworks (CNN) and Actor-Critic architecture to process limited LIDAR inputs\nand compute spatial decision-making probabilities. The robot's perceptual field\nis discretized into a grid format, which the CNN analyzes to produce a spatial\nprobability distribution. During the training process a nuanced cost function\nis minimized that accounts for path curvature, endpoint proximity, and obstacle\navoidance. Simulations results in different scenarios validate the algorithm's\nresilience and adaptability across diverse operational scenarios. Subsequently,\nReal-time experiments, employing the Robot Operating System (ROS), were carried\nout to assess the efficacy of the proposed algorithm.\n","authors":["Shahab Shokouhi","Oguzhan Oruc","May-Win Thein"],"pdf_url":"https://arxiv.org/pdf/2412.02176v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02173v1","updated":"2024-12-03T05:05:13Z","published":"2024-12-03T05:05:13Z","title":"Keeping Experts in the Loop: Expert-Guided Optimization for Clinical\n Data Classification using Large Language Models","summary":" Since the emergence of Large Language Models (LLMs), the challenge of\neffectively leveraging their potential in healthcare has taken center stage. A\ncritical barrier to using LLMs for extracting insights from unstructured\nclinical notes lies in the prompt engineering process. 
Despite its pivotal role\nin determining task performance, a clear framework for prompt optimization\nremains absent. Current methods to address this gap take either a manual prompt\nrefinement approach, where domain experts collaborate with prompt engineers to\ncreate an optimal prompt, which is time-intensive and difficult to scale, or\nthrough employing automatic prompt optimizing approaches, where the value of\nthe input of domain experts is not fully realized. To address this, we propose\nStructEase, a novel framework that bridges the gap between automation and the\ninput of human expertise in prompt engineering. A core innovation of the\nframework is SamplEase, an iterative sampling algorithm that identifies\nhigh-value cases where expert feedback drives significant performance\nimprovements. This approach minimizes expert intervention, to effectively\nenhance classification outcomes. This targeted approach reduces labeling\nredundancy, mitigates human error, and enhances classification outcomes. We\nevaluated the performance of StructEase using a dataset of de-identified\nclinical narratives from the US National Electronic Injury Surveillance System\n(NEISS), demonstrating significant gains in classification performance compared\nto current methods. Our findings underscore the value of expert integration in\nLLM workflows, achieving notable improvements in F1 score while maintaining\nminimal expert effort. 
By combining transparency, flexibility, and scalability,\nStructEase sets the foundation for a framework to integrate expert input into\nLLM workflows in healthcare and beyond.\n","authors":["Nader Karayanni","Aya Awwad","Chein-Lien Hsiao","Surish P Shanmugam"],"pdf_url":"https://arxiv.org/pdf/2412.02173v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02172v1","updated":"2024-12-03T05:04:49Z","published":"2024-12-03T05:04:49Z","title":"VISCO: Benchmarking Fine-Grained Critique and Correction Towards\n Self-Improvement in Visual Reasoning","summary":" The ability of large vision-language models (LVLMs) to critique and correct\ntheir reasoning is an essential building block towards their self-improvement.\nHowever, a systematic analysis of such capabilities in LVLMs is still lacking.\nWe propose VISCO, the first benchmark to extensively analyze the fine-grained\ncritique and correction capabilities of LVLMs. Compared to existing work that\nuses a single scalar value to critique the entire reasoning [4], VISCO features\ndense and fine-grained critique, requiring LVLMs to evaluate the correctness of\neach step in the chain-of-thought and provide natural language explanations to\nsupport their judgments. Extensive evaluation of 24 LVLMs demonstrates that\nhuman-written critiques significantly enhance the performance after correction,\nshowcasing the potential of the self-improvement strategy. However, the\nmodel-generated critiques are less helpful and sometimes detrimental to the\nperformance, suggesting that critique is the crucial bottleneck. We identified\nthree common patterns in critique failures: failure to critique visual\nperception, reluctance to \"say no\", and exaggerated assumption of error\npropagation. To address these issues, we propose an effective LookBack strategy\nthat revisits the image to verify each piece of information in the initial\nreasoning. 
LookBack significantly improves critique and correction performance\nby up to 13.5%.\n","authors":["Xueqing Wu","Yuheng Ding","Bingxuan Li","Pan Lu","Da Yin","Kai-Wei Chang","Nanyun Peng"],"pdf_url":"https://arxiv.org/pdf/2412.02172v1.pdf","comment":"Project: https://visco-benchmark.github.io/"},{"id":"http://arxiv.org/abs/2406.13706v2","updated":"2024-12-03T04:57:32Z","published":"2024-06-19T16:58:32Z","title":"Developing Story: Case Studies of Generative AI's Use in Journalism","summary":" Journalists are among the many users of large language models (LLMs). To\nbetter understand the journalist-AI interactions, we conduct a study of LLM\nusage by two news agencies through browsing the WildChat dataset, identifying\ncandidate interactions, and verifying them by matching to online published\narticles. Our analysis uncovers instances where journalists provide sensitive\nmaterial such as confidential correspondence with sources or articles from\nother agencies to the LLM as stimuli and prompt it to generate articles, and\npublish these machine-generated articles with limited intervention (median\noutput-publication ROUGE-L of 0.62). Based on our findings, we call for further\nresearch into what constitutes responsible use of AI, and the establishment of\nclear guidelines and best practices on using LLMs in a journalistic context.\n","authors":["Natalie Grace Brigham","Chongjiu Gao","Tadayoshi Kohno","Franziska Roesner","Niloofar Mireshghallah"],"pdf_url":"https://arxiv.org/pdf/2406.13706v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02166v1","updated":"2024-12-03T04:51:57Z","published":"2024-12-03T04:51:57Z","title":"Analyzing the Impact of AI Tools on Student Study Habits and Academic\n Performance","summary":" This study explores the effectiveness of AI tools in enhancing student\nlearning, specifically in improving study habits, time management, and feedback\nmechanisms. 
The research focuses on how AI tools can support personalized\nlearning, adaptive test adjustments, and provide real-time classroom analysis.\nStudent feedback revealed strong support for these features, and the study\nfound a significant reduction in study hours alongside an increase in GPA,\nsuggesting positive academic outcomes. Despite these benefits, challenges such\nas over-reliance on AI and difficulties in integrating AI with traditional\nteaching methods were also identified, emphasizing the need for AI tools to\ncomplement conventional educational strategies rather than replace them. Data\nwere collected through a survey with a Likert scale and follow-up interviews,\nproviding both quantitative and qualitative insights. The analysis involved\ndescriptive statistics to summarize demographic data, AI usage patterns, and\nperceived effectiveness, as well as inferential statistics (T-tests, ANOVA) to\nexamine the impact of demographic factors on AI adoption. Regression analysis\nidentified predictors of AI adoption, and qualitative responses were\nthematically analyzed to understand students' perspectives on the future of AI\nin education. This mixed-methods approach provided a comprehensive view of AI's\nrole in education and highlighted the importance of privacy, transparency, and\ncontinuous refinement of AI features to maximize their educational benefits.\n","authors":["Ben Ward","Deepshikha Bhati","Fnu Neha","Angela Guercio"],"pdf_url":"https://arxiv.org/pdf/2412.02166v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.01253v2","updated":"2024-12-03T04:51:10Z","published":"2024-12-02T08:22:56Z","title":"Yi-Lightning Technical Report","summary":" This technical report presents Yi-Lightning, our latest flagship large\nlanguage model (LLM). 
It achieves exceptional performance, ranking 6th overall\non Chatbot Arena, with particularly strong results (2nd to 4th place) in\nspecialized categories including Chinese, Math, Coding, and Hard Prompts.\nYi-Lightning leverages an enhanced Mixture-of-Experts (MoE) architecture,\nfeaturing advanced expert segmentation and routing mechanisms coupled with\noptimized KV-caching techniques. Our development process encompasses\ncomprehensive pre-training, supervised fine-tuning (SFT), and reinforcement\nlearning from human feedback (RLHF), where we devise deliberate strategies for\nmulti-stage training, synthetic data construction, and reward modeling.\nFurthermore, we implement RAISE (Responsible AI Safety Engine), a\nfour-component framework to address safety issues across pre-training,\npost-training, and serving phases. Empowered by our scalable super-computing\ninfrastructure, all these innovations substantially reduce training, deployment\nand inference costs while maintaining high-performance standards. With further\nevaluations on public academic benchmarks, Yi-Lightning demonstrates\ncompetitive performance against top-tier LLMs, while we observe a notable\ndisparity between traditional, static benchmark results and real-world, dynamic\nhuman preferences. This observation prompts a critical reassessment of\nconventional benchmarks' utility in guiding the development of more intelligent\nand powerful AI systems for practical applications. Yi-Lightning is now\navailable through our developer platform at https://platform.lingyiwanwu.com.\n","authors":["01. AI"," :","Alan Wake","Albert Wang","Bei Chen","C. X. 
Lv","Chao Li","Chengen Huang","Chenglin Cai","Chujie Zheng","Daniel Cooper","Ethan Dai","Fan Zhou","Feng Hu","Heng Ji","Howard Qiu","Jiangcheng Zhu","Jun Tian","Katherine Su","Lihuan Zhang","Liying Li","Ming Song","Mou Li","Peng Liu","Qichen Hu","Shawn Wang","Shijun Zhou","Shiyong Li","Tianhang Zhu","Wen Xie","Xiang He","Xiaobo Chen","Xiaohui Hu","Xiaoyi Ren","Xinyao Niu","Yanpeng Li","Yongke Zhao","Yongzhen Luo","Yuchi Xu","Yuxuan Sha","Zhaodong Yan","Zhiyuan Liu","Zirui Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.01253v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.01269v2","updated":"2024-12-03T04:37:03Z","published":"2024-12-02T08:35:54Z","title":"CPRM: A LLM-based Continual Pre-training Framework for Relevance\n Modeling in Commercial Search","summary":" Relevance modeling between queries and items stands as a pivotal component in\ncommercial search engines, directly affecting the user experience. Given the\nremarkable achievements of large language models (LLMs) in various natural\nlanguage processing (NLP) tasks, LLM-based relevance modeling is gradually\nbeing adopted within industrial search systems. Nevertheless, foundational LLMs\nlack domain-specific knowledge and do not fully exploit the potential of\nin-context learning. Furthermore, structured item text remains underutilized,\nand there is a shortage in the supply of corresponding queries and background\nknowledge. We thereby propose CPRM (Continual Pre-training for Relevance\nModeling), a framework designed for the continual pre-training of LLMs to\naddress these issues. 
Our CPRM framework includes three modules: 1) employing\nboth queries and multi-field item to jointly pre-train for enhancing domain\nknowledge, 2) applying in-context pre-training, a novel approach where LLMs are\npre-trained on a sequence of related queries or items, and 3) conducting\nreading comprehension on items to produce associated domain knowledge and\nbackground information (e.g., generating summaries and corresponding queries)\nto further strengthen LLMs. Results on offline experiments and online A/B\ntesting demonstrate that our model achieves convincing performance compared to\nstrong baselines.\n","authors":["Kaixin Wu","Yixin Ji","Zeyuan Chen","Qiang Wang","Cunxiang Wang","Hong Liu","Baijun Ji","Jia Xu","Zhongyi Liu","Jinjie Gu","Yuan Zhou","Linjian Mo"],"pdf_url":"https://arxiv.org/pdf/2412.01269v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02159v1","updated":"2024-12-03T04:34:58Z","published":"2024-12-03T04:34:58Z","title":"Jailbreak Defense in a Narrow Domain: Limitations of Existing Methods\n and a New Transcript-Classifier Approach","summary":" Defending large language models against jailbreaks so that they never engage\nin a broadly-defined set of forbidden behaviors is an open problem. In this\npaper, we investigate the difficulty of jailbreak-defense when we only want to\nforbid a narrowly-defined set of behaviors. As a case study, we focus on\npreventing an LLM from helping a user make a bomb. We find that popular\ndefenses such as safety training, adversarial training, and input/output\nclassifiers are unable to fully solve this problem. In pursuit of a better\nsolution, we develop a transcript-classifier defense which outperforms the\nbaseline defenses we test. However, our classifier defense still fails in some\ncircumstances, which highlights the difficulty of jailbreak-defense even in a\nnarrow domain.\n","authors":["Tony T. 
Wang","John Hughes","Henry Sleight","Rylan Schaeffer","Rajashree Agrawal","Fazl Barez","Mrinank Sharma","Jesse Mu","Nir Shavit","Ethan Perez"],"pdf_url":"https://arxiv.org/pdf/2412.02159v1.pdf","comment":"Accepted to the AdvML-Frontiers and SoLaR workshops at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.12361v3","updated":"2024-12-03T04:34:09Z","published":"2024-10-16T08:24:09Z","title":"Proactive Agent: Shifting LLM Agents from Reactive Responses to Active\n Assistance","summary":" Agents powered by large language models have shown remarkable abilities in\nsolving complex tasks. However, most agent systems remain reactive, limiting\ntheir effectiveness in scenarios requiring foresight and autonomous\ndecision-making. In this paper, we tackle the challenge of developing proactive\nagents capable of anticipating and initiating tasks without explicit human\ninstructions. We propose a novel data-driven approach for this problem.\nFirstly, we collect real-world human activities to generate proactive task\npredictions. These predictions are then labeled by human annotators as either\naccepted or rejected. The labeled data is used to train a reward model that\nsimulates human judgment and serves as an automatic evaluator of the\nproactiveness of LLM agents. Building on this, we develop a comprehensive data\ngeneration pipeline to create a diverse dataset, ProactiveBench, containing\n6,790 events. Finally, we demonstrate that fine-tuning models with the proposed\nProactiveBench can significantly elicit the proactiveness of LLM agents.\nExperimental results show that our fine-tuned model achieves an F1-Score of\n66.47% in proactively offering assistance, outperforming all open-source and\nclose-source models. 
These results highlight the potential of our method in\ncreating more proactive and effective agent systems, paving the way for future\nadvancements in human-agent collaboration.\n","authors":["Yaxi Lu","Shenzhi Yang","Cheng Qian","Guirong Chen","Qinyu Luo","Yesai Wu","Huadong Wang","Xin Cong","Zhong Zhang","Yankai Lin","Weiwen Liu","Yasheng Wang","Zhiyuan Liu","Fangming Liu","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2410.12361v3.pdf","comment":"9 pages, 4 figures"},{"id":"http://arxiv.org/abs/2412.02155v1","updated":"2024-12-03T04:29:27Z","published":"2024-12-03T04:29:27Z","title":"CausalMob: Causal Human Mobility Prediction with LLMs-derived Human\n Intentions toward Public Events","summary":" Large-scale human mobility exhibits spatial and temporal patterns that can\nassist policymakers in decision making. Although traditional prediction models\nattempt to capture these patterns, they often interfered by non-periodic public\nevents, such as disasters and occasional celebrations. Since regular human\nmobility patterns are heavily affected by these events, estimating their causal\neffects is critical to accurate mobility predictions. Although news articles\nprovide unique perspectives on these events in an unstructured format,\nprocessing is a challenge. In this study, we propose a causality-augmented\nprediction model, called \\textbf{CausalMob}, to analyze the causal effects of\npublic events. We first utilize large language models (LLMs) to extract human\nintentions from news articles and transform them into features that act as\ncausal treatments. Next, the model learns representations of spatio-temporal\nregional covariates from multiple data sources to serve as confounders for\ncausal inference. 
Finally, we present a causal effect estimation framework to\nensure event features remain independent of confounders during prediction.\nBased on large-scale real-world data, the experimental results show that the\nproposed model excels in human mobility prediction, outperforming\nstate-of-the-art models.\n","authors":["Xiaojie Yang","Hangli Ge","Jiawei Wang","Zipei Fan","Renhe Jiang","Ryosuke Shibasaki","Noboru Koshizuka"],"pdf_url":"https://arxiv.org/pdf/2412.02155v1.pdf","comment":"Accepted by KDD 2025"},{"id":"http://arxiv.org/abs/2412.02154v1","updated":"2024-12-03T04:28:58Z","published":"2024-12-03T04:28:58Z","title":"Failure Probability Estimation for Black-Box Autonomous Systems using\n State-Dependent Importance Sampling Proposals","summary":" Estimating the probability of failure is a critical step in developing\nsafety-critical autonomous systems. Direct estimation methods such as Monte\nCarlo sampling are often impractical due to the rarity of failures in these\nsystems. Existing importance sampling approaches do not scale to sequential\ndecision-making systems with large state spaces and long horizons. We propose\nan adaptive importance sampling algorithm to address these limitations. Our\nmethod minimizes the forward Kullback-Leibler divergence between a\nstate-dependent proposal distribution and a relaxed form of the optimal\nimportance sampling distribution. Our method uses Markov score ascent methods\nto estimate this objective. We evaluate our approach on four sequential systems\nand show that it provides more accurate failure probability estimates than\nbaseline Monte Carlo and importance sampling techniques. This work is open\nsourced.\n","authors":["Harrison Delecki","Sydney M. Katz","Mykel J. 
Kochenderfer"],"pdf_url":"https://arxiv.org/pdf/2412.02154v1.pdf","comment":"Submitted to L4DC 2025"},{"id":"http://arxiv.org/abs/2412.02153v1","updated":"2024-12-03T04:28:14Z","published":"2024-12-03T04:28:14Z","title":"Revisiting the Initial Steps in Adaptive Gradient Descent Optimization","summary":" Adaptive gradient optimization methods, such as Adam, are prevalent in\ntraining deep neural networks across diverse machine learning tasks due to\ntheir ability to achieve faster convergence. However, these methods often\nsuffer from suboptimal generalization compared to stochastic gradient descent\n(SGD) and exhibit instability, particularly when training Transformer models.\nIn this work, we show the standard initialization of the second-order moment\nestimation ($v_0 =0$) as a significant factor contributing to these\nlimitations. We introduce simple yet effective solutions: initializing the\nsecond-order moment estimation with non-zero values, using either data-driven\nor random initialization strategies. Empirical evaluations demonstrate that our\napproach not only stabilizes convergence but also enhances the final\nperformance of adaptive gradient optimizers. Furthermore, by adopting the\nproposed initialization strategies, Adam achieves performance comparable to\nmany recently proposed variants of adaptive gradient optimization methods,\nhighlighting the practical impact of this straightforward modification.\n","authors":["Abulikemu Abuduweili","Changliu Liu"],"pdf_url":"https://arxiv.org/pdf/2412.02153v1.pdf","comment":"OPT workshop at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2412.01490v2","updated":"2024-12-03T04:27:23Z","published":"2024-12-02T13:41:38Z","title":"Intelligent Spark Agents: A Modular LangGraph Framework for Scalable,\n Visualized, and Enhanced Big Data Machine Learning Workflows","summary":" Apache Spark is better suited for load data mining and machine learning that\nrequire a lot of iteration by using memory-distributed data sets. 
Due to the\ncomplexity of Spark, the high learning threshold of Scala, and the low\nreusability of its code, this paper designs and implements a Spark-based visual\nprocess AI+machine learning method under a big data environment. On the one\nhand, it designs component models to describe the basic steps of machine\nlearning, including data preprocessing, feature processing, and model training.\nPractice and validate evaluation. On the other hand, a visual process modeling\ntool is provided to support analysts to design machine learning processes,\nwhich can be translated automatically into Spark platform code for efficient\nexecution. This tool can greatly improve the AI machine learning efficiency of\nthe Spark platform. This paper introduces the method theory, key technologies,\nand effectiveness of the tool.\n This paper explores the application of Spark in the field of large model\nagents. Langchain, as an open-source framework, is committed to simplifying the\ndevelopment of end-to-end applications based on language models. It provides\ninterfaces for interacting with a variety of large language models, optimizing\nprompt engineering, and endowing large models with the ability to invoke\nexternal tools. LangGraph demonstrates its powerful state management and graph\nconstruction capabilities by defining node functions and graphs to build\ncomplex agent applications. The development of Spark agent applications based\non LangGraph has further promoted the development of AI applications in the big\ndata analysis environment .\n","authors":["Jialin Wang","Zhihua Duan"],"pdf_url":"https://arxiv.org/pdf/2412.01490v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.10374v2","updated":"2024-12-03T04:25:30Z","published":"2024-07-15T00:48:06Z","title":"An Empirical Study of Mamba-based Pedestrian Attribute Recognition","summary":" Current strong pedestrian attribute recognition models are developed based on\nTransformer networks, which are computationally heavy. 
Recently proposed models\nwith linear complexity (e.g., Mamba) have garnered significant attention and\nhave achieved a good balance between accuracy and computational cost across a\nvariety of visual tasks. Relevant review articles also suggest that while these\nmodels can perform well on some pedestrian attribute recognition datasets, they\nare generally weaker than the corresponding Transformer models. To further tap\ninto the potential of the novel Mamba architecture for PAR tasks, this paper\ndesigns and adapts Mamba into two typical PAR frameworks, i.e., the text-image\nfusion approach and pure vision Mamba multi-label recognition framework. It is\nfound that interacting with attribute tags as additional input does not always\nlead to an improvement, specifically, Vim can be enhanced, but VMamba cannot.\nThis paper further designs various hybrid Mamba-Transformer variants and\nconducts thorough experimental validations. These experimental results indicate\nthat simply enhancing Mamba with a Transformer does not always lead to\nperformance improvements but yields better results under certain settings. We\nhope this empirical study can further inspire research in Mamba for PAR, and\neven extend into the domain of multi-label recognition, through the design of\nthese network structures and comprehensive experimentation. 
The source code of\nthis work will be released at \\url{https://github.com/Event-AHU/OpenPAR}\n","authors":["Xiao Wang","Weizhe Kong","Jiandong Jin","Shiao Wang","Ruichong Gao","Qingchuan Ma","Chenglong Li","Jin Tang"],"pdf_url":"https://arxiv.org/pdf/2407.10374v2.pdf","comment":"In Peer Review"},{"id":"http://arxiv.org/abs/2410.00475v3","updated":"2024-12-03T04:23:06Z","published":"2024-10-01T08:05:19Z","title":"Probabilistic Analysis of Copyright Disputes and Generative AI Safety","summary":" This paper presents a probabilistic approach to analyzing copyright\ninfringement disputes by formalizing relevant judicial principles within a\ncoherent framework based on the random-worlds method. It provides a structured\nanalysis of key evidentiary principles, with a particular focus on the\n``inverse ratio rule\"--a controversial doctrine adopted by some courts.\nAlthough this rule has faced significant criticism, a formal proof demonstrates\nits validity, provided it is properly defined. Additionally, the paper examines\nthe heightened copyright risks posed by generative AI, highlighting how\nextensive access to copyrighted material by generative models increases the\nrisk of infringement. Utilizing the probabilistic approach, the Near\nAccess-Free (NAF) condition, previously proposed as a potential mitigation\nstrategy, is evaluated. The analysis reveals that while the NAF condition\nmitigates some infringement risks, its justifiability and efficacy are\nquestionable in certain contexts. 
These findings demonstrate how a rigorous\nprobabilistic approach can advance our understanding of copyright jurisprudence\nand its interaction with emerging technologies.\n","authors":["Hiroaki Chiba-Okabe"],"pdf_url":"https://arxiv.org/pdf/2410.00475v3.pdf","comment":"19 pages"},{"id":"http://arxiv.org/abs/2406.08666v2","updated":"2024-12-03T04:22:40Z","published":"2024-06-12T22:12:03Z","title":"Interventional Causal Discovery in a Mixture of DAGs","summary":" Causal interactions among a group of variables are often modeled by a single\ncausal graph. In some domains, however, these interactions are best described\nby multiple co-existing causal graphs, e.g., in dynamical systems or genomics.\nThis paper addresses the hitherto unknown role of interventions in learning\ncausal interactions among variables governed by a mixture of causal systems,\neach modeled by one directed acyclic graph (DAG). Causal discovery from\nmixtures is fundamentally more challenging than single-DAG causal discovery.\nTwo major difficulties stem from (i)~an inherent uncertainty about the\nskeletons of the component DAGs that constitute the mixture and (ii)~possibly\ncyclic relationships across these component DAGs. This paper addresses these\nchallenges and aims to identify edges that exist in at least one component DAG\nof the mixture, referred to as the true edges. First, it establishes matching\nnecessary and sufficient conditions on the size of interventions required to\nidentify the true edges. Next, guided by the necessity results, an adaptive\nalgorithm is designed that learns all true edges using $O(n^2)$ interventions,\nwhere $n$ is the number of nodes. Remarkably, the size of the interventions is\noptimal if the underlying mixture model does not contain cycles across its\ncomponents. More generally, the gap between the intervention size used by the\nalgorithm and the optimal size is quantified. 
It is shown to be bounded by the\ncyclic complexity number of the mixture model, defined as the size of the\nminimal intervention that can break the cycles in the mixture, which is upper\nbounded by the number of cycles among the ancestors of a node.\n","authors":["Burak Varıcı","Dmitriy Katz-Rogozhnikov","Dennis Wei","Prasanna Sattigeri","Ali Tajer"],"pdf_url":"https://arxiv.org/pdf/2406.08666v2.pdf","comment":"NeurIPS 2024 camera-ready version"},{"id":"http://arxiv.org/abs/2410.18142v2","updated":"2024-12-03T04:19:36Z","published":"2024-10-22T13:03:28Z","title":"Analyzing Nobel Prize Literature with Large Language Models","summary":" This study examines the capabilities of advanced Large Language Models\n(LLMs), particularly the o1 model, in the context of literary analysis. The\noutputs of these models are compared directly to those produced by\ngraduate-level human participants. By focusing on two Nobel Prize-winning short\nstories, 'Nine Chapters' by Han Kang, the 2024 laureate, and 'Friendship' by\nJon Fosse, the 2023 laureate, the research explores the extent to which AI can\nengage with complex literary elements such as thematic analysis,\nintertextuality, cultural and historical contexts, linguistic and structural\ninnovations, and character development. Given the Nobel Prize's prestige and\nits emphasis on cultural, historical, and linguistic richness, applying LLMs to\nthese works provides a deeper understanding of both human and AI approaches to\ninterpretation. The study uses qualitative and quantitative evaluations of\ncoherence, creativity, and fidelity to the text, revealing the strengths and\nlimitations of AI in tasks typically reserved for human expertise. While LLMs\ndemonstrate strong analytical capabilities, particularly in structured tasks,\nthey often fall short in emotional nuance and coherence, areas where human\ninterpretation excels. 
This research underscores the potential for human-AI\ncollaboration in the humanities, opening new opportunities in literary studies\nand beyond.\n","authors":["Zhenyuan Yang","Zhengliang Liu","Jing Zhang","Cen Lu","Jiaxin Tai","Tianyang Zhong","Yiwei Li","Siyan Zhao","Teng Yao","Qing Liu","Jinlin Yang","Qixin Liu","Zhaowei Li","Kexin Wang","Longjun Ma","Dajiang Zhu","Yudan Ren","Bao Ge","Wei Zhang","Ning Qiang","Tuo Zhang","Tianming Liu"],"pdf_url":"https://arxiv.org/pdf/2410.18142v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02148v1","updated":"2024-12-03T04:09:19Z","published":"2024-12-03T04:09:19Z","title":"Mining Tweets to Predict Future Bitcoin Price","summary":" Bitcoin has increased investment interests in people during the last decade.\nWe have seen an increase in the number of posts on social media platforms about\ncryptocurrency, especially Bitcoin. This project focuses on analyzing user\ntweet data in combination with Bitcoin price data to see the relevance between\nprice fluctuations and the conversation between millions of people on Twitter.\nThis study also exploits this relationship between user tweets and bitcoin\nprices to predict the future bitcoin price. We are utilizing novel techniques\nand methods to analyze the data and make price predictions.\n","authors":["Ashutosh Hathidara","Gaurav Atavale","Suyash Chaudhary"],"pdf_url":"https://arxiv.org/pdf/2412.02148v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.00382v4","updated":"2024-12-03T04:07:32Z","published":"2024-06-29T09:35:12Z","title":"Towards Universal Mesh Movement Networks","summary":" Solving complex Partial Differential Equations (PDEs) accurately and\nefficiently is an essential and challenging problem in all scientific and\nengineering disciplines. Mesh movement methods provide the capability to\nimprove the accuracy of the numerical solution without increasing the overall\nmesh degree of freedom count. 
Conventional sophisticated mesh movement methods\nare extremely expensive and struggle to handle scenarios with complex boundary\ngeometries. However, existing learning-based methods require re-training from\nscratch given a different PDE type or boundary geometry, which limits their\napplicability, and also often suffer from robustness issues in the form of\ninverted elements. In this paper, we introduce the Universal Mesh Movement\nNetwork (UM2N), which -- once trained -- can be applied in a non-intrusive,\nzero-shot manner to move meshes with different size distributions and\nstructures, for solvers applicable to different PDE types and boundary\ngeometries. UM2N consists of a Graph Transformer (GT) encoder for extracting\nfeatures and a Graph Attention Network (GAT) based decoder for moving the mesh.\nWe evaluate our method on advection and Navier-Stokes based examples, as well\nas a real-world tsunami simulation case. Our method outperforms existing\nlearning-based mesh movement methods in terms of the benchmarks described\nabove. In comparison to the conventional sophisticated Monge-Amp\\`ere\nPDE-solver based method, our approach not only significantly accelerates mesh\nmovement, but also proves effective in scenarios where the conventional method\nfails. Our project page is at https://erizmr.github.io/UM2N/.\n","authors":["Mingrui Zhang","Chunyang Wang","Stephan Kramer","Joseph G. Wallwork","Siyi Li","Jiancheng Liu","Xiang Chen","Matthew D. 
Piggott"],"pdf_url":"https://arxiv.org/pdf/2407.00382v4.pdf","comment":"Accepted at NeurIPS 2024 as a spotlight paper"},{"id":"http://arxiv.org/abs/2412.02142v1","updated":"2024-12-03T03:59:03Z","published":"2024-12-03T03:59:03Z","title":"Personalized Multimodal Large Language Models: A Survey","summary":" Multimodal Large Language Models (MLLMs) have become increasingly important\ndue to their state-of-the-art performance and ability to integrate multiple\ndata modalities, such as text, images, and audio, to perform complex tasks with\nhigh accuracy. This paper presents a comprehensive survey on personalized\nmultimodal large language models, focusing on their architecture, training\nmethods, and applications. We propose an intuitive taxonomy for categorizing\nthe techniques used to personalize MLLMs to individual users, and discuss the\ntechniques accordingly. Furthermore, we discuss how such techniques can be\ncombined or adapted when appropriate, highlighting their advantages and\nunderlying rationale. We also provide a succinct summary of personalization\ntasks investigated in existing research, along with the evaluation metrics\ncommonly used. Additionally, we summarize the datasets that are useful for\nbenchmarking personalized MLLMs. Finally, we outline critical open challenges.\nThis survey aims to serve as a valuable resource for researchers and\npractitioners seeking to understand and advance the development of personalized\nmultimodal large language models.\n","authors":["Junda Wu","Hanjia Lyu","Yu Xia","Zhehao Zhang","Joe Barrow","Ishita Kumar","Mehrnoosh Mirtaheri","Hongjie Chen","Ryan A. Rossi","Franck Dernoncourt","Tong Yu","Ruiyi Zhang","Jiuxiang Gu","Nesreen K. 
Ahmed","Yu Wang","Xiang Chen","Hanieh Deilamsalehy","Namyong Park","Sungchul Kim","Huanrui Yang","Subrata Mitra","Zhengmian Hu","Nedim Lipka","Dang Nguyen","Yue Zhao","Jiebo Luo","Julian McAuley"],"pdf_url":"https://arxiv.org/pdf/2412.02142v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18938v2","updated":"2024-12-03T03:56:52Z","published":"2024-09-27T17:38:36Z","title":"From Seconds to Hours: Reviewing MultiModal Large Language Models on\n Comprehensive Long Video Understanding","summary":" The integration of Large Language Models (LLMs) with visual encoders has\nrecently shown promising performance in visual understanding tasks, leveraging\ntheir inherent capability to comprehend and generate human-like text for visual\nreasoning. Given the diverse nature of visual data, MultiModal Large Language\nModels (MM-LLMs) exhibit variations in model designing and training for\nunderstanding images, short videos, and long videos. Our paper focuses on the\nsubstantial differences and unique challenges posed by long video understanding\ncompared to static image and short video understanding. Unlike static images,\nshort videos encompass sequential frames with both spatial and within-event\ntemporal information, while long videos consist of multiple events with\nbetween-event and long-term temporal information. In this survey, we aim to\ntrace and summarize the advancements of MM-LLMs from image understanding to\nlong video understanding. We review the differences among various visual\nunderstanding tasks and highlight the challenges in long video understanding,\nincluding more fine-grained spatiotemporal details, dynamic events, and\nlong-term dependencies. We then provide a detailed summary of the advancements\nin MM-LLMs in terms of model design and training methodologies for\nunderstanding long videos. 
Finally, we compare the performance of existing\nMM-LLMs on video understanding benchmarks of various lengths and discuss\npotential future directions for MM-LLMs in long video understanding.\n","authors":["Heqing Zou","Tianze Luo","Guiyang Xie"," Victor"," Zhang","Fengmao Lv","Guangcong Wang","Junyang Chen","Zhuochen Wang","Hansheng Zhang","Huaijian Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.18938v2.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2412.02136v1","updated":"2024-12-03T03:49:27Z","published":"2024-12-03T03:49:27Z","title":"Graph Learning for Planning: The Story Thus Far and Open Challenges","summary":" Graph learning is naturally well suited for use in planning due to its\nability to exploit relational structures exhibited in planning domains and to\ntake as input planning instances with arbitrary number of objects. In this\npaper, we study the usage of graph learning for planning thus far by studying\nthe theoretical and empirical effects on learning and planning performance of\n(1) graph representations of planning tasks, (2) graph learning architectures,\nand (3) optimisation formulations for learning. Our studies accumulate in the\nGOOSE framework which learns domain knowledge from small planning tasks in\norder to scale up to much larger planning tasks. In this paper, we also\nhighlight and propose the 5 open challenges in the general Learning for\nPlanning field that we believe need to be addressed for advancing the\nstate-of-the-art.\n","authors":["Dillon Z. 
Chen","Mingyu Hao","Sylvie Thiébaux","Felipe Trevizan"],"pdf_url":"https://arxiv.org/pdf/2412.02136v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.16578v2","updated":"2024-12-03T03:49:24Z","published":"2024-06-24T12:14:24Z","title":"QuadrupedGPT: Towards a Versatile Quadruped Agent in Open-ended Worlds","summary":" As robotic agents increasingly assist humans in reality, quadruped robots\noffer unique opportunities for interaction in complex scenarios due to their\nagile movement. However, building agents that can autonomously navigate, adapt,\nand respond to versatile goals remains a significant challenge. In this work,\nwe introduce QuadrupedGPT designed to follow diverse commands with agility\ncomparable to that of a pet. The primary challenges addressed include: i)\neffectively utilizing multimodal observations for informed decision-making; ii)\nachieving agile control by integrating locomotion and navigation; iii)\ndeveloping advanced cognition to execute long-term objectives. Our QuadrupedGPT\ninterprets human commands and environmental contexts using a large multimodal\nmodel. Leveraging its extensive knowledge base, the agent autonomously assigns\nparameters for adaptive locomotion policies and devises safe yet efficient\npaths toward its goals. Additionally, it employs high-level reasoning to\ndecompose long-term goals into a sequence of executable subgoals. 
Through\ncomprehensive experiments, our agent shows proficiency in handling diverse\ntasks and intricate instructions, representing a significant step toward the\ndevelopment of versatile quadruped agents for open-ended environments.\n","authors":["Yuting Mei","Ye Wang","Sipeng Zheng","Qin Jin"],"pdf_url":"https://arxiv.org/pdf/2406.16578v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2412.02130v1","updated":"2024-12-03T03:36:42Z","published":"2024-12-03T03:36:42Z","title":"A privacy-preserving distributed credible evidence fusion algorithm for\n collective decision-making","summary":" The theory of evidence reasoning has been applied to collective\ndecision-making in recent years. However, existing distributed evidence fusion\nmethods lead to participants' preference leakage and fusion failures as they\ndirectly exchange raw evidence and do not assess evidence credibility like\ncentralized credible evidence fusion (CCEF) does. To do so, a\nprivacy-preserving distributed credible evidence fusion method with three-level\nconsensus (PCEF) is proposed in this paper. In evidence difference measure\n(EDM) neighbor consensus, an evidence-free equivalent expression of EDM among\nneighbored agents is derived with the shared dot product protocol for pignistic\nprobability and the identical judgment of two events with maximal subjective\nprobabilities, so that evidence privacy is guaranteed due to such irreversible\nevidence transformation. In EDM network consensus, the non-neighbored EDMs are\ninferred and neighbored EDMs reach uniformity via interaction between linear\naverage consensus (LAC) and low-rank matrix completion with rank adaptation to\nguarantee EDM consensus convergence and no solution of inferring raw evidence\nin numerical iteration style. 
In fusion network consensus, a privacy-preserving\nLAC with a self-cancelling differential privacy term is proposed, where each\nagent adds its randomness to the sharing content and step-by-step cancels such\nrandomness in consensus iterations. Besides, the sufficient condition of the\nconvergence to the CCEF is explored, and it is proven that raw evidence is\nimpossibly inferred in such an iterative consensus. The simulations show that\nPCEF is close to CCEF both in credibility and fusion results and obtains higher\ndecision accuracy with less time-comsuming than existing methods.\n","authors":["Chaoxiong Ma","Yan Liang","Xinyu Yang","Han Wu","Huixia Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.02130v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.00051v2","updated":"2024-12-03T03:33:50Z","published":"2024-11-24T06:39:06Z","title":"TransFair: Transferring Fairness from Ocular Disease Classification to\n Progression Prediction","summary":" The use of artificial intelligence (AI) in automated disease classification\nsignificantly reduces healthcare costs and improves the accessibility of\nservices. However, this transformation has given rise to concerns about the\nfairness of AI, which disproportionately affects certain groups, particularly\npatients from underprivileged populations. Recently, a number of methods and\nlarge-scale datasets have been proposed to address group performance\ndisparities. Although these methods have shown effectiveness in disease\nclassification tasks, they may fall short in ensuring fair prediction of\ndisease progression, mainly because of limited longitudinal data with diverse\ndemographics available for training a robust and equitable prediction model. In\nthis paper, we introduce TransFair to enhance demographic fairness in\nprogression prediction for ocular diseases. TransFair aims to transfer a\nfairness-enhanced disease classification model to the task of progression\nprediction with fairness preserved. 
Specifically, we train a fair EfficientNet,\ntermed FairEN, equipped with a fairness-aware attention mechanism using\nextensive data for ocular disease classification. Subsequently, this fair\nclassification model is adapted to a fair progression prediction model through\nknowledge distillation, which aims to minimize the latent feature distances\nbetween the classification and progression prediction models. We evaluate\nFairEN and TransFair for fairness-enhanced ocular disease classification and\nprogression prediction using both two-dimensional (2D) and 3D retinal images.\nExtensive experiments and comparisons with models with and without considering\nfairness learning show that TransFair effectively enhances demographic equity\nin predicting ocular disease progression.\n","authors":["Leila Gheisi","Henry Chu","Raju Gottumukkala","Yan Luo","Xingquan Zhu","Mengyu Wang","Min Shi"],"pdf_url":"https://arxiv.org/pdf/2412.00051v2.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2412.02126v1","updated":"2024-12-03T03:29:27Z","published":"2024-12-03T03:29:27Z","title":"Benchmarking symbolic regression constant optimization schemes","summary":" Symbolic regression is a machine learning technique, and it has seen many\nadvancements in recent years, especially in genetic programming approaches\n(GPSR). Furthermore, it has been known for many years that constant\noptimization of parameters, during the evolutionary search, greatly increases\nGPSR performance However, different authors approach such tasks differently and\nno consensus exists regarding which methods perform best. In this work, we\nevaluate eight different parameter optimization methods, applied during\nevolutionary search, over ten known benchmark problems, in two different\nscenarios. We also propose using an under-explored metric called Tree Edit\nDistance (TED), aiming to identify symbolic accuracy. 
In conjunction with\nclassical error measures, we develop a combined analysis of model performance\nin symbolic regression. We then show that different constant optimization\nmethods perform better in certain scenarios and that there is no overall best\nchoice for every problem. Finally, we discuss how common metric decisions may\nbe biased and appear to generate better models in comparison.\n","authors":["L. G. A dos Reis","V. L. P. S. Caminha","T. J. P. Penna"],"pdf_url":"https://arxiv.org/pdf/2412.02126v1.pdf","comment":"9 pages, 10 figures, 2 tables"},{"id":"http://arxiv.org/abs/2412.02125v1","updated":"2024-12-03T03:27:48Z","published":"2024-12-03T03:27:48Z","title":"Optimizing Latent Goal by Learning from Trajectory Preference","summary":" A glowing body of work has emerged focusing on instruction-following policies\nfor open-world agents, aiming to better align the agent's behavior with human\nintentions. However, the performance of these policies is highly susceptible to\nthe initial prompt, which leads to extra efforts in selecting the best\ninstructions. We propose a framework named Preference Goal Tuning (PGT). PGT\nallows an instruction following policy to interact with the environment to\ncollect several trajectories, which will be categorized into positive and\nnegative samples based on preference. Then we use preference learning to\nfine-tune the initial goal latent representation with the categorized\ntrajectories while keeping the policy backbone frozen. The experiment result\nshows that with minimal data and training, PGT achieves an average relative\nimprovement of 72.0% and 81.6% over 17 tasks in 2 different foundation policies\nrespectively, and outperforms the best human-selected instructions. Moreover,\nPGT surpasses full fine-tuning in the out-of-distribution (OOD) task-execution\nenvironments by 13.4%, indicating that our approach retains strong\ngeneralization capabilities. 
Since our approach stores a single latent\nrepresentation for each task independently, it can be viewed as an efficient\nmethod for continual learning, without the risk of catastrophic forgetting or\ntask interference. In short, PGT enhances the performance of agents across\nnearly all tasks in the Minecraft Skillforge benchmark and demonstrates\nrobustness to the execution environment.\n","authors":["Guangyu Zhao","Kewei Lian","Haowei Lin","Haobo Fu","Qiang Fu","Shaofei Cai","Zihao Wang","Yitao Liang"],"pdf_url":"https://arxiv.org/pdf/2412.02125v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.05650v2","updated":"2024-12-03T03:24:23Z","published":"2022-07-12T16:30:34Z","title":"A Single-Loop Gradient Descent and Perturbed Ascent Algorithm for\n Nonconvex Functional Constrained Optimization","summary":" Nonconvex constrained optimization problems can be used to model a number of\nmachine learning problems, such as multi-class Neyman-Pearson classification\nand constrained Markov decision processes. However, such kinds of problems are\nchallenging because both the objective and constraints are possibly nonconvex,\nso it is difficult to balance the reduction of the loss value and reduction of\nconstraint violation. Although there are a few methods that solve this class of\nproblems, all of them are double-loop or triple-loop algorithms, and they\nrequire oracles to solve some subproblems up to certain accuracy by tuning\nmultiple hyperparameters at each iteration. In this paper, we propose a novel\ngradient descent and perturbed ascent (GDPA) algorithm to solve a class of\nsmooth nonconvex inequality constrained problems. The GDPA is a primal-dual\nalgorithm, which only exploits the first-order information of both the\nobjective and constraint functions to update the primal and dual variables in\nan alternating way. The key feature of the proposed algorithm is that it is a\nsingle-loop algorithm, where only two step-sizes need to be tuned. 
We show that\nunder a mild regularity condition GDPA is able to find Karush-Kuhn-Tucker (KKT)\npoints of nonconvex functional constrained problems with convergence rate\nguarantees. To the best of our knowledge, it is the first single-loop algorithm\nthat can solve the general nonconvex smooth problems with nonconvex inequality\nconstraints. Numerical results also showcase the superiority of GDPA compared\nwith the best-known algorithms (in terms of both stationarity measure and\nfeasibility of the obtained solutions).\n","authors":["Songtao Lu"],"pdf_url":"https://arxiv.org/pdf/2207.05650v2.pdf","comment":"This work was published in the Proceedings of the Thirty-Ninth\n International Conference on Machine Learning (ICML 2022)"},{"id":"http://arxiv.org/abs/2406.03341v6","updated":"2024-12-03T03:23:05Z","published":"2024-06-05T14:58:32Z","title":"Tackling GenAI Copyright Issues: Originality Estimation and\n Genericization","summary":" The rapid progress of generative AI technology has sparked significant\ncopyright concerns, leading to numerous lawsuits filed against AI developers.\nNotably, generative AI's capacity for generating copyrighted characters has\nbeen well documented in the literature, and while various techniques for\nmitigating copyright issues have been studied, significant risks remain. Here,\nwe propose a genericization method that modifies the outputs of a generative\nmodel to make them more generic and less likely to imitate distinctive features\nof copyrighted materials. To achieve this, we introduce a metric for\nquantifying the level of originality of data, estimated by drawing samples from\na generative model, and applied in the genericization process. 
As a practical\nimplementation, we introduce PREGen (Prompt Rewriting-Enhanced Genericization),\nwhich combines our genericization method with an existing mitigation technique.\nCompared to the existing method, PREGen reduces the likelihood of generating\ncopyrighted characters by more than half when the names of copyrighted\ncharacters are used as the prompt. Additionally, while generative models can\nproduce copyrighted characters even when their names are not directly mentioned\nin the prompt, PREGen almost entirely prevents the generation of such\ncharacters in these cases.\n","authors":["Hiroaki Chiba-Okabe","Weijie J. Su"],"pdf_url":"https://arxiv.org/pdf/2406.03341v6.pdf","comment":"22 pages, 10 figures"},{"id":"http://arxiv.org/abs/2410.08559v4","updated":"2024-12-03T03:21:51Z","published":"2024-10-11T06:30:48Z","title":"Learning General Representation of 12-Lead Electrocardiogram with a\n Joint-Embedding Predictive Architecture","summary":" Electrocardiogram (ECG) captures the heart's electrical signals, offering\nvaluable information for diagnosing cardiac conditions. However, the scarcity\nof labeled data makes it challenging to fully leverage supervised learning in\nmedical domain. Self-supervised learning (SSL) offers a promising solution,\nenabling models to learn from unlabeled data and uncover meaningful patterns.\nIn this paper, we show that masked modeling in the latent space can be a\npowerful alternative to existing self-supervised methods in the ECG domain. We\nintroduce ECG-JEPA, a SSL model for 12-lead ECG analysis that learns semantic\nrepresentations of ECG data by predicting in the hidden latent space, bypassing\nthe need to reconstruct raw signals. This approach offers several advantages in\nthe ECG domain: (1) it avoids producing unnecessary details, such as noise,\nwhich is common in ECG; and (2) it addresses the limitations of na\\\"ive L2 loss\nbetween raw signals. 
Another key contribution is the introduction of\nCross-Pattern Attention (CroPA), a specialized masked attention mechanism\ntailored for 12-lead ECG data. ECG-JEPA is trained on the union of several open\nECG datasets, totaling approximately 180,000 samples, and achieves\nstate-of-the-art performance in various downstream tasks including ECG\nclassification and feature prediction. Our code is openly available at\nhttps://github.com/sehunfromdaegu/ECG_JEPA.\n","authors":["Sehun Kim"],"pdf_url":"https://arxiv.org/pdf/2410.08559v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.01197v2","updated":"2024-12-03T03:16:54Z","published":"2024-12-02T06:59:52Z","title":"InstantSwap: Fast Customized Concept Swapping across Sharp Shape\n Differences","summary":" Recent advances in Customized Concept Swapping (CCS) enable a text-to-image\nmodel to swap a concept in the source image with a customized target concept.\nHowever, the existing methods still face the challenges of inconsistency and\ninefficiency. They struggle to maintain consistency in both the foreground and\nbackground during concept swapping, especially when the shape difference is\nlarge between objects. Additionally, they either require time-consuming\ntraining processes or involve redundant calculations during inference. To\ntackle these issues, we introduce InstantSwap, a new CCS method that aims to\nhandle sharp shape disparity at speed. Specifically, we first extract the bbox\nof the object in the source image automatically based on attention map analysis\nand leverage the bbox to achieve both foreground and background consistency.\nFor background consistency, we remove the gradient outside the bbox during the\nswapping process so that the background is free from being modified. For\nforeground consistency, we employ a cross-attention mechanism to inject\nsemantic information into both source and target concepts inside the box. 
This\nhelps learn semantic-enhanced representations that encourage the swapping\nprocess to focus on the foreground objects. To improve swapping speed, we avoid\ncomputing gradients at each timestep but instead calculate them periodically to\nreduce the number of forward passes, which improves efficiency a lot with a\nlittle sacrifice on performance. Finally, we establish a benchmark dataset to\nfacilitate comprehensive evaluation. Extensive evaluations demonstrate the\nsuperiority and versatility of InstantSwap. Project Page:\nhttps://instantswap.github.io/\n","authors":["Chenyang Zhu","Kai Li","Yue Ma","Longxiang Tang","Chengyu Fang","Chubin Chen","Qifeng Chen","Xiu Li"],"pdf_url":"https://arxiv.org/pdf/2412.01197v2.pdf","comment":"Project Page: https://instantswap.github.io/. Github Page:\n https://github.com/chenyangzhu1/InstantSwap"},{"id":"http://arxiv.org/abs/2411.18279v3","updated":"2024-12-03T03:16:27Z","published":"2024-11-27T12:13:39Z","title":"Large Language Model-Brained GUI Agents: A Survey","summary":" GUIs have long been central to human-computer interaction, providing an\nintuitive and visually-driven way to access and interact with digital systems.\nThe advent of LLMs, particularly multimodal models, has ushered in a new era of\nGUI automation. They have demonstrated exceptional capabilities in natural\nlanguage understanding, code generation, and visual processing. This has paved\nthe way for a new generation of LLM-brained GUI agents capable of interpreting\ncomplex GUI elements and autonomously executing actions based on natural\nlanguage instructions. These agents represent a paradigm shift, enabling users\nto perform intricate, multi-step tasks through simple conversational commands.\nTheir applications span across web navigation, mobile app interactions, and\ndesktop automation, offering a transformative user experience that\nrevolutionizes how individuals interact with software. 
This emerging field is\nrapidly advancing, with significant progress in both research and industry.\n To provide a structured understanding of this trend, this paper presents a\ncomprehensive survey of LLM-brained GUI agents, exploring their historical\nevolution, core components, and advanced techniques. We address research\nquestions such as existing GUI agent frameworks, the collection and utilization\nof data for training specialized GUI agents, the development of large action\nmodels tailored for GUI tasks, and the evaluation metrics and benchmarks\nnecessary to assess their effectiveness. Additionally, we examine emerging\napplications powered by these agents. Through a detailed analysis, this survey\nidentifies key research gaps and outlines a roadmap for future advancements in\nthe field. By consolidating foundational knowledge and state-of-the-art\ndevelopments, this work aims to guide both researchers and practitioners in\novercoming challenges and unlocking the full potential of LLM-brained GUI\nagents.\n","authors":["Chaoyun Zhang","Shilin He","Jiaxu Qian","Bowen Li","Liqun Li","Si Qin","Yu Kang","Minghua Ma","Guyue Liu","Qingwei Lin","Saravan Rajmohan","Dongmei Zhang","Qi Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.18279v3.pdf","comment":"The collection of papers reviewed in this survey will be hosted and\n regularly updated on the GitHub repository:\n https://github.com/vyokky/LLM-Brained-GUI-Agents-Survey Additionally, a\n searchable webpage is available at https://aka.ms/gui-agent for easier access\n and exploration"},{"id":"http://arxiv.org/abs/2412.02114v1","updated":"2024-12-03T03:10:19Z","published":"2024-12-03T03:10:19Z","title":"OmniCreator: Self-Supervised Unified Generation with Universal Editing","summary":" We introduce OmniCreator, a novel framework that can conduct text-prompted\nunified (image+video) generation as well as editing all in one place.\nOmniCreator acquires generative and universal editing capabilities in 
a\nself-supervised manner, taking original text-video pairs as conditions while\nutilizing the same video as a denoising target to learn the semantic\ncorrespondence between video and text. During inference, when presented with a\ntext prompt and a video, OmniCreator is capable of generating a target that is\nfaithful to both, achieving a universal editing effect that is unconstrained as\nopposed to existing editing work that primarily focuses on certain editing\ntypes or relies on additional controls (e.g., structural conditions, attention\nfeatures, or DDIM inversion). On the other hand, when presented with a text\nprompt only, OmniCreator becomes generative, producing high-quality video as a\nresult of the semantic correspondence learned. Importantly, we found that the\nsame capabilities extend to images as is, making OmniCreator a truly unified\nframework. Further, due to the lack of existing generative video editing\nbenchmarks, we introduce the OmniBench-99 dataset, designed to evaluate the\nperformance of generative video editing models comprehensively. Extensive\nexperiments demonstrate that OmniCreator exhibits substantial superiority over\nall other models.\n","authors":["Haodong Chen","Lan Wang","Harry Yang","Ser-Nam Lim"],"pdf_url":"https://arxiv.org/pdf/2412.02114v1.pdf","comment":"Project: https://haroldchen19.github.io/OmniCreator-Page/"},{"id":"http://arxiv.org/abs/2412.02912v1","updated":"2024-12-03T23:37:47Z","published":"2024-12-03T23:37:47Z","title":"ShapeWords: Guiding Text-to-Image Synthesis with 3D Shape-Aware Prompts","summary":" We introduce ShapeWords, an approach for synthesizing images based on 3D\nshape guidance and text prompts. ShapeWords incorporates target 3D shape\ninformation within specialized tokens embedded together with the input text,\neffectively blending 3D shape awareness with textual context to guide the image\nsynthesis process. 
Unlike conventional shape guidance methods that rely on\ndepth maps restricted to fixed viewpoints and often overlook full 3D structure\nor textual context, ShapeWords generates diverse yet consistent images that\nreflect both the target shape's geometry and the textual description.\nExperimental results show that ShapeWords produces images that are more\ntext-compliant, aesthetically plausible, while also maintaining 3D shape\nawareness.\n","authors":["Dmitry Petrov","Pradyumn Goyal","Divyansh Shivashok","Yuanming Tao","Melinos Averkiou","Evangelos Kalogerakis"],"pdf_url":"https://arxiv.org/pdf/2412.02912v1.pdf","comment":"Project webpage: https://lodurality.github.io/shapewords/"},{"id":"http://arxiv.org/abs/2402.04376v3","updated":"2024-12-03T23:22:26Z","published":"2024-02-06T20:30:19Z","title":"Scaling laws for learning with real and surrogate data","summary":" Collecting large quantities of high-quality data can be prohibitively\nexpensive or impractical, and a bottleneck in machine learning. One may instead\naugment a small set of $n$ data points from the target distribution with data\nfrom more accessible sources, e.g. data collected under different circumstances\nor synthesized by generative models. We refer to such data as `surrogate data'.\nWe study a weighted empirical risk minimization (ERM) approach for integrating\nsurrogate data into training. We analyze mathematically this method under\nseveral classical statistical models, and validate our findings empirically on\ndatasets from different domains. Our main findings are: $(i)$ Integrating\nsurrogate data can significantly reduce the test error on the original\ndistribution. Surprisingly, this can happen even when the surrogate data is\nunrelated to the original ones. We trace back this behavior to the classical\nStein's paradox. $(ii)$ In order to reap the benefit of surrogate data, it is\ncrucial to use optimally weighted ERM. 
$(iii)$ The test error of models trained\non mixtures of real and surrogate data is approximately described by a scaling\nlaw. This scaling law can be used to predict the optimal weighting scheme, and\nto choose the amount of surrogate data to add.\n","authors":["Ayush Jain","Andrea Montanari","Eren Sasoglu"],"pdf_url":"https://arxiv.org/pdf/2402.04376v3.pdf","comment":"Added new experiment and minor changes"},{"id":"http://arxiv.org/abs/2412.02906v1","updated":"2024-12-03T23:19:40Z","published":"2024-12-03T23:19:40Z","title":"Does Few-Shot Learning Help LLM Performance in Code Synthesis?","summary":" Large language models (LLMs) have made significant strides at code generation\nthrough improved model design, training, and chain-of-thought. However,\nprompt-level optimizations remain an important yet under-explored aspect of\nLLMs for coding. This work focuses on the few-shot examples present in most\ncode generation prompts, offering a systematic study on whether few-shot\nexamples improve LLM's coding capabilities, which few-shot examples have the\nlargest impact, and how to select impactful examples. Our work offers 2\napproaches for selecting few-shot examples, a model-free method,\nCODEEXEMPLAR-FREE, and a model-based method, CODEEXEMPLAR-BASED. The 2 methods\noffer a trade-off between improved performance and reliance on training data\nand interpretability. Both methods significantly improve CodeLlama's coding\nability across the popular HumanEval+ coding benchmark. 
In summary, our work\nprovides valuable insights into how to pick few-shot examples in code\ngeneration prompts to improve LLM code generation capabilities.\n","authors":["Derek Xu","Tong Xie","Botao Xia","Haoyu Li","Yunsheng Bai","Yizhou Sun","Wei Wang"],"pdf_url":"https://arxiv.org/pdf/2412.02906v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02904v1","updated":"2024-12-03T23:14:47Z","published":"2024-12-03T23:14:47Z","title":"Enhancing Trust in Large Language Models with Uncertainty-Aware\n Fine-Tuning","summary":" Large language models (LLMs) have revolutionized the field of natural\nlanguage processing with their impressive reasoning and question-answering\ncapabilities. However, these models are sometimes prone to generating\ncredible-sounding but incorrect information, a phenomenon known as LLM\nhallucinations. Reliable uncertainty estimation in LLMs is essential for\nfostering trust in their generated responses and serves as a critical tool for\nthe detection and prevention of erroneous or hallucinated outputs. To achieve\nreliable and well-calibrated uncertainty quantification in open-ended and\nfree-form natural language generation, we propose an uncertainty-aware\nfine-tuning approach for LLMs. This approach enhances the model's ability to\nprovide reliable uncertainty estimates without compromising accuracy, thereby\nguiding them to produce more trustworthy responses. We introduce a novel\nuncertainty-aware causal language modeling loss function, grounded in the\nprinciples of decision theory. Through rigorous evaluation on multiple\nfree-form question-answering datasets and models, we demonstrate that our\nuncertainty-aware fine-tuning approach yields better calibrated uncertainty\nestimates in natural language generation tasks than fine-tuning with the\nstandard causal language modeling loss. 
Furthermore, the experimental results\nshow that the proposed method significantly improves the model's ability to\ndetect hallucinations and identify out-of-domain prompts.\n","authors":["Ranganath Krishnan","Piyush Khanna","Omesh Tickoo"],"pdf_url":"https://arxiv.org/pdf/2412.02904v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02897v1","updated":"2024-12-03T23:01:21Z","published":"2024-12-03T23:01:21Z","title":"MLD-EA: Check and Complete Narrative Coherence by Introducing Emotions\n and Actions","summary":" Narrative understanding and story generation are critical challenges in\nnatural language processing (NLP), with much of the existing research focused\non summarization and question-answering tasks. While previous studies have\nexplored predicting plot endings and generating extended narratives, they often\nneglect the logical coherence within stories, leaving a significant gap in the\nfield. To address this, we introduce the Missing Logic Detector by Emotion and\nAction (MLD-EA) model, which leverages large language models (LLMs) to identify\nnarrative gaps and generate coherent sentences that integrate seamlessly with\nthe story's emotional and logical flow. The experimental results demonstrate\nthat the MLD-EA model enhances narrative understanding and story generation,\nhighlighting LLMs' potential as effective logic checkers in story writing with\nlogical coherence and emotional consistency. 
This work fills a gap in NLP\nresearch and advances border goals of creating more sophisticated and reliable\nstory-generation systems.\n","authors":["Jinming Zhang","Yunfei Long"],"pdf_url":"https://arxiv.org/pdf/2412.02897v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02893v1","updated":"2024-12-03T22:58:21Z","published":"2024-12-03T22:58:21Z","title":"Removing Spurious Correlation from Neural Network Interpretations","summary":" The existing algorithms for identification of neurons responsible for\nundesired and harmful behaviors do not consider the effects of confounders such\nas topic of the conversation. In this work, we show that confounders can create\nspurious correlations and propose a new causal mediation approach that controls\nthe impact of the topic. In experiments with two large language models, we\nstudy the localization hypothesis and show that adjusting for the effect of\nconversation topic, toxicity becomes less localized.\n","authors":["Milad Fotouhi","Mohammad Taha Bahadori","Oluwaseyi Feyisetan","Payman Arabshahi","David Heckerman"],"pdf_url":"https://arxiv.org/pdf/2412.02893v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02889v1","updated":"2024-12-03T22:47:47Z","published":"2024-12-03T22:47:47Z","title":"Deep-Learning Based Docking Methods: Fair Comparisons to Conventional\n Docking Workflows","summary":" The diffusion learning method, DiffDock, for docking small-molecule ligands\ninto protein binding sites was recently introduced. Results included\ncomparisons to more conventional docking approaches, with DiffDock showing\nsuperior performance. Here, we employ a fully automatic workflow using the\nSurflex-Dock methods to generate a fair baseline for conventional docking\napproaches. Results were generated for the common and expected situation where\na binding site location is known and also for the condition of an unknown\nbinding site. 
For the known binding site condition, Surflex-Dock success rates\nat 2.0 Angstroms RMSD far exceeded those for DiffDock (Top-1/Top-5 success\nrates, respectively, were 68/81% compared with 45/51%). Glide performed with\nsimilar success rates (67/73%) to Surflex-Dock for the known binding site\ncondition, and results for AutoDock Vina and Gnina followed this pattern. For\nthe unknown binding site condition, using an automated method to identify\nmultiple binding pockets, Surflex-Dock success rates again exceeded those of\nDiffDock, but by a somewhat lesser margin. DiffDock made use of roughly 17,000\nco-crystal structures for learning (98% of PDBBind version 2020, pre-2019\nstructures) for a training set in order to predict on 363 test cases (2% of\nPDBBind 2020) from 2019 forward. DiffDock's performance was inextricably linked\nwith the presence of near-neighbor cases of close to identical protein-ligand\ncomplexes in the training set for over half of the test set cases. DiffDock\nexhibited a 40 percentage point difference on near-neighbor cases (two-thirds\nof all test cases) compared with cases with no near-neighbor training case.\nDiffDock has apparently encoded a type of table-lookup during its learning\nprocess, rendering meaningful applications beyond its reach. Further, it does\nnot perform even close to competitively with a competently run modern docking\nworkflow.\n","authors":["Ajay N. Jain","Ann E. Cleves","W. Patrick Walters"],"pdf_url":"https://arxiv.org/pdf/2412.02889v1.pdf","comment":"19 pages including references and appendices, 7 figures"},{"id":"http://arxiv.org/abs/2410.23391v2","updated":"2024-12-03T22:43:27Z","published":"2024-10-30T18:50:16Z","title":"Understanding Representation of Deep Equilibrium Models from Neural\n Collapse Perspective","summary":" Deep Equilibrium Model (DEQ), which serves as a typical implicit neural\nnetwork, emphasizes their memory efficiency and competitive performance\ncompared to explicit neural networks. 
However, there has been relatively\nlimited theoretical analysis on the representation of DEQ. In this paper, we\nutilize the Neural Collapse ($\\mathcal{NC}$) as a tool to systematically\nanalyze the representation of DEQ under both balanced and imbalanced\nconditions. $\\mathcal{NC}$ is an interesting phenomenon in the neural network\ntraining process that characterizes the geometry of class features and\nclassifier weights. While extensively studied in traditional explicit neural\nnetworks, the $\\mathcal{NC}$ phenomenon has not received substantial attention\nin the context of implicit neural networks. We theoretically show that\n$\\mathcal{NC}$ exists in DEQ under balanced conditions. Moreover, in imbalanced\nsettings, despite the presence of minority collapse, DEQ demonstrated\nadvantages over explicit neural networks. These advantages include the\nconvergence of extracted features to the vertices of a simplex equiangular\ntight frame and self-duality properties under mild conditions, highlighting\nDEQ's superiority in handling imbalanced datasets. Finally, we validate our\ntheoretical analyses through experiments in both balanced and imbalanced\nscenarios.\n","authors":["Haixiang Sun","Ye Shi"],"pdf_url":"https://arxiv.org/pdf/2410.23391v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.14485v2","updated":"2024-12-03T22:27:12Z","published":"2024-11-20T02:49:18Z","title":"Mediating Modes of Thought: LLM's for design scripting","summary":" Architects adopt visual scripting and parametric design tools to explore more\nexpansive design spaces (Coates, 2010), refine their thinking about the\ngeometric logic of their design (Woodbury, 2010), and overcome conventional\nsoftware limitations (Burry, 2011). Despite two decades of effort to make\ndesign scripting more accessible, a disconnect between a designer's free ways\nof thinking and the rigidity of algorithms remains (Burry, 2011). 
Recent\ndevelopments in Large Language Models (LLMs) suggest this might soon change, as\nLLMs encode a general understanding of human context and exhibit the capacity\nto produce geometric logic. This project speculates that if LLMs can\neffectively mediate between user intent and algorithms, they become a powerful\ntool to make scripting in design more widespread and fun. We explore if such\nsystems can interpret natural language prompts to assemble geometric operations\nrelevant to computational design scripting. In the system, multiple layers of\nLLM agents are configured with specific context to infer the user intent and\nconstruct a sequential logic. Given a user's high-level text prompt, a\ngeometric description is created, distilled into a sequence of logic\noperations, and mapped to software-specific commands. The completed script is\nconstructed in the user's visual programming interface. The system succeeds in\ngenerating complete visual scripts up to a certain complexity but fails beyond\nthis complexity threshold. It shows how LLMs can make design scripting much\nmore aligned with human creativity and thought. Future research should explore\nconversational interactions, expand to multimodal inputs and outputs, and\nassess the performance of these tools.\n","authors":["Moritz Rietschel","Fang Guo","Kyle Steinfeld"],"pdf_url":"https://arxiv.org/pdf/2411.14485v2.pdf","comment":"Published at ACADIA 2024"},{"id":"http://arxiv.org/abs/2412.02878v1","updated":"2024-12-03T22:25:42Z","published":"2024-12-03T22:25:42Z","title":"Modeling and Discovering Direct Causes for Predictive Models","summary":" We introduce a causal modeling framework that captures the input-output\nbehavior of predictive models (e.g., machine learning models) by representing\nit using causal graphs. The framework enables us to define and identify\nfeatures that directly cause the predictions, which has broad implications for\ndata collection and model evaluation. 
We show two assumptions under which the\ndirect causes can be discovered from data, one of which further simplifies the\ndiscovery process. In addition to providing sound and complete algorithms, we\npropose an optimization technique based on an independence rule that can be\nintegrated with the algorithms to speed up the discovery process both\ntheoretically and empirically.\n","authors":["Yizuo Chen","Amit Bhatia"],"pdf_url":"https://arxiv.org/pdf/2412.02878v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11598v2","updated":"2024-12-03T22:23:53Z","published":"2024-09-17T23:10:04Z","title":"Towards Fair RAG: On the Impact of Fair Ranking in Retrieval-Augmented\n Generation","summary":" Many language models now enhance their responses with retrieval capabilities,\nleading to the widespread adoption of retrieval-augmented generation (RAG)\nsystems. However, despite retrieval being a core component of RAG, much of the\nresearch in this area overlooks the extensive body of work on fair ranking,\nneglecting the importance of considering all stakeholders involved. This paper\npresents the first systematic evaluation of RAG systems integrated with fair\nrankings. We focus specifically on measuring the fair exposure of each relevant\nitem across the rankings utilized by RAG systems (i.e., item-side fairness),\naiming to promote equitable growth for relevant item providers. To gain a deep\nunderstanding of the relationship between item-fairness, ranking quality, and\ngeneration quality in the context of RAG, we analyze nine different RAG systems\nthat incorporate fair rankings across seven distinct datasets. Our findings\nindicate that RAG systems with fair rankings can maintain a high level of\ngeneration quality and, in many cases, even outperform traditional RAG systems,\ndespite the general trend of a tradeoff between ensuring fairness and\nmaintaining system-effectiveness. 
We believe our insights lay the groundwork\nfor responsible and equitable RAG systems and open new avenues for future\nresearch. We publicly release our codebase and dataset at\nhttps://github.com/kimdanny/Fair-RAG.\n","authors":["To Eun Kim","Fernando Diaz"],"pdf_url":"https://arxiv.org/pdf/2409.11598v2.pdf","comment":"Top 5 Spotlight at AFME Workshop at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2412.02875v1","updated":"2024-12-03T22:20:52Z","published":"2024-12-03T22:20:52Z","title":"Out-of-Distribution Detection for Neurosymbolic Autonomous Cyber Agents","summary":" Autonomous agents for cyber applications take advantage of modern defense\ntechniques by adopting intelligent agents with conventional and\nlearning-enabled components. These intelligent agents are trained via\nreinforcement learning (RL) algorithms, and can learn, adapt to, reason about\nand deploy security rules to defend networked computer systems while\nmaintaining critical operational workflows. However, the knowledge available\nduring training about the state of the operational network and its environment\nmay be limited. The agents should be trustworthy so that they can reliably\ndetect situations they cannot handle, and hand them over to cyber experts. In\nthis work, we develop an out-of-distribution (OOD) Monitoring algorithm that\nuses a Probabilistic Neural Network (PNN) to detect anomalous or OOD situations\nof RL-based agents with discrete states and discrete actions. To demonstrate\nthe effectiveness of the proposed approach, we integrate the OOD monitoring\nalgorithm with a neurosymbolic autonomous cyber agent that uses behavior trees\nwith learning-enabled components. 
We evaluate the proposed approach in a\nsimulated cyber environment under different adversarial strategies.\nExperimental results over a large number of episodes illustrate the overall\nefficiency of our proposed approach.\n","authors":["Ankita Samaddar","Nicholas Potteiger","Xenofon Koutsoukos"],"pdf_url":"https://arxiv.org/pdf/2412.02875v1.pdf","comment":"9 pages, 10 figures, IEEE International Conference on AI in\n Cybersecurity (ICAIC), 2025"},{"id":"http://arxiv.org/abs/2412.02869v1","updated":"2024-12-03T22:08:45Z","published":"2024-12-03T22:08:45Z","title":"Constrained Identifiability of Causal Effects","summary":" We study the identification of causal effects in the presence of different\ntypes of constraints (e.g., logical constraints) in addition to the causal\ngraph. These constraints impose restrictions on the models (parameterizations)\ninduced by the causal graph, reducing the set of models considered by the\nidentifiability problem. We formalize the notion of constrained\nidentifiability, which takes a set of constraints as another input to the\nclassical definition of identifiability. We then introduce a framework for\ntesting constrained identifiability by employing tractable Arithmetic Circuits\n(ACs), which enables us to accommodate constraints systematically. We show that\nthis AC-based approach is at least as complete as existing algorithms (e.g.,\ndo-calculus) for testing classical identifiability, which only assumes the\nconstraint of strict positivity. 
We use examples to demonstrate the\neffectiveness of this AC-based approach by showing that unidentifiable causal\neffects may become identifiable under different types of constraints.\n","authors":["Yizuo Chen","Adnan Darwiche"],"pdf_url":"https://arxiv.org/pdf/2412.02869v1.pdf","comment":null}],"Genomics":[{"id":"http://arxiv.org/abs/2412.01352v2","updated":"2024-12-03T08:50:51Z","published":"2024-12-02T10:29:52Z","title":"The influence of chromosomal inversions on genetic variation and clinal\n patterns in genomic data of Drosophila melanogaster","summary":" Chromosomal inversions are structural mutations resulting in the reversal of\nthe gene order along the corresponding genomic region. Due to their influence\non recombination patterns, they can have a major influence on genetic variation\nand the evolutionary process. Accordingly, inversions can act as supergenes\nthat keep together co-adapted gene complexes that form the genetic basis of\nmany complex phenotypes in diverse organisms. In this book chapter, I will\npresent an analysis pipeline to investigate the influence of two common\ncosmopolitan inversion, In(2L)t and In(3R)Payne, on genome-wide genetic\nvariation and differentiation in world-wide populations of the vinegar fly\nDrosophila melanogaster. 
We will use single-individual and pooled resequencing\ndata in combination with population genomics analysis tools to explore the\nimpact of these two inversions on genetic variation, population structure, and\nclinal variation in natural populations.\n","authors":["Martin Kapun"],"pdf_url":"https://arxiv.org/pdf/2412.01352v2.pdf","comment":"book chapter; 47 pages, 9 Figures"},{"id":"http://arxiv.org/abs/2412.02915v1","updated":"2024-12-03T23:58:35Z","published":"2024-12-03T23:58:35Z","title":"Single-Cell Omics Arena: A Benchmark Study for Large Language Models on\n Cell Type Annotation Using Single-Cell Data","summary":" Over the past decade, the revolution in single-cell sequencing has enabled\nthe simultaneous molecular profiling of various modalities across thousands of\nindividual cells, allowing scientists to investigate the diverse functions of\ncomplex tissues and uncover underlying disease mechanisms. Among all the\nanalytical steps, assigning individual cells to specific types is fundamental\nfor understanding cellular heterogeneity. However, this process is usually\nlabor-intensive and requires extensive expert knowledge. Recent advances in\nlarge language models (LLMs) have demonstrated their ability to efficiently\nprocess and synthesize vast corpora of text to automatically extract essential\nbiological knowledge, such as marker genes, potentially promoting more\nefficient and automated cell type annotations. To thoroughly evaluate the\ncapability of modern instruction-tuned LLMs in automating the cell type\nidentification process, we introduce SOAR, a comprehensive benchmarking study\nof LLMs for cell type annotation tasks in single-cell genomics. Specifically,\nwe assess the performance of 8 instruction-tuned LLMs across 11 datasets,\nspanning multiple cell types and species. 
Our study explores the potential of\nLLMs to accurately classify and annotate cell types in single-cell RNA\nsequencing (scRNA-seq) data, while extending their application to multiomics\ndata through cross-modality translation. Additionally, we evaluate the\neffectiveness of chain-of-thought (CoT) prompting techniques in generating\ndetailed biological insights during the annotation process. The results\ndemonstrate that LLMs can provide robust interpretations of single-cell data\nwithout requiring additional fine-tuning, advancing the automation of cell type\nannotation in genomics research.\n","authors":["Junhao Liu","Siwei Xu","Lei Zhang","Jing Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.02915v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02882v1","updated":"2024-12-03T22:34:38Z","published":"2024-12-03T22:34:38Z","title":"iSEEtree: interactive explorer for hierarchical data","summary":" $\\textbf{Motivation:}$ Hierarchical data structures are prevalent across\nseveral fields of research, as they represent an organised and efficient\napproach to study complex interconnected systems. Their significance is\nparticularly evident in microbiome analysis, where microbial communities are\nclassified at various taxonomic levels along the phylogenetic tree. In light of\nthis trend, the R/Bioconductor community has established a reproducible\nanalytical framework for hierarchical data, which relies on the highly generic\nand optimised TreeSummarizedExperiment data container. 
However, using this\nframework requires basic proficiency in programming.\n $\\textbf{Results:}$ To reduce the entry requirements, we developed iSEEtree,\nan R shiny app which provides a visual interface for the analysis and\nexploration of TreeSummarizedExperiment objects, thereby expanding the\ninteractive graphics capabilities of related work to hierarchical structures.\nThis way, users can interactively explore several aspects of their data without\nthe need for extensive knowledge of R programming. We describe how iSEEtree\nenables the exploration of hierarchical multi-table data and demonstrate its\nfunctionality with applications to microbiome analysis.\n $\\textbf{Availability and Implementation:}$ iSEEtree was implemented in the R\nprogramming language and is available on Bioconductor at\n$\\href{https://bioconductor.org/packages/iSEEtree}{https\\text{:}//bioconductor\\text{.}org/packages/iSEEtree}$\nunder an Artistic 2.0 license.\n $\\textbf{Contact:}$ $\\href{email}{giulio\\text{.}benedetti@utu\\text{.}fi}$ or\n$\\href{email}{leo\\text{.}lahti@utu\\text{.}fi}$.\n","authors":["Giulio Benedetti","Ely Seraidarian","Theotime Pralas","Akewak Jeba","Tuomas Borman","Leo Lahti"],"pdf_url":"https://arxiv.org/pdf/2412.02882v1.pdf","comment":"4 pages, 1 figure"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2412.02698v1","updated":"2024-12-03T18:59:51Z","published":"2024-12-03T18:59:51Z","title":"Scaling BERT Models for Turkish Automatic Punctuation and Capitalization\n Correction","summary":" This paper investigates the effectiveness of BERT based models for automated\npunctuation and capitalization corrections in Turkish texts across five\ndistinct model sizes. The models are designated as Tiny, Mini, Small, Medium,\nand Base. The design and capabilities of each model are tailored to address the\nspecific challenges of the Turkish language, with a focus on optimizing\nperformance while minimizing computational overhead. 
The study presents a\nsystematic comparison of the performance metrics precision, recall, and F1\nscore of each model, offering insights into their applicability in diverse\noperational contexts. The results demonstrate a significant improvement in text\nreadability and accuracy as model size increases, with the Base model achieving\nthe highest correction precision. This research provides a comprehensive guide\nfor selecting the appropriate model size based on specific user needs and\ncomputational resources, establishing a framework for deploying these models in\nreal-world applications to enhance the quality of written Turkish.\n","authors":["Abdulkader Saoud","Mahmut Alomeyr","Himmet Toprak Kesgin","Mehmet Fatih Amasyali"],"pdf_url":"https://arxiv.org/pdf/2412.02698v1.pdf","comment":"2024 Innovations in Intelligent Systems and Applications Conference\n (ASYU)"},{"id":"http://arxiv.org/abs/2412.02695v1","updated":"2024-12-03T18:59:35Z","published":"2024-12-03T18:59:35Z","title":"An ADHD Diagnostic Interface Based on EEG Spectrograms and Deep Learning\n Techniques","summary":" This paper introduces an innovative approach to\nAttention-deficit/hyperactivity disorder (ADHD) diagnosis by employing deep\nlearning (DL) techniques on electroencephalography (EEG) signals. This method\naddresses the limitations of current behavior-based diagnostic methods, which\noften lead to misdiagnosis and gender bias. By utilizing a publicly available\nEEG dataset and converting the signals into spectrograms, a Resnet-18\nconvolutional neural network (CNN) architecture was used to extract features\nfor ADHD classification. The model achieved a high precision, recall, and an\noverall F1 score of 0.9. 
Feature extraction highlighted significant brain\nregions (frontopolar, parietal, and occipital lobes) associated with ADHD.\nThese insights guided the creation of a three-part digital diagnostic system,\nfacilitating cost-effective and accessible ADHD screening, especially in school\nenvironments. This system enables earlier and more accurate identification of\nstudents at risk for ADHD, providing timely support to enhance their\ndevelopmental outcomes. This study showcases the potential of integrating EEG\nanalysis with DL to enhance ADHD diagnostics, presenting a viable alternative\nto traditional methods.\n","authors":["Medha Pappula","Syed Muhammad Anwar"],"pdf_url":"https://arxiv.org/pdf/2412.02695v1.pdf","comment":"Presented at SIPAIM 2024"},{"id":"http://arxiv.org/abs/2412.02685v1","updated":"2024-12-03T18:56:07Z","published":"2024-12-03T18:56:07Z","title":"T-REG: Preference Optimization with Token-Level Reward Regularization","summary":" Reinforcement learning from human feedback (RLHF) has been crucial in\naligning large language models (LLMs) with human values. Traditionally, RLHF\ninvolves generating responses to a query and using a reward model to assign a\nreward to the entire response. However, this approach faces challenges due to\nits reliance on a single, sparse reward, which makes it challenging for the\nmodel to identify which parts of the sequence contribute most significantly to\nthe final reward. Recent methods have attempted to address this limitation by\nintroducing token-level rewards. However, these methods often rely on either a\ntrained credit assignment model or AI annotators, raising concerns about the\nquality and reliability of the rewards. In this paper, we propose token-level\nreward regularization (T-REG), a novel approach that leverages both\nsequence-level and token-level rewards for preference optimization. 
Harnessing\nthe self-refinement capabilities of LLMs, our method uses contrastive prompting\nto enable LLMs to self-generate token-level rewards. These self-generated\nrewards then act as reward regularization, guiding the model to more\neffectively distribute sequence-level rewards across tokens. This facilitates\nbetter token-level credit assignment and enhances alignment performance.\nExperiments on the instruction following benchmarks, including Alpaca Eval 2\nand Arena-Hard, show that our method consistently outperforms baseline methods\nby up to 3.8% and 4.4%, respectively. We will release the code and models at\nhttps://github.com/wzhouad/T-REG.\n","authors":["Wenxuan Zhou","Shujian Zhang","Lingxiao Zhao","Tao Meng"],"pdf_url":"https://arxiv.org/pdf/2412.02685v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02682v1","updated":"2024-12-03T18:54:49Z","published":"2024-12-03T18:54:49Z","title":"The Asymptotic Behavior of Attention in Transformers","summary":" A key component of transformers is the attention mechanism orchestrating how\neach token influences the propagation of every other token through a\ntransformer. In this paper we provide a rigorous, mathematical analysis of the\nasymptotic properties of attention in transformers. Although we present several\nresults based on different assumptions, all of them point to the same\nconclusion, all tokens asymptotically converge to each other, a phenomenon that\nhas been empirically reported in the literature. 
Our findings are carefully\ncompared with existing theoretical results and illustrated by simulations and\nexperimental studies using the GPT-2 model.\n","authors":["Álvaro Rodríguez Abella","João Pedro Silvestre","Paulo Tabuada"],"pdf_url":"https://arxiv.org/pdf/2412.02682v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02676v1","updated":"2024-12-03T18:51:39Z","published":"2024-12-03T18:51:39Z","title":"Planning-Guided Diffusion Policy Learning for Generalizable Contact-Rich\n Bimanual Manipulation","summary":" Contact-rich bimanual manipulation involves precise coordination of two arms\nto change object states through strategically selected contacts and motions.\nDue to the inherent complexity of these tasks, acquiring sufficient\ndemonstration data and training policies that generalize to unseen scenarios\nremain a largely unresolved challenge. Building on recent advances in planning\nthrough contacts, we introduce Generalizable Planning-Guided Diffusion Policy\nLearning (GLIDE), an approach that effectively learns to solve contact-rich\nbimanual manipulation tasks by leveraging model-based motion planners to\ngenerate demonstration data in high-fidelity physics simulation. Through\nefficient planning in randomized environments, our approach generates\nlarge-scale and high-quality synthetic motion trajectories for tasks involving\ndiverse objects and transformations. We then train a task-conditioned diffusion\npolicy via behavior cloning using these demonstrations. To tackle the\nsim-to-real gap, we propose a set of essential design options in feature\nextraction, task representation, action prediction, and data augmentation that\nenable learning robust prediction of smooth action sequences and generalization\nto unseen scenarios. Through experiments in both simulation and the real world,\nwe demonstrate that our approach can enable a bimanual robotic system to\neffectively manipulate objects of diverse geometries, dimensions, and physical\nproperties. 
Website: https://glide-manip.github.io/\n","authors":["Xuanlin Li","Tong Zhao","Xinghao Zhu","Jiuguang Wang","Tao Pang","Kuan Fang"],"pdf_url":"https://arxiv.org/pdf/2412.02676v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.14052v2","updated":"2024-12-03T18:48:00Z","published":"2024-10-17T21:47:11Z","title":"From Isolated Conversations to Hierarchical Schemas: Dynamic Tree Memory\n Representation for LLMs","summary":" Recent advancements in large language models have significantly improved\ntheir context windows, yet challenges in effective long-term memory management\nremain. We introduce MemTree, an algorithm that leverages a dynamic,\ntree-structured memory representation to optimize the organization, retrieval,\nand integration of information, akin to human cognitive schemas. MemTree\norganizes memory hierarchically, with each node encapsulating aggregated\ntextual content, corresponding semantic embeddings, and varying abstraction\nlevels across the tree's depths. Our algorithm dynamically adapts this memory\nstructure by computing and comparing semantic embeddings of new and existing\ninformation to enrich the model's context-awareness. This approach allows\nMemTree to handle complex reasoning and extended interactions more effectively\nthan traditional memory augmentation methods, which often rely on flat lookup\ntables. 
Evaluations on benchmarks for multi-turn dialogue understanding and\ndocument question answering show that MemTree significantly enhances\nperformance in scenarios that demand structured memory management.\n","authors":["Alireza Rezazadeh","Zichao Li","Wei Wei","Yujia Bao"],"pdf_url":"https://arxiv.org/pdf/2410.14052v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02674v1","updated":"2024-12-03T18:47:26Z","published":"2024-12-03T18:47:26Z","title":"Mind the Gap: Examining the Self-Improvement Capabilities of Large\n Language Models","summary":" Self-improvement is a mechanism in Large Language Model (LLM) pre-training,\npost-training and test-time inference. We explore a framework where the model\nverifies its own outputs, filters or reweights data based on this verification,\nand distills the filtered data. Despite several empirical successes, a\nfundamental understanding is still lacking. In this work, we initiate a\ncomprehensive, modular and controlled study on LLM self-improvement. We provide\na mathematical formulation for self-improvement, which is largely governed by a\nquantity which we formalize as the generation-verification gap. Through\nexperiments with various model families and tasks, we discover a scaling\nphenomenon of self-improvement -- a variant of the generation-verification gap\nscales monotonically with the model pre-training flops. We also examine when\nself-improvement is possible, an iterative self-improvement procedure, and ways\nto improve its performance. 
Our findings not only advance understanding of LLM\nself-improvement with practical implications, but also open numerous avenues\nfor future research into its capabilities and boundaries.\n","authors":["Yuda Song","Hanlin Zhang","Carson Eisenach","Sham Kakade","Dean Foster","Udaya Ghai"],"pdf_url":"https://arxiv.org/pdf/2412.02674v1.pdf","comment":"41 pages, 19 figures"},{"id":"http://arxiv.org/abs/2411.17861v2","updated":"2024-12-03T18:38:45Z","published":"2024-11-26T20:22:31Z","title":"Accelerating Proximal Policy Optimization Learning Using Task Prediction\n for Solving Environments with Delayed Rewards","summary":" In this paper, we tackle the challenging problem of delayed rewards in\nreinforcement learning (RL). While Proximal Policy Optimization (PPO) has\nemerged as a leading Policy Gradient method, its performance can degrade under\ndelayed rewards. We introduce two key enhancements to PPO: a hybrid policy\narchitecture that combines an offline policy (trained on expert demonstrations)\nwith an online PPO policy, and a reward shaping mechanism using Time Window\nTemporal Logic (TWTL). The hybrid architecture leverages offline data\nthroughout training while maintaining PPO's theoretical guarantees. Building on\nthe monotonic improvement framework of Trust Region Policy Optimization (TRPO),\nwe prove that our approach ensures improvement over both the offline policy and\nprevious iterations, with a bounded performance gap of\n$(2\\varsigma\\gamma\\alpha^2)/(1-\\gamma)^2$, where $\\alpha$ is the mixing\nparameter, $\\gamma$ is the discount factor, and $\\varsigma$ bounds the expected\nadvantage. Additionally, we prove that our TWTL-based reward shaping preserves\nthe optimal policy of the original problem. TWTL enables formal translation of\ntemporal objectives into immediate feedback signals that guide learning. 
We\ndemonstrate the effectiveness of our approach through extensive experiments on\nan inverted pendulum and a lunar lander environments, showing improvements in\nboth learning speed and final performance compared to standard PPO and\noffline-only approaches.\n","authors":["Ahmad Ahmad","Mehdi Kermanshah","Kevin Leahy","Zachary Serlin","Ho Chit Siu","Makai Mann","Cristian-Ioan Vasile","Roberto Tron","Calin Belta"],"pdf_url":"https://arxiv.org/pdf/2411.17861v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07636v2","updated":"2024-12-03T18:35:27Z","published":"2023-12-12T10:25:31Z","title":"Go beyond End-to-End Training: Boosting Greedy Local Learning with\n Context Supply","summary":" Traditional end-to-end (E2E) training of deep networks necessitates storing\nintermediate activations for back-propagation, resulting in a large memory\nfootprint on GPUs and restricted model parallelization. As an alternative,\ngreedy local learning partitions the network into gradient-isolated modules and\ntrains supervisely based on local preliminary losses, thereby providing\nasynchronous and parallel training methods that substantially reduce memory\ncost. However, empirical experiments reveal that as the number of segmentations\nof the gradient-isolated module increases, the performance of the local\nlearning scheme degrades substantially, severely limiting its expansibility. To\navoid this issue, we theoretically analyze the greedy local learning from the\nstandpoint of information theory and propose a ContSup scheme, which\nincorporates context supply between isolated modules to compensate for\ninformation loss. Experiments on benchmark datasets (i.e. CIFAR, SVHN, STL-10)\nachieve SOTA results and indicate that our proposed method can significantly\nimprove the performance of greedy local learning with minimal memory and\ncomputational overhead, allowing for the boost of the number of isolated\nmodules. 
Our codes are available at https://github.com/Tab-ct/ContSup.\n","authors":["Chengting Yu","Fengzhao Zhang","Hanzhi Ma","Aili Wang","Erping Li"],"pdf_url":"https://arxiv.org/pdf/2312.07636v2.pdf","comment":"9 figures, 12 tables"},{"id":"http://arxiv.org/abs/2406.01378v2","updated":"2024-12-03T18:32:15Z","published":"2024-06-03T14:42:31Z","title":"A Fast Convergence Theory for Offline Decision Making","summary":" This paper proposes the first generic fast convergence result in general\nfunction approximation for offline decision making problems, which include\noffline reinforcement learning (RL) and off-policy evaluation (OPE) as special\ncases. To unify different settings, we introduce a framework called Decision\nMaking with Offline Feedback (DMOF), which captures a wide range of offline\ndecision making problems. Within this framework, we propose a simple yet\npowerful algorithm called Empirical Decision with Divergence (EDD), whose upper\nbound can be termed as a coefficient named Empirical Offline Estimation\nCoefficient (EOEC). We show that EOEC is instance-dependent and actually\nmeasures the correlation of the problem. When assuming partial coverage in the\ndataset, EOEC will reduce in a rate of $1/N$ where $N$ is the size of the\ndataset, endowing EDD with a fast convergence guarantee. 
Finally, we complement\nthe above results with a lower bound in the DMOF framework, which further\ndemonstrates the soundness of our theory.\n","authors":["Chenjie Mao","Qiaosheng Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.01378v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.01547v2","updated":"2024-12-03T18:28:43Z","published":"2024-11-03T12:42:16Z","title":"Decoupling Dark Knowledge via Block-wise Logit Distillation for\n Feature-level Alignment","summary":" Knowledge Distillation (KD), a learning manner with a larger teacher network\nguiding a smaller student network, transfers dark knowledge from the teacher to\nthe student via logits or intermediate features, with the aim of producing a\nwell-performed lightweight model. Notably, many subsequent feature-based KD\nmethods outperformed the earliest logit-based KD method and iteratively\ngenerated numerous state-of-the-art distillation methods. Nevertheless, recent\nwork has uncovered the potential of the logit-based method, bringing the simple\nKD form based on logits back into the limelight. Features or logits? They\npartially implement the KD with entirely distinct perspectives; therefore,\nchoosing between logits and features is not straightforward. This paper\nprovides a unified perspective of feature alignment in order to obtain a better\ncomprehension of their fundamental distinction. Inheriting the design\nphilosophy and insights of feature-based and logit-based methods, we introduce\na block-wise logit distillation framework to apply implicit logit-based feature\nalignment by gradually replacing teacher's blocks as intermediate\nstepping-stone models to bridge the gap between the student and the teacher.\nOur method obtains comparable or superior results to state-of-the-art\ndistillation methods. 
This paper demonstrates the great potential of combining\nlogit and features, and we hope it will inspire future research to revisit KD\nfrom a higher vantage point.\n","authors":["Chengting Yu","Fengzhao Zhang","Ruizhe Chen","Aili Wang","Zuozhu Liu","Shurun Tan","Er-Ping Li"],"pdf_url":"https://arxiv.org/pdf/2411.01547v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02646v1","updated":"2024-12-03T18:21:20Z","published":"2024-12-03T18:21:20Z","title":"Interpretable Generalized Additive Models for Datasets with Missing\n Values","summary":" Many important datasets contain samples that are missing one or more feature\nvalues. Maintaining the interpretability of machine learning models in the\npresence of such missing data is challenging. Singly or multiply imputing\nmissing values complicates the model's mapping from features to labels. On the\nother hand, reasoning on indicator variables that represent missingness\nintroduces a potentially large number of additional terms, sacrificing\nsparsity. We solve these problems with M-GAM, a sparse, generalized, additive\nmodeling approach that incorporates missingness indicators and their\ninteraction terms while maintaining sparsity through l0 regularization. 
We show\nthat M-GAM provides similar or superior accuracy to prior methods while\nsignificantly improving sparsity relative to either imputation or naive\ninclusion of indicator variables.\n","authors":["Hayden McTavish","Jon Donnelly","Margo Seltzer","Cynthia Rudin"],"pdf_url":"https://arxiv.org/pdf/2412.02646v1.pdf","comment":"Published in NeurIPS 2024"},{"id":"http://arxiv.org/abs/2412.02639v1","updated":"2024-12-03T18:11:37Z","published":"2024-12-03T18:11:37Z","title":"The Space Complexity of Approximating Logistic Loss","summary":" We provide space complexity lower bounds for data structures that approximate\nlogistic loss up to $\\epsilon$-relative error on a logistic regression problem\nwith data $\\mathbf{X} \\in \\mathbb{R}^{n \\times d}$ and labels $\\mathbf{y} \\in\n\\{-1,1\\}^d$. The space complexity of existing coreset constructions depend on a\nnatural complexity measure $\\mu_\\mathbf{y}(\\mathbf{X})$, first defined in\n(Munteanu, 2018). We give an $\\tilde{\\Omega}(\\frac{d}{\\epsilon^2})$ space\ncomplexity lower bound in the regime $\\mu_\\mathbf{y}(\\mathbf{X}) = O(1)$ that\nshows existing coresets are optimal in this regime up to lower order factors.\nWe also prove a general $\\tilde{\\Omega}(d\\cdot \\mu_\\mathbf{y}(\\mathbf{X}))$\nspace lower bound when $\\epsilon$ is constant, showing that the dependency on\n$\\mu_\\mathbf{y}(\\mathbf{X})$ is not an artifact of mergeable coresets. 
Finally,\nwe refute a prior conjecture that $\\mu_\\mathbf{y}(\\mathbf{X})$ is hard to\ncompute by providing an efficient linear programming formulation, and we\nempirically compare our algorithm to prior approximate methods.\n","authors":["Gregory Dexter","Petros Drineas","Rajiv Khanna"],"pdf_url":"https://arxiv.org/pdf/2412.02639v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2303.14284"},{"id":"http://arxiv.org/abs/2412.02631v1","updated":"2024-12-03T17:58:07Z","published":"2024-12-03T17:58:07Z","title":"Sharp-It: A Multi-view to Multi-view Diffusion Model for 3D Synthesis\n and Manipulation","summary":" Advancements in text-to-image diffusion models have led to significant\nprogress in fast 3D content creation. One common approach is to generate a set\nof multi-view images of an object, and then reconstruct it into a 3D model.\nHowever, this approach bypasses the use of a native 3D representation of the\nobject and is hence prone to geometric artifacts and limited in controllability\nand manipulation capabilities. An alternative approach involves native 3D\ngenerative models that directly produce 3D representations. These models,\nhowever, are typically limited in their resolution, resulting in lower quality\n3D objects. In this work, we bridge the quality gap between methods that\ndirectly generate 3D representations and ones that reconstruct 3D objects from\nmulti-view images. We introduce a multi-view to multi-view diffusion model\ncalled Sharp-It, which takes a 3D consistent set of multi-view images rendered\nfrom a low-quality object and enriches its geometric details and texture. The\ndiffusion model operates on the multi-view set in parallel, in the sense that\nit shares features across the generated views. A high-quality 3D model can then\nbe reconstructed from the enriched multi-view set. By leveraging the advantages\nof both 2D and 3D approaches, our method offers an efficient and controllable\nmethod for high-quality 3D content creation. 
We demonstrate that Sharp-It\nenables various 3D applications, such as fast synthesis, editing, and\ncontrolled generation, while attaining high-quality assets.\n","authors":["Yiftach Edelstein","Or Patashnik","Dana Cohen-Bar","Lihi Zelnik-Manor"],"pdf_url":"https://arxiv.org/pdf/2412.02631v1.pdf","comment":"Project page at https://yiftachede.github.io/Sharp-It/"},{"id":"http://arxiv.org/abs/2412.02623v1","updated":"2024-12-03T17:52:38Z","published":"2024-12-03T17:52:38Z","title":"The effect of priors on Learning with Restricted Boltzmann Machines","summary":" Restricted Boltzmann Machines (RBMs) are generative models designed to learn\nfrom data with a rich underlying structure. In this work, we explore a\nteacher-student setting where a student RBM learns from examples generated by a\nteacher RBM, with a focus on the effect of the unit priors on learning\nefficiency. We consider a parametric class of priors that interpolate between\ncontinuous (Gaussian) and binary variables. This approach models various\npossible choices of visible units, hidden units, and weights for both the\nteacher and student RBMs.\n By analyzing the phase diagram of the posterior distribution in both the\nBayes optimal and mismatched regimes, we demonstrate the existence of a triple\npoint that defines the critical dataset size necessary for learning through\ngeneralization. The critical size is strongly influenced by the properties of\nthe teacher, and thus the data, but is unaffected by the properties of the\nstudent RBM. 
Nevertheless, a prudent choice of student priors can facilitate\ntraining by expanding the so-called signal retrieval region, where the machine\ngeneralizes effectively.\n","authors":["Gianluca Manzan","Daniele Tantari"],"pdf_url":"https://arxiv.org/pdf/2412.02623v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02621v1","updated":"2024-12-03T17:50:19Z","published":"2024-12-03T17:50:19Z","title":"Medical Multimodal Foundation Models in Clinical Diagnosis and\n Treatment: Applications, Challenges, and Future Directions","summary":" Recent advancements in deep learning have significantly revolutionized the\nfield of clinical diagnosis and treatment, offering novel approaches to improve\ndiagnostic precision and treatment efficacy across diverse clinical domains,\nthus driving the pursuit of precision medicine. The growing availability of\nmulti-organ and multimodal datasets has accelerated the development of\nlarge-scale Medical Multimodal Foundation Models (MMFMs). These models, known\nfor their strong generalization capabilities and rich representational power,\nare increasingly being adapted to address a wide range of clinical tasks, from\nearly diagnosis to personalized treatment strategies. This review offers a\ncomprehensive analysis of recent developments in MMFMs, focusing on three key\naspects: datasets, model architectures, and clinical applications. 
We also\nexplore the challenges and opportunities in optimizing multimodal\nrepresentations and discuss how these advancements are shaping the future of\nhealthcare by enabling improved patient outcomes and more efficient clinical\nworkflows.\n","authors":["Kai Sun","Siyan Xue","Fuchun Sun","Haoran Sun","Yu Luo","Ling Wang","Siyuan Wang","Na Guo","Lei Liu","Tian Zhao","Xinzhou Wang","Lei Yang","Shuo Jin","Jun Yan","Jiahong Dong"],"pdf_url":"https://arxiv.org/pdf/2412.02621v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02313v4","updated":"2024-12-03T17:49:39Z","published":"2024-06-04T13:42:42Z","title":"Neural Thermodynamic Integration: Free Energies from Energy-based\n Diffusion Models","summary":" Thermodynamic integration (TI) offers a rigorous method for estimating\nfree-energy differences by integrating over a sequence of interpolating\nconformational ensembles. However, TI calculations are computationally\nexpensive and typically limited to coupling a small number of degrees of\nfreedom due to the need to sample numerous intermediate ensembles with\nsufficient conformational-space overlap. In this work, we propose to perform TI\nalong an alchemical pathway represented by a trainable neural network, which we\nterm Neural TI. Critically, we parametrize a time-dependent Hamiltonian\ninterpolating between the interacting and non-interacting systems, and optimize\nits gradient using a score matching objective. The ability of the resulting\nenergy-based diffusion model to sample all intermediate ensembles allows us to\nperform TI from a single reference calculation. 
We apply our method to\nLennard-Jones fluids, where we report accurate calculations of the excess\nchemical potential, demonstrating that Neural TI reproduces the underlying\nchanges in free energy without the need for simulations at interpolating\nHamiltonians.\n","authors":["Bálint Máté","François Fleuret","Tristan Bereau"],"pdf_url":"https://arxiv.org/pdf/2406.02313v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02617v1","updated":"2024-12-03T17:44:23Z","published":"2024-12-03T17:44:23Z","title":"Improving Dynamic Object Interactions in Text-to-Video Generation with\n AI Feedback","summary":" Large text-to-video models hold immense potential for a wide range of\ndownstream applications. However, these models struggle to accurately depict\ndynamic object interactions, often resulting in unrealistic movements and\nfrequent violations of real-world physics. One solution inspired by large\nlanguage models is to align generated outputs with desired outcomes using\nexternal feedback. This enables the model to refine its responses autonomously,\neliminating extensive manual data collection. In this work, we investigate the\nuse of feedback to enhance the object dynamics in text-to-video models. We aim\nto answer a critical question: what types of feedback, paired with which\nspecific self-improvement algorithms, can most effectively improve text-video\nalignment and realistic object interactions? We begin by deriving a unified\nprobabilistic objective for offline RL finetuning of text-to-video models. This\nperspective highlights how design elements in existing algorithms like KL\nregularization and policy projection emerge as specific choices within a\nunified framework. We then use derived methods to optimize a set of text-video\nalignment metrics (e.g., CLIP scores, optical flow), but notice that they often\nfail to align with human perceptions of generation quality. 
To address this\nlimitation, we propose leveraging vision-language models to provide more\nnuanced feedback specifically tailored to object dynamics in videos. Our\nexperiments demonstrate that our method can effectively optimize a wide variety\nof rewards, with binary AI feedback driving the most significant improvements\nin video quality for dynamic interactions, as confirmed by both AI and human\nevaluations. Notably, we observe substantial gains when using reward signals\nderived from AI feedback, particularly in scenarios involving complex\ninteractions between multiple objects and realistic depictions of objects\nfalling.\n","authors":["Hiroki Furuta","Heiga Zen","Dale Schuurmans","Aleksandra Faust","Yutaka Matsuo","Percy Liang","Sherry Yang"],"pdf_url":"https://arxiv.org/pdf/2412.02617v1.pdf","comment":"Website: https://sites.google.com/view/aif-dynamic-t2v/"},{"id":"http://arxiv.org/abs/2412.02609v1","updated":"2024-12-03T17:40:26Z","published":"2024-12-03T17:40:26Z","title":"Wasserstein Markets for Differentially-Private Data","summary":" Data is an increasingly vital component of decision making processes across\nindustries. However, data access raises privacy concerns motivating the need\nfor privacy-preserving techniques such as differential privacy. Data markets\nprovide a means to enable wider access as well as determine the appropriate\nprivacy-utility trade-off. Existing data market frameworks either require a\ntrusted third party to perform computationally expensive valuations or are\nunable to capture the combinatorial nature of data value and do not\nendogenously model the effect of differential privacy. This paper addresses\nthese shortcomings by proposing a valuation mechanism based on the Wasserstein\ndistance for differentially-private data, and corresponding procurement\nmechanisms by leveraging incentive mechanism design theory, for task-agnostic\ndata procurement, and task-specific procurement co-optimisation. 
The mechanisms\nare reformulated into tractable mixed-integer second-order cone programs, which\nare validated with numerical studies.\n","authors":["Saurab Chhachhi","Fei Teng"],"pdf_url":"https://arxiv.org/pdf/2412.02609v1.pdf","comment":"35 pages, 15 figures"},{"id":"http://arxiv.org/abs/2412.02605v1","updated":"2024-12-03T17:34:50Z","published":"2024-12-03T17:34:50Z","title":"Interpretable Company Similarity with Sparse Autoencoders","summary":" Determining company similarity is a vital task in finance, underpinning\nhedging, risk management, portfolio diversification, and more. Practitioners\noften rely on sector and industry classifications to gauge similarity, such as\nSIC-codes and GICS-codes, the former being used by the U.S. Securities and\nExchange Commission (SEC), and the latter widely used by the investment\ncommunity. Clustering embeddings of company descriptions has been proposed as a\npotential technique for determining company similarity, but the lack of\ninterpretability in token embeddings poses a significant barrier to adoption in\nhigh-stakes contexts. Sparse Autoencoders have shown promise in enhancing the\ninterpretability of Large Language Models by decomposing LLM activations into\ninterpretable features. In this paper, we explore the use of SAE features in\nmeasuring company similarity and benchmark them against (1) SIC codes and (2)\nMajor Group codes. 
We conclude that SAE features can reproduce and even surpass\nsector classifications in quantifying fundamental characteristics of companies,\nevaluated by the correlation of monthly returns, a proxy for similarity, and\nPnL from cointegration.\n","authors":["Marco Molinari","Vladimir Tregubiak","Victor Shao","Abhimanyu Pandey","Mateusz Mikolajczak","Sebastião Kuznetsov Ryder Torres Pereira"],"pdf_url":"https://arxiv.org/pdf/2412.02605v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02602v1","updated":"2024-12-03T17:32:47Z","published":"2024-12-03T17:32:47Z","title":"CEGI: Measuring the trade-off between efficiency and carbon emissions\n for SLMs and VLMs","summary":" This paper analyzes the performance of Small Language Models (SLMs) and\nVision Language Models (VLMs) and evaluates the trade-off between model\nperformance and carbon emissions across 4 essential tasks: Image Captioning,\nVisual Question Answering (VQA), Dialogue Summarization and Text-to-SQL\nconversion. Various SLMs and VLMs belonging to the Qwen and LLaMA architecture\nfamily are chosen and variants based on model size in terms of the number of\nparameters, quantization level and fine-tuning parameters are evaluated. The\nmodel variant's performance and carbon emissions are calculated. To quantify\nthe trade-off between model performance and carbon emissions, we introduce a\nnovel metric called CEGI (Carbon Efficient Gain Index). This metric represents\nthe carbon emission per unit percentage gain per million trainable parameters .\nThis metric provides a normalized measure to compare model's efficiency in\nterms of performance improvement relative to their environmental cost. The\nexperiment's outcome demonstrates that fine-tuning SLMs and VLMs can achieve\nperformance levels comparable to Large Language Models (LLMs) while producing\nsignificantly less carbon emissions. 
Our findings suggest that the marginal\ngains in accuracy from larger models do not justify the substantial increase in\ncarbon emissions. Leveraging lower-bit quantization levels, the proposed metric\nfurther enhances energy efficiency without compromising performance. This study\nhighlights balancing high performance and environmental sustainability. It\noffers a valuable metric for selecting models suitable for\nenvironmentally-friendly AI development.\n","authors":["Abhas Kumar","Kapil Pathak","Rajesh Kavuru","Prabhakar Srinivasan"],"pdf_url":"https://arxiv.org/pdf/2412.02602v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02596v1","updated":"2024-12-03T17:29:00Z","published":"2024-12-03T17:29:00Z","title":"Class-wise Autoencoders Measure Classification Difficulty And Detect\n Label Mistakes","summary":" We introduce a new framework for analyzing classification datasets based on\nthe ratios of reconstruction errors between autoencoders trained on individual\nclasses. This analysis framework enables efficient characterization of datasets\non the sample, class, and entire dataset levels. We define reconstruction error\nratios (RERs) that probe classification difficulty and allow its decomposition\ninto (1) finite sample size and (2) Bayes error and decision-boundary\ncomplexity. Through systematic study across 19 popular visual datasets, we find\nthat our RER-based dataset difficulty probe strongly correlates with error rate\nfor state-of-the-art (SOTA) classification models. By interpreting sample-level\nclassification difficulty as a label mistakenness score, we further find that\nRERs achieve SOTA performance on mislabel detection tasks on hard datasets\nunder symmetric and asymmetric label noise. Our code is publicly available at\nhttps://github.com/voxel51/reconstruction-error-ratios.\n","authors":["Jacob Marks","Brent A. Griffin","Jason J. 
Corso"],"pdf_url":"https://arxiv.org/pdf/2412.02596v1.pdf","comment":"30 pages, 18 figures"},{"id":"http://arxiv.org/abs/2409.06219v4","updated":"2024-12-03T17:23:07Z","published":"2024-09-10T05:05:34Z","title":"Denoising: A Powerful Building-Block for Imaging, Inverse Problems, and\n Machine Learning","summary":" Denoising, the process of reducing random fluctuations in a signal to\nemphasize essential patterns, has been a fundamental problem of interest since\nthe dawn of modern scientific inquiry. Recent denoising techniques,\nparticularly in imaging, have achieved remarkable success, nearing theoretical\nlimits by some measures. Yet, despite tens of thousands of research papers, the\nwide-ranging applications of denoising beyond noise removal have not been fully\nrecognized. This is partly due to the vast and diverse literature, making a\nclear overview challenging.\n This paper aims to address this gap. We present a clarifying perspective on\ndenoisers, their structure, and desired properties. We emphasize the increasing\nimportance of denoising and showcase its evolution into an essential building\nblock for complex tasks in imaging, inverse problems, and machine learning.\nDespite its long history, the community continues to uncover unexpected and\ngroundbreaking uses for denoising, further solidifying its place as a\ncornerstone of scientific and engineering practice.\n","authors":["Peyman Milanfar","Mauricio Delbracio"],"pdf_url":"https://arxiv.org/pdf/2409.06219v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13846v4","updated":"2024-12-03T17:22:01Z","published":"2024-04-22T03:05:19Z","title":"Filtered Direct Preference Optimization","summary":" Reinforcement learning from human feedback (RLHF) plays a crucial role in\naligning language models with human preferences. While the significance of\ndataset quality is generally recognized, explicit investigations into its\nimpact within the RLHF framework, to our knowledge, have been limited. 
This\npaper addresses the issue of text quality within the preference dataset by\nfocusing on direct preference optimization (DPO), an increasingly adopted\nreward-model-free RLHF method. We confirm that text quality significantly\ninfluences the performance of models optimized with DPO more than those\noptimized with reward-model-based RLHF. Building on this new insight, we\npropose an extension of DPO, termed filtered direct preference optimization\n(fDPO). fDPO uses a trained reward model to monitor the quality of texts within\nthe preference dataset during DPO training. Samples of lower quality are\ndiscarded based on comparisons with texts generated by the model being\noptimized, resulting in a more accurate dataset. Experimental results\ndemonstrate that fDPO enhances the final model performance. Our code is\navailable at https://github.com/CyberAgentAILab/filtered-dpo.\n","authors":["Tetsuro Morimura","Mitsuki Sakamoto","Yuu Jinnai","Kenshi Abe","Kaito Ariu"],"pdf_url":"https://arxiv.org/pdf/2404.13846v4.pdf","comment":"EMNLP 2024"},{"id":"http://arxiv.org/abs/2412.00177v2","updated":"2024-12-03T17:21:41Z","published":"2024-11-29T18:59:11Z","title":"LumiNet: Latent Intrinsics Meets Diffusion Models for Indoor Scene\n Relighting","summary":" We introduce LumiNet, a novel architecture that leverages generative models\nand latent intrinsic representations for effective lighting transfer. Given a\nsource image and a target lighting image, LumiNet synthesizes a relit version\nof the source scene that captures the target's lighting. Our approach makes two\nkey contributions: a data curation strategy from the StyleGAN-based relighting\nmodel for our training, and a modified diffusion-based ControlNet that\nprocesses both latent intrinsic properties from the source image and latent\nextrinsic properties from the target image. 
We further improve lighting\ntransfer through a learned adaptor (MLP) that injects the target's latent\nextrinsic properties via cross-attention and fine-tuning.\n Unlike traditional ControlNet, which generates images with conditional maps\nfrom a single scene, LumiNet processes latent representations from two\ndifferent images - preserving geometry and albedo from the source while\ntransferring lighting characteristics from the target. Experiments demonstrate\nthat our method successfully transfers complex lighting phenomena including\nspecular highlights and indirect illumination across scenes with varying\nspatial layouts and materials, outperforming existing approaches on challenging\nindoor scenes using only images as input.\n","authors":["Xiaoyan Xing","Konrad Groh","Sezer Karaoglu","Theo Gevers","Anand Bhattad"],"pdf_url":"https://arxiv.org/pdf/2412.00177v2.pdf","comment":"Project page: https://luminet-relight.github.io"},{"id":"http://arxiv.org/abs/2412.02578v1","updated":"2024-12-03T17:04:14Z","published":"2024-12-03T17:04:14Z","title":"Private Linear Regression with Differential Privacy and PAC Privacy","summary":" Linear regression is a fundamental tool for statistical analysis, which has\nmotivated the development of linear regression methods that satisfy provable\nprivacy guarantees so that the learned model reveals little about any one data\npoint used to construct it. Most existing privacy-preserving linear regression\nmethods rely on the well-established framework of differential privacy, while\nthe newly proposed PAC Privacy has not yet been explored in this context. 
In\nthis paper, we systematically compare linear regression models trained with\ndifferential privacy and PAC privacy across three real-world datasets,\nobserving several key findings that impact the performance of\nprivacy-preserving linear regression.\n","authors":["Hillary Yang"],"pdf_url":"https://arxiv.org/pdf/2412.02578v1.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2409.05305v2","updated":"2024-12-03T17:03:57Z","published":"2024-09-09T03:26:07Z","title":"Closed-Form Interpretation of Neural Network Latent Spaces with Symbolic\n Gradients","summary":" It has been demonstrated in many scientific fields that artificial neural\nnetworks like autoencoders or Siamese networks encode meaningful concepts in\ntheir latent spaces. However, there does not exist a comprehensive framework\nfor retrieving this information in a human-readable form without prior\nknowledge. In order to extract these concepts, we introduce a framework for\nfinding closed-form interpretations of neurons in latent spaces of artificial\nneural networks. The interpretation framework is based on embedding trained\nneural networks into an equivalence class of functions that encode the same\nconcept. We interpret these neural networks by finding an intersection between\nthe equivalence class and human-readable equations defined by a symbolic search\nspace. The approach is demonstrated by retrieving invariants of matrices and\nconserved quantities of dynamical systems from latent spaces of Siamese neural\nnetworks.\n","authors":["Zakaria Patel","Sebastian J. 
Wetzel"],"pdf_url":"https://arxiv.org/pdf/2409.05305v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02570v1","updated":"2024-12-03T16:55:27Z","published":"2024-12-03T16:55:27Z","title":"TAB-Fields: A Maximum Entropy Framework for Mission-Aware Adversarial\n Planning","summary":" Autonomous agents operating in adversarial scenarios face a fundamental\nchallenge: while they may know their adversaries' high-level objectives, such\nas reaching specific destinations within time constraints, the exact policies\nthese adversaries will employ remain unknown. Traditional approaches address\nthis challenge by treating the adversary's state as a partially observable\nelement, leading to a formulation as a Partially Observable Markov Decision\nProcess (POMDP). However, the induced belief-space dynamics in a POMDP require\nknowledge of the system's transition dynamics, which, in this case, depend on\nthe adversary's unknown policy. Our key observation is that while an\nadversary's exact policy is unknown, their behavior is necessarily constrained\nby their mission objectives and the physical environment, allowing us to\ncharacterize the space of possible behaviors without assuming specific\npolicies. In this paper, we develop Task-Aware Behavior Fields (TAB-Fields), a\nrepresentation that captures adversary state distributions over time by\ncomputing the most unbiased probability distribution consistent with known\nconstraints. We construct TAB-Fields by solving a constrained optimization\nproblem that minimizes additional assumptions about adversary behavior beyond\nmission and environmental requirements. We integrate TAB-Fields with standard\nplanning algorithms by introducing TAB-conditioned POMCP, an adaptation of\nPartially Observable Monte Carlo Planning. 
Through experiments in simulation\nwith underwater robots and hardware implementations with ground robots, we\ndemonstrate that our approach achieves superior performance compared to\nbaselines that either assume specific adversary policies or neglect mission\nconstraints altogether. Evaluation videos and code are available at\nhttps://tab-fields.github.io.\n","authors":["Gokul Puthumanaillam","Jae Hyuk Song","Nurzhan Yesmagambet","Shinkyu Park","Melkior Ornik"],"pdf_url":"https://arxiv.org/pdf/2412.02570v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02548v1","updated":"2024-12-03T16:41:18Z","published":"2024-12-03T16:41:18Z","title":"Plug-and-Play Half-Quadratic Splitting for Ptychography","summary":" Ptychography is a coherent diffraction imaging method that uses phase\nretrieval techniques to reconstruct complex-valued images. It achieves this by\nsequentially illuminating overlapping regions of a sample with a coherent beam\nand recording the diffraction pattern. Although this addresses traditional\nimaging system challenges, it is computationally intensive and highly sensitive\nto noise, especially with reduced illumination overlap. Data-driven\nregularisation techniques have been applied in phase retrieval to improve\nreconstruction quality. In particular, plug-and-play (PnP) offers flexibility\nby integrating data-driven denoisers as implicit priors. In this work, we\npropose a half-quadratic splitting framework for using PnP and other\ndata-driven priors for ptychography. 
We evaluate our method both on natural\nimages and real test objects to validate its effectiveness for ptychographic\nimage reconstruction.\n","authors":["Alexander Denker","Johannes Hertrich","Zeljko Kereta","Silvia Cipiccia","Ecem Erin","Simon Arridge"],"pdf_url":"https://arxiv.org/pdf/2412.02548v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02546v1","updated":"2024-12-03T16:39:01Z","published":"2024-12-03T16:39:01Z","title":"Fractional Order Distributed Optimization","summary":" Distributed optimization is fundamental to modern machine learning\napplications like federated learning, but existing methods often struggle with\nill-conditioned problems and face stability-versus-speed tradeoffs. We\nintroduce fractional order distributed optimization (FrODO); a\ntheoretically-grounded framework that incorporates fractional-order memory\nterms to enhance convergence properties in challenging optimization landscapes.\nOur approach achieves provable linear convergence for any strongly connected\nnetwork. Through empirical validation, our results suggest that FrODO achieves\nup to 4 times faster convergence versus baselines on ill-conditioned problems\nand 2-3 times speedup in federated neural network training, while maintaining\nstability and theoretical guarantees.\n","authors":["Andrei Lixandru","Marcel van Gerven","Sergio Pequito"],"pdf_url":"https://arxiv.org/pdf/2412.02546v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02542v1","updated":"2024-12-03T16:34:49Z","published":"2024-12-03T16:34:49Z","title":"Unveiling Concept Attribution in Diffusion Models","summary":" Diffusion models have shown remarkable abilities in generating realistic and\nhigh-quality images from text prompts. However, a trained model remains\nblack-box; little do we know about the role of its components in exhibiting a\nconcept such as objects or styles. 
Recent works employ causal tracing to\nlocalize layers storing knowledge in generative models without showing how\nthose layers contribute to the target concept. In this work, we approach the\nmodel interpretability problem from a more general perspective and pose a\nquestion: \\textit{``How do model components work jointly to demonstrate\nknowledge?''}. We adapt component attribution to decompose diffusion models,\nunveiling how a component contributes to a concept. Our framework allows\neffective model editing, in particular, we can erase a concept from diffusion\nmodels by removing positive components while remaining knowledge of other\nconcepts. Surprisingly, we also show there exist components that contribute\nnegatively to a concept, which has not been discovered in the knowledge\nlocalization approach. Experimental results confirm the role of positive and\nnegative components pinpointed by our framework, depicting a complete view of\ninterpreting generative models. Our code is available at\n\\url{https://github.com/mail-research/CAD-attribution4diffusion}\n","authors":["Quang H. Nguyen","Hoang Phan","Khoa D. Doan"],"pdf_url":"https://arxiv.org/pdf/2412.02542v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02538v1","updated":"2024-12-03T16:32:19Z","published":"2024-12-03T16:32:19Z","title":"On the Privacy, Security, and Trustworthy for Distributed Wireless Large\n AI Model (WLAM)","summary":" Combining wireless communication with large artificial intelligence (AI)\nmodels can open up a myriad of novel application scenarios. In sixth generation\n(6G) networks, ubiquitous communication and computing resources allow large AI\nmodels to serve democratic large AI models-related services to enable real-time\napplications like autonomous vehicles, smart cities, and Internet of Things\n(IoT) ecosystems. However, the security considerations and sustainable\ncommunication resources limit the deployment of large AI models over\ndistributed wireless networks. 
This paper provides a comprehensive overview of\nprivacy, security, and trustworthy for distributed wireless large AI model\n(WLAM). In particular, the detailed privacy and security are analysis for\ndistributed WLAM is fist revealed. The classifications and theoretical findings\nabout privacy and security in distributed WLAM are discussed. Then the\ntrustworthy and ethics for implementing distributed WLAM are described.\nFinally, the comprehensive applications of distributed WLAM is provided in the\naspect of electromagnetic signal processing.\n","authors":["Zhaohui Yang","Wei Xu","Le Liang","Yuanhao Cui","Zhijin Qin","Merouane Debbah"],"pdf_url":"https://arxiv.org/pdf/2412.02538v1.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2412.02535v1","updated":"2024-12-03T16:26:56Z","published":"2024-12-03T16:26:56Z","title":"Defending Against Diverse Attacks in Federated Learning Through\n Consensus-Based Bi-Level Optimization","summary":" Adversarial attacks pose significant challenges in many machine learning\napplications, particularly in the setting of distributed training and federated\nlearning, where malicious agents seek to corrupt the training process with the\ngoal of jeopardizing and compromising the performance and reliability of the\nfinal models. In this paper, we address the problem of robust federated\nlearning in the presence of such attacks by formulating the training task as a\nbi-level optimization problem. We conduct a theoretical analysis of the\nresilience of consensus-based bi-level optimization (CB$^2$O), an interacting\nmulti-particle metaheuristic optimization method, in adversarial settings.\nSpecifically, we provide a global convergence analysis of CB$^2$O in mean-field\nlaw in the presence of malicious agents, demonstrating the robustness of\nCB$^2$O against a diverse range of attacks. Thereby, we offer insights into how\nspecific hyperparameter choices enable to mitigate adversarial effects. 
On the\npractical side, we extend CB$^2$O to the clustered federated learning setting\nby proposing FedCB$^2$O, a novel interacting multi-particle system, and design\na practical algorithm that addresses the demands of real-world applications.\nExtensive experiments demonstrate the robustness of the FedCB$^2$O algorithm\nagainst label-flipping attacks in decentralized clustered federated learning\nscenarios, showcasing its effectiveness in practical contexts.\n","authors":["Nicolás García Trillos","Aditya Kumar Akash","Sixu Li","Konstantin Riedl","Yuhua Zhu"],"pdf_url":"https://arxiv.org/pdf/2412.02535v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.10578v5","updated":"2024-12-03T16:26:09Z","published":"2024-10-14T14:52:23Z","title":"Burning RED: Unlocking Subtask-Driven Reinforcement Learning and\n Risk-Awareness in Average-Reward Markov Decision Processes","summary":" Average-reward Markov decision processes (MDPs) provide a foundational\nframework for sequential decision-making under uncertainty. However,\naverage-reward MDPs have remained largely unexplored in reinforcement learning\n(RL) settings, with the majority of RL-based efforts having been allocated to\nepisodic and discounted MDPs. In this work, we study a unique structural\nproperty of average-reward MDPs and utilize it to introduce Reward-Extended\nDifferential (or RED) reinforcement learning: a novel RL framework that can be\nused to effectively and efficiently solve various subtasks simultaneously in\nthe average-reward setting. We introduce a family of RED learning algorithms\nfor prediction and control, including proven-convergent algorithms for the\ntabular case. 
We then showcase the power of these algorithms by demonstrating\nhow they can be used to learn a policy that optimizes, for the first time, the\nwell-known conditional value-at-risk (CVaR) risk measure in a fully-online\nmanner, without the use of an explicit bi-level optimization scheme or an\naugmented state-space.\n","authors":["Juan Sebastian Rojas","Chi-Guhn Lee"],"pdf_url":"https://arxiv.org/pdf/2410.10578v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02529v1","updated":"2024-12-03T16:21:53Z","published":"2024-12-03T16:21:53Z","title":"Active learning of neural population dynamics using two-photon\n holographic optogenetics","summary":" Recent advances in techniques for monitoring and perturbing neural\npopulations have greatly enhanced our ability to study circuits in the brain.\nIn particular, two-photon holographic optogenetics now enables precise\nphotostimulation of experimenter-specified groups of individual neurons, while\nsimultaneous two-photon calcium imaging enables the measurement of ongoing and\ninduced activity across the neural population. Despite the enormous space of\npotential photostimulation patterns and the time-consuming nature of\nphotostimulation experiments, very little algorithmic work has been done to\ndetermine the most effective photostimulation patterns for identifying the\nneural population dynamics. Here, we develop methods to efficiently select\nwhich neurons to stimulate such that the resulting neural responses will best\ninform a dynamical model of the neural population activity. Using neural\npopulation responses to photostimulation in mouse motor cortex, we demonstrate\nthe efficacy of a low-rank linear dynamical systems model, and develop an\nactive learning procedure which takes advantage of low-rank structure to\ndetermine informative photostimulation patterns. 
We demonstrate our approach on\nboth real and synthetic data, obtaining in some cases as much as a two-fold\nreduction in the amount of data required to reach a given predictive power. Our\nactive stimulation design method is based on a novel active learning procedure\nfor low-rank regression, which may be of independent interest.\n","authors":["Andrew Wagenmaker","Lu Mi","Marton Rozsa","Matthew S. Bull","Karel Svoboda","Kayvon Daie","Matthew D. Golub","Kevin Jamieson"],"pdf_url":"https://arxiv.org/pdf/2412.02529v1.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2412.02525v1","updated":"2024-12-03T16:18:42Z","published":"2024-12-03T16:18:42Z","title":"LLMForecaster: Improving Seasonal Event Forecasts with Unstructured\n Textual Data","summary":" Modern time-series forecasting models often fail to make full use of rich\nunstructured information about the time series themselves. This lack of proper\nconditioning can lead to obvious model failures; for example, models may be\nunaware of the details of a particular product, and hence fail to anticipate\nseasonal surges in customer demand in the lead up to major exogenous events\nlike holidays for clearly relevant products. To address this shortcoming, this\npaper introduces a novel forecast post-processor -- which we call LLMForecaster\n-- that fine-tunes large language models (LLMs) to incorporate unstructured\nsemantic and contextual information and historical data to improve the\nforecasts from an existing demand forecasting pipeline. In an industry-scale\nretail application, we demonstrate that our technique yields statistically\nsignificantly forecast improvements across several sets of products subject to\nholiday-driven demand surges.\n","authors":["Hanyu Zhang","Chuck Arvin","Dmitry Efimov","Michael W. 
Mahoney","Dominique Perrault-Joncas","Shankar Ramasubramanian","Andrew Gordon Wilson","Malcolm Wolff"],"pdf_url":"https://arxiv.org/pdf/2412.02525v1.pdf","comment":"Presented at NeurIPS Time Series in the Age of Large Models (2024)"},{"id":"http://arxiv.org/abs/2408.07712v3","updated":"2024-12-03T16:17:32Z","published":"2024-08-13T23:08:06Z","title":"Introduction to Reinforcement Learning","summary":" Reinforcement Learning (RL), a subfield of Artificial Intelligence (AI),\nfocuses on training agents to make decisions by interacting with their\nenvironment to maximize cumulative rewards. This paper provides an overview of\nRL, covering its core concepts, methodologies, and resources for further\nlearning. It offers a thorough explanation of fundamental components such as\nstates, actions, policies, and reward signals, ensuring readers develop a solid\nfoundational understanding. Additionally, the paper presents a variety of RL\nalgorithms, categorized based on the key factors such as model-free,\nmodel-based, value-based, policy-based, and other key factors. Resources for\nlearning and implementing RL, such as books, courses, and online communities\nare also provided. By offering a clear, structured introduction, this paper\naims to simplify the complexities of RL for beginners, providing a\nstraightforward pathway to understanding.\n","authors":["Majid Ghasemi","Dariush Ebrahimi"],"pdf_url":"https://arxiv.org/pdf/2408.07712v3.pdf","comment":"19 pages"},{"id":"http://arxiv.org/abs/2412.02520v1","updated":"2024-12-03T16:13:42Z","published":"2024-12-03T16:13:42Z","title":"Cooperative Cruising: Reinforcement Learning based Time-Headway Control\n for Increased Traffic Efficiency","summary":" The proliferation of Connected Automated Vehicles represents an unprecedented\nopportunity for improving driving efficiency and alleviating traffic\ncongestion. 
However, existing research fails to address realistic multi-lane\nhighway scenarios without assuming connectivity, perception, and control\ncapabilities that are typically unavailable in current vehicles. This paper\nproposes a novel AI system that is the first to improve highway traffic\nefficiency compared with human-like traffic in realistic, simulated multi-lane\nscenarios, while relying on existing connectivity, perception, and control\ncapabilities. At the core of our approach is a reinforcement learning based\ncontroller that dynamically communicates time-headways to automated vehicles\nnear bottlenecks based on real-time traffic conditions. These desired\ntime-headways are then used by Adaptive Cruise Control (ACC) systems to adjust\ntheir following distance. By (i) integrating existing traffic estimation\ntechnology and low-bandwidth vehicle-to-infrastructure connectivity, (ii)\nleveraging safety-certified ACC systems, and (iii) targeting localized\nbottleneck challenges that can be addressed independently in different\nlocations, we propose a practical, safe, and scalable system that can\npositively impact numerous road users.\n","authors":["Yaron Veksler","Sharon Hornstein","Han Wang","Maria Laura Delle Monache","Daniel Urieli"],"pdf_url":"https://arxiv.org/pdf/2412.02520v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.00876v2","updated":"2024-12-03T16:12:09Z","published":"2024-12-01T16:32:31Z","title":"Dynamic-LLaVA: Efficient Multimodal Large Language Models via Dynamic\n Vision-language Context Sparsification","summary":" Multimodal Large Language Models (MLLMs) have achieved remarkable success in\nvision understanding, reasoning, and interaction. However, the inference\ncomputation and memory increase progressively with the generation of output\ntokens during decoding, directly affecting the efficacy of MLLMs. Existing\nmethods attempt to reduce the vision context redundancy to achieve efficient\nMLLMs. 
Unfortunately, the efficiency benefits of the vision context reduction\nin the prefill stage gradually diminish during the decoding stage. To address\nthis problem, we proposed a dynamic vision-language context sparsification\nframework Dynamic-LLaVA, which dynamically reduces the redundancy of vision\ncontext in the prefill stage and decreases the memory and computation overhead\nof the generated language context during decoding. Dynamic-LLaVA designs a\ntailored sparsification inference scheme for different inference modes, i.e.,\nprefill, decoding with and without KV cache, to achieve efficient inference of\nMLLMs. In practice, Dynamic-LLaVA can reduce computation consumption by\n$\\sim$75\\% in the prefill stage. Meanwhile, throughout the entire generation\nprocess of MLLMs, Dynamic-LLaVA reduces the $\\sim$50\\% computation consumption\nunder decoding without KV cache, while saving $\\sim$50\\% GPU memory overhead\nwhen decoding with KV cache, due to the vision-language context sparsification.\nExtensive experiments also demonstrate that Dynamic-LLaVA achieves efficient\ninference for MLLMs with negligible understanding and generation ability\ndegradation or even performance gains compared to the full-context inference\nbaselines. Code is available at https://github.com/Osilly/dynamic_llava .\n","authors":["Wenxuan Huang","Zijie Zhai","Yunhang Shen","Shaoshen Cao","Fei Zhao","Xiangfeng Xu","Zheyu Ye","Shaohui Lin"],"pdf_url":"https://arxiv.org/pdf/2412.00876v2.pdf","comment":"Code is available at https://github.com/Osilly/dynamic_llava"},{"id":"http://arxiv.org/abs/2412.01491v2","updated":"2024-12-03T16:01:54Z","published":"2024-12-02T13:42:36Z","title":"Understanding complex crowd dynamics with generative neural simulators","summary":" Understanding the dynamics of pedestrian crowds is an outstanding challenge\ncrucial for designing efficient urban infrastructure and ensuring safe crowd\nmanagement. 
To this end, both small-scale laboratory and large-scale real-world\nmeasurements have been used. However, these approaches respectively lack\nstatistical resolution and parametric controllability, both essential to\ndiscovering physical relationships underlying the complex stochastic dynamics\nof crowds. Here, we establish an investigation paradigm that offers\nlaboratory-like controllability, while ensuring the statistical resolution of\nlarge-scale real-world datasets. Using our data-driven Neural Crowd Simulator\n(NeCS), which we train on large-scale data and validate against key statistical\nfeatures of crowd dynamics, we show that we can perform effective surrogate\ncrowd dynamics experiments without training on specific scenarios. We not only\nreproduce known experimental results on pairwise avoidance, but also uncover\nthe vision-guided and topological nature of N-body interactions. These findings\nshow how virtual experiments based on neural simulation enable data-driven\nscientific discovery.\n","authors":["Koen Minartz","Fleur Hendriks","Simon Martinus Koop","Alessandro Corbetta","Vlado Menkovski"],"pdf_url":"https://arxiv.org/pdf/2412.01491v2.pdf","comment":"26 pages, 6 figures"},{"id":"http://arxiv.org/abs/2405.13220v2","updated":"2024-12-03T16:00:40Z","published":"2024-05-21T22:00:34Z","title":"Paired Autoencoders for Likelihood-free Estimation in Inverse Problems","summary":" We consider the solution of nonlinear inverse problems where the forward\nproblem is a discretization of a partial differential equation. Such problems\nare notoriously difficult to solve in practice and require minimizing a\ncombination of a data-fit term and a regularization term. The main\ncomputational bottleneck of typical algorithms is the direct estimation of the\ndata misfit. Therefore, likelihood-free approaches have become appealing\nalternatives. 
Nonetheless, difficulties in generalization and limitations in\naccuracy have hindered their broader utility and applicability. In this work,\nwe use a paired autoencoder framework as a likelihood-free estimator for\ninverse problems. We show that the use of such an architecture allows us to\nconstruct a solution efficiently and to overcome some known open problems when\nusing likelihood-free estimators. In particular, our framework can assess the\nquality of the solution and improve on it if needed. We demonstrate the\nviability of our approach using examples from full waveform inversion and\ninverse electromagnetic imaging.\n","authors":["Matthias Chung","Emma Hart","Julianne Chung","Bas Peters","Eldad Haber"],"pdf_url":"https://arxiv.org/pdf/2405.13220v2.pdf","comment":"18 pages, 6 figures"},{"id":"http://arxiv.org/abs/2403.10182v2","updated":"2024-12-03T15:35:24Z","published":"2024-03-15T10:38:48Z","title":"Fast and reliable uncertainty quantification with neural network\n ensembles for industrial image classification","summary":" Image classification with neural networks (NNs) is widely used in industrial\nprocesses, situations where the model likely encounters unknown objects during\ndeployment, i.e., out-of-distribution (OOD) data. Worryingly, NNs tend to make\nconfident yet incorrect predictions when confronted with OOD data. To increase\nthe models' reliability, they should quantify the uncertainty in their own\npredictions, communicating when the output should (not) be trusted. Deep\nensembles, composed of multiple independent NNs, have been shown to perform\nstrongly but are computationally expensive. Recent research has proposed more\nefficient NN ensembles, namely the snapshot, batch, and multi-input\nmulti-output ensemble. This study investigates the predictive and uncertainty\nperformance of efficient NN ensembles in the context of image classification\nfor industrial processes. 
It is the first to provide a comprehensive comparison\nand it proposes a novel Diversity Quality metric to quantify the ensembles'\nperformance on the in-distribution and OOD sets in one single metric. The\nresults highlight the batch ensemble as a cost-effective and competitive\nalternative to the deep ensemble. It matches the deep ensemble in both\nuncertainty and accuracy while exhibiting considerable savings in training\ntime, test time, and memory storage.\n","authors":["Arthur Thuy","Dries F. Benoit"],"pdf_url":"https://arxiv.org/pdf/2403.10182v2.pdf","comment":"Submitted to Annals of Operations Research"},{"id":"http://arxiv.org/abs/2412.02503v1","updated":"2024-12-03T15:30:52Z","published":"2024-12-03T15:30:52Z","title":"CA-MoE: Channel-Adapted MoE for Incremental Weather Forecasting","summary":" Atmospheric science is intricately connected with other fields, e.g.,\ngeography and aerospace. Most existing approaches involve training a joint\natmospheric and geographic model from scratch, which incurs significant\ncomputational costs and overlooks the potential for incremental learning of\nweather variables across different domains. In this paper, we introduce\nincremental learning to weather forecasting and propose a novel structure that\nallows for the flexible expansion of variables within the model. Specifically,\nour method presents a Channel-Adapted MoE (CA-MoE) that employs a\ndivide-and-conquer strategy. This strategy assigns variable training tasks to\ndifferent experts by index embedding and reduces computational complexity\nthrough a channel-wise Top-K strategy. Experiments conducted on the widely\nutilized ERA5 dataset reveal that our method, utilizing only approximately 15\\%\nof trainable parameters during the incremental stage, attains performance that\nis on par with state-of-the-art competitors. 
Notably, in the context of\nvariable incremental experiments, our method demonstrates negligible issues\nwith catastrophic forgetting.\n","authors":["Hao Chen","Han Tao","Guo Song","Jie Zhang","Yunlong Yu","Yonghan Dong","Chuang Yang","Lei Bai"],"pdf_url":"https://arxiv.org/pdf/2412.02503v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.05469v4","updated":"2024-12-03T15:21:53Z","published":"2023-10-09T07:26:35Z","title":"Learning to Predict Structural Vibrations","summary":" In mechanical structures like airplanes, cars and houses, noise is generated\nand transmitted through vibrations. To take measures to reduce this noise,\nvibrations need to be simulated with expensive numerical computations. Deep\nlearning surrogate models present a promising alternative to classical\nnumerical simulations as they can be evaluated magnitudes faster, while\ntrading-off accuracy. To quantify such trade-offs systematically and foster the\ndevelopment of methods, we present a benchmark on the task of predicting the\nvibration of harmonically excited plates. The benchmark features a total of\n12,000 plate geometries with varying forms of beadings, material, boundary\nconditions, load position and sizes with associated numerical solutions. To\naddress the benchmark task, we propose a new network architecture, named\nFrequency-Query Operator, which predicts vibration patterns of plate geometries\ngiven a specific excitation frequency. Applying principles from operator\nlearning and implicit models for shape encoding, our approach effectively\naddresses the prediction of highly variable frequency response functions\noccurring in dynamic systems. To quantify the prediction quality, we introduce\na set of evaluation metrics and evaluate the method on our vibrating-plates\nbenchmark. Our method outperforms DeepONets, Fourier Neural Operators and more\ntraditional neural network architectures and can be used for design\noptimization. 
Code, dataset and visualizations:\nhttps://github.com/ecker-lab/Learning_Vibrating_Plates\n","authors":["Jan van Delden","Julius Schultz","Christopher Blech","Sabine C. Langer","Timo Lüddecke"],"pdf_url":"https://arxiv.org/pdf/2310.05469v4.pdf","comment":"Accepted at Neurips 2024"},{"id":"http://arxiv.org/abs/2412.02492v1","updated":"2024-12-03T15:06:07Z","published":"2024-12-03T15:06:07Z","title":"The Cost of Consistency: Submodular Maximization with Constant Recourse","summary":" In this work, we study online submodular maximization, and how the\nrequirement of maintaining a stable solution impacts the approximation. In\nparticular, we seek bounds on the best-possible approximation ratio that is\nattainable when the algorithm is allowed to make at most a constant number of\nupdates per step. We show a tight information-theoretic bound of $\\tfrac{2}{3}$\nfor general monotone submodular functions, and an improved (also tight) bound\nof $\\tfrac{3}{4}$ for coverage functions. Since both these bounds are attained\nby non poly-time algorithms, we also give a poly-time randomized algorithm that\nachieves a $0.51$-approximation. Combined with an information-theoretic\nhardness of $\\tfrac{1}{2}$ for deterministic algorithms from prior work, our\nwork thus shows a separation between deterministic and randomized algorithms,\nboth information theoretically and for poly-time algorithms.\n","authors":["Paul Dütting","Federico Fusco","Silvio Lattanzi","Ashkan Norouzi-Fard","Ola Svensson","Morteza Zadimoghaddam"],"pdf_url":"https://arxiv.org/pdf/2412.02492v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02484v1","updated":"2024-12-03T14:47:46Z","published":"2024-12-03T14:47:46Z","title":"Vector Optimization with Gaussian Process Bandits","summary":" Learning problems in which multiple conflicting objectives must be considered\nsimultaneously often arise in various fields, including engineering, drug\ndesign, and environmental management. 
Traditional methods for dealing with\nmultiple black-box objective functions, such as scalarization and\nidentification of the Pareto set under the componentwise order, have\nlimitations in incorporating objective preferences and exploring the solution\nspace accordingly. While vector optimization offers improved flexibility and\nadaptability via specifying partial orders based on ordering cones, current\ntechniques designed for sequential experiments either suffer from high sample\ncomplexity or lack theoretical guarantees. To address these issues, we propose\nVector Optimization with Gaussian Process (VOGP), a probably approximately\ncorrect adaptive elimination algorithm that performs black-box vector\noptimization using Gaussian process bandits. VOGP allows users to convey\nobjective preferences through ordering cones while performing efficient\nsampling by exploiting the smoothness of the objective function, resulting in a\nmore effective optimization process that requires fewer evaluations. We\nestablish theoretical guarantees for VOGP and derive information gain-based and\nkernel-specific sample complexity bounds. We also conduct experiments on both\nreal-world and synthetic datasets to compare VOGP with the state-of-the-art\nmethods.\n","authors":["İlter Onat Korkmaz","Yaşar Cahit Yıldırım","Çağın Ararat","Cem Tekin"],"pdf_url":"https://arxiv.org/pdf/2412.02484v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02482v1","updated":"2024-12-03T14:45:46Z","published":"2024-12-03T14:45:46Z","title":"What should a neuron aim for? Designing local objective functions based\n on information theory","summary":" In modern deep neural networks, the learning dynamics of the individual\nneurons is often obscure, as the networks are trained via global optimization.\nConversely, biological systems build on self-organized, local learning,\nachieving robustness and efficiency with limited global information. 
We here\nshow how self-organization between individual artificial neurons can be\nachieved by designing abstract bio-inspired local learning goals. These goals\nare parameterized using a recent extension of information theory, Partial\nInformation Decomposition (PID), which decomposes the information that a set of\ninformation sources holds about an outcome into unique, redundant and\nsynergistic contributions. Our framework enables neurons to locally shape the\nintegration of information from various input classes, i.e. feedforward,\nfeedback, and lateral, by selecting which of the three inputs should contribute\nuniquely, redundantly or synergistically to the output. This selection is\nexpressed as a weighted sum of PID terms, which, for a given problem, can be\ndirectly derived from intuitive reasoning or via numerical optimization,\noffering a window into understanding task-relevant local information\nprocessing. Achieving neuron-level interpretability while enabling strong\nperformance using local learning, our work advances a principled\ninformation-theoretic foundation for local learning strategies.\n","authors":["Andreas C. Schneider","Valentin Neuhaus","David A. Ehrlich","Abdullah Makkeh","Alexander S. Ecker","Viola Priesemann","Michael Wibral"],"pdf_url":"https://arxiv.org/pdf/2412.02482v1.pdf","comment":"24 pages, 11 figures"},{"id":"http://arxiv.org/abs/2312.00710v3","updated":"2024-12-03T14:45:03Z","published":"2023-12-01T16:42:57Z","title":"SpaCE: The Spatial Confounding Environment","summary":" Spatial confounding poses a significant challenge in scientific studies\ninvolving spatial data, where unobserved spatial variables can influence both\ntreatment and outcome, possibly leading to spurious associations. 
To address\nthis problem, we introduce SpaCE: The Spatial Confounding Environment, the\nfirst toolkit to provide realistic benchmark datasets and tools for\nsystematically evaluating causal inference methods designed to alleviate\nspatial confounding. Each dataset includes training data, true counterfactuals,\na spatial graph with coordinates, and smoothness and confounding scores\ncharacterizing the effect of a missing spatial confounder. It also includes\nrealistic semi-synthetic outcomes and counterfactuals, generated using\nstate-of-the-art machine learning ensembles, following best practices for\ncausal inference benchmarks. The datasets cover real treatment and covariates\nfrom diverse domains, including climate, health and social sciences. SpaCE\nfacilitates an automated end-to-end pipeline, simplifying data loading,\nexperimental setup, and evaluating machine learning and causal inference\nmodels. The SpaCE project provides several dozens of datasets of diverse sizes\nand spatial complexity. It is publicly available as a Python package,\nencouraging community feedback and contributions.\n","authors":["Mauricio Tec","Ana Trisovic","Michelle Audirac","Sophie Woodward","Jie Kate Hu","Naeem Khoshnevis","Francesca Dominici"],"pdf_url":"https://arxiv.org/pdf/2312.00710v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02479v1","updated":"2024-12-03T14:42:31Z","published":"2024-12-03T14:42:31Z","title":"OODFace: Benchmarking Robustness of Face Recognition under Common\n Corruptions and Appearance Variations","summary":" With the rise of deep learning, facial recognition technology has seen\nextensive research and rapid development. Although facial recognition is\nconsidered a mature technology, we find that existing open-source models and\ncommercial algorithms lack robustness in certain real-world Out-of-Distribution\n(OOD) scenarios, raising concerns about the reliability of these systems. 
In\nthis paper, we introduce OODFace, which explores the OOD challenges faced by\nfacial recognition models from two perspectives: common corruptions and\nappearance variations. We systematically design 30 OOD scenarios across 9 major\ncategories tailored for facial recognition. By simulating these challenges on\npublic datasets, we establish three robustness benchmarks: LFW-C/V, CFP-FP-C/V,\nand YTF-C/V. We then conduct extensive experiments on 19 different facial\nrecognition models and 3 commercial APIs, along with extended experiments on\nface masks, Vision-Language Models (VLMs), and defense strategies to assess\ntheir robustness. Based on the results, we draw several key insights,\nhighlighting the vulnerability of facial recognition systems to OOD data and\nsuggesting possible solutions. Additionally, we offer a unified toolkit that\nincludes all corruption and variation types, easily extendable to other\ndatasets. We hope that our benchmarks and findings can provide guidance for\nfuture improvements in facial recognition model robustness.\n","authors":["Caixin Kang","Yubo Chen","Shouwei Ruan","Shiji Zhao","Ruochen Zhang","Jiayi Wang","Shan Fu","Xingxing Wei"],"pdf_url":"https://arxiv.org/pdf/2412.02479v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.03523v4","updated":"2024-12-03T14:31:41Z","published":"2024-10-04T15:44:23Z","title":"A Probabilistic Perspective on Unlearning and Alignment for Large\n Language Models","summary":" Comprehensive evaluation of Large Language Models (LLMs) is an open research\nproblem. Existing evaluations rely on deterministic point estimates generated\nvia greedy decoding. However, we find that deterministic evaluations fail to\ncapture the whole output distribution of a model, yielding inaccurate\nestimations of model capabilities. This is particularly problematic in critical\ncontexts such as unlearning and alignment, where precise model evaluations are\ncrucial. 
To remedy this, we introduce the first formal probabilistic evaluation\nframework in LLMs. Namely, we derive novel metrics with high-probability\nguarantees concerning the output distribution of a model. Our metrics are\napplication-independent and allow practitioners to make more reliable estimates\nabout model capabilities before deployment. Through a case study focused on\nunlearning, we reveal that deterministic evaluations falsely indicate\nsuccessful unlearning, whereas our probabilistic evaluations demonstrate that\nmost if not all of the supposedly unlearned information remains accessible in\nthese models. Additionally, we propose a novel unlearning loss based on entropy\noptimization and adaptive temperature scaling, which significantly improves\nunlearning in probabilistic settings on recent benchmarks. Our proposed shift\nfrom point estimates to probabilistic evaluations of output distributions\nrepresents an important step toward comprehensive evaluations of LLMs. Code\navailable at https://github.com/yascho/probabilistic-unlearning.\n","authors":["Yan Scholten","Stephan Günnemann","Leo Schwinn"],"pdf_url":"https://arxiv.org/pdf/2410.03523v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02471v1","updated":"2024-12-03T14:29:47Z","published":"2024-12-03T14:29:47Z","title":"COMET:Combined Matrix for Elucidating Targets","summary":" Identifying the interaction targets of bioactive compounds is a foundational\nelement for deciphering their pharmacological effects. Target prediction\nalgorithms equip researchers with an effective tool to rapidly scope and\nexplore potential targets. Here, we introduce the COMET, a multi-technological\nmodular target prediction tool that provides comprehensive predictive insights,\nincluding similar active compounds, three-dimensional predicted binding modes,\nand probability scores, all within an average processing time of less than 10\nminutes per task. 
With meticulously curated data, the COMET database\nencompasses 990,944 drug-target interaction pairs and 45,035 binding pockets,\nenabling predictions for 2,685 targets, which span confirmed and exploratory\ntherapeutic targets for human diseases. In comparative testing using datasets\nfrom ChEMBL and BindingDB, COMET outperformed five other well-known algorithms,\noffering nearly an 80% probability of accurately identifying at least one true\ntarget within the top 15 predictions for a given compound. COMET also features\na user-friendly web server, accessible freely at\nhttps://www.pdbbind-plus.org.cn/comet.\n","authors":["Haojie Wang","Zhe Zhang","Haotian Gao","Xiangying Zhang","Zhihang Chen","Xinchong Chen","Yifei Qi","Yan Li","Renxiao Wang"],"pdf_url":"https://arxiv.org/pdf/2412.02471v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02467v1","updated":"2024-12-03T14:10:09Z","published":"2024-12-03T14:10:09Z","title":"DP-2Stage: Adapting Language Models as Differentially Private Tabular\n Data Generators","summary":" Generating tabular data under differential privacy (DP) protection ensures\ntheoretical privacy guarantees but poses challenges for training machine\nlearning models, primarily due to the need to capture complex structures under\nnoisy supervision signals. Recently, pre-trained Large Language Models (LLMs)\n-- even those at the scale of GPT-2 -- have demonstrated great potential in\nsynthesizing tabular data. However, their applications under DP constraints\nremain largely unexplored. In this work, we address this gap by applying DP\ntechniques to the generation of synthetic tabular data. Our findings shows that\nLLMs face difficulties in generating coherent text when fine-tuned with DP, as\nprivacy budgets are inefficiently allocated to non-private elements like table\nstructures. To overcome this, we propose \\ours, a two-stage fine-tuning\nframework for differentially private tabular data generation. 
The first stage\ninvolves non-private fine-tuning on a pseudo dataset, followed by DP\nfine-tuning on a private dataset. Our empirical results show that this approach\nimproves performance across various settings and metrics compared to directly\nfine-tuned LLMs in DP contexts. We release our code and setup at\nhttps://github.com/tejuafonja/DP-2Stage.\n","authors":["Tejumade Afonja","Hui-Po Wang","Raouf Kerkouche","Mario Fritz"],"pdf_url":"https://arxiv.org/pdf/2412.02467v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.01388v2","updated":"2024-12-03T14:08:19Z","published":"2024-12-02T11:21:58Z","title":"Harnessing Preference Optimisation in Protein LMs for Hit Maturation in\n Cell Therapy","summary":" Cell and immunotherapy offer transformative potential for treating diseases\nlike cancer and autoimmune disorders by modulating the immune system. The\ndevelopment of these therapies is resource-intensive, with the majority of drug\ncandidates failing to progress beyond laboratory testing. While recent advances\nin machine learning have revolutionised areas such as protein engineering,\napplications in immunotherapy remain limited due to the scarcity of\nlarge-scale, standardised datasets and the complexity of cellular systems. In\nthis work, we address these challenges by leveraging a high-throughput\nexperimental platform to generate data suitable for fine-tuning protein\nlanguage models. We demonstrate how models fine-tuned using a preference task\nshow surprising correlations to biological assays, and how they can be\nleveraged for few-shot hit maturation in CARs. This proof-of-concept presents a\nnovel pathway for applying ML to immunotherapy and could generalise to other\ntherapeutic modalities.\n","authors":["Katarzyna Janocha","Annabel Ling","Alice Godson","Yulia Lampi","Simon Bornschein","Nils Y. 
Hammerla"],"pdf_url":"https://arxiv.org/pdf/2412.01388v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18355v2","updated":"2024-12-03T14:07:34Z","published":"2024-03-27T08:48:16Z","title":"Supervised Multiple Kernel Learning approaches for multi-omics data\n integration","summary":" Advances in high-throughput technologies have originated an ever-increasing\navailability of omics datasets. The integration of multiple heterogeneous data\nsources is currently an issue for biology and bioinformatics. Multiple kernel\nlearning (MKL) has shown to be a flexible and valid approach to consider the\ndiverse nature of multi-omics inputs, despite being an underused tool in\ngenomic data mining. We provide novel MKL approaches based on different kernel\nfusion strategies. To learn from the meta-kernel of input kernels, we adapted\nunsupervised integration algorithms for supervised tasks with support vector\nmachines. We also tested deep learning architectures for kernel fusion and\nclassification. The results show that MKL-based models can outperform more\ncomplex, state-of-the-art, supervised multi-omics integrative approaches.\nMultiple kernel learning offers a natural framework for predictive models in\nmulti-omics data. It proved to provide a fast and reliable solution that can\ncompete with and outperform more complex architectures. 
Our results offer a\ndirection for bio-data mining research, biomarker discovery and further\ndevelopment of methods for heterogeneous data integration.\n","authors":["Mitja Briscik","Gabriele Tazza","Marie-Agnes Dillies","László Vidács","Sébastien Dejean"],"pdf_url":"https://arxiv.org/pdf/2403.18355v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02449v1","updated":"2024-12-03T13:34:42Z","published":"2024-12-03T13:34:42Z","title":"BYE: Build Your Encoder with One Sequence of Exploration Data for\n Long-Term Dynamic Scene Understanding","summary":" Dynamic scene understanding remains a persistent challenge in robotic\napplications. Early dynamic mapping methods focused on mitigating the negative\ninfluence of short-term dynamic objects on camera motion estimation by masking\nor tracking specific categories, which often fall short in adapting to\nlong-term scene changes. Recent efforts address object association in long-term\ndynamic environments using neural networks trained on synthetic datasets, but\nthey still rely on predefined object shapes and categories. Other methods\nincorporate visual, geometric, or semantic heuristics for the association but\noften lack robustness. In this work, we introduce BYE, a class-agnostic,\nper-scene point cloud encoder that removes the need for predefined categories,\nshape priors, or extensive association datasets. Trained on only a single\nsequence of exploration data, BYE can efficiently perform object association in\ndynamically changing scenes. We further propose an ensembling scheme combining\nthe semantic strengths of Vision Language Models (VLMs) with the scene-specific\nexpertise of BYE, achieving a 7% improvement and a 95% success rate in object\nassociation tasks. 
Code and dataset are available at\nhttps://byencoder.github.io.\n","authors":["Chenguang Huang","Shengchao Yan","Wolfram Burgard"],"pdf_url":"https://arxiv.org/pdf/2412.02449v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02441v1","updated":"2024-12-03T13:25:18Z","published":"2024-12-03T13:25:18Z","title":"Artificial Expert Intelligence through PAC-reasoning","summary":" Artificial Expert Intelligence (AEI) seeks to transcend the limitations of\nboth Artificial General Intelligence (AGI) and narrow AI by integrating\ndomain-specific expertise with critical, precise reasoning capabilities akin to\nthose of top human experts. Existing AI systems often excel at predefined tasks\nbut struggle with adaptability and precision in novel problem-solving. To\novercome this, AEI introduces a framework for ``Probably Approximately Correct\n(PAC) Reasoning\". This paradigm provides robust theoretical guarantees for\nreliably decomposing complex problems, with a practical mechanism for\ncontrolling reasoning precision. In reference to the division of human thought\ninto System 1 for intuitive thinking and System 2 for reflective\nreasoning~\\citep{tversky1974judgment}, we refer to this new type of reasoning\nas System 3 for precise reasoning, inspired by the rigor of the scientific\nmethod. AEI thus establishes a foundation for error-bounded, inference-time\nlearning.\n","authors":["Shai Shalev-Shwartz","Amnon Shashua","Gal Beniamini","Yoav Levine","Or Sharir","Noam Wies","Ido Ben-Shaul","Tomer Nussbaum","Shir Granot Peled"],"pdf_url":"https://arxiv.org/pdf/2412.02441v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02439v1","updated":"2024-12-03T13:21:09Z","published":"2024-12-03T13:21:09Z","title":"Nature versus nurture in galaxy formation: the effect of environment on\n star formation with causal machine learning","summary":" Understanding how galaxies form and evolve is at the heart of modern\nastronomy. 
With the advent of large-scale surveys and simulations, remarkable\nprogress has been made in the last few decades. Despite this, the physical\nprocesses behind the phenomena, and particularly their importance, remain far\nfrom known, as correlations have primarily been established rather than the\nunderlying causality. We address this challenge by applying the causal\ninference framework. Specifically, we tackle the fundamental open question of\nwhether galaxy formation and evolution depends more on nature (i.e., internal\nprocesses) or nurture (i.e., external processes), by estimating the causal\neffect of environment on star-formation rate in the IllustrisTNG simulations.\nTo do so, we develop a comprehensive causal model and employ cutting-edge\ntechniques from epidemiology to overcome the long-standing problem of\ndisentangling nature and nurture. We find that the causal effect is negative\nand substantial, with environment suppressing the SFR by a maximal factor of\n$\\sim100$. While the overall effect at $z=0$ is negative, in the early\nuniverse, environment is discovered to have a positive impact, boosting star\nformation by a factor of $\\sim10$ at $z\\sim1$ and by even greater amounts at\nhigher redshifts. Furthermore, we show that: (i) nature also plays an important\nrole, as ignoring it underestimates the causal effect in intermediate-density\nenvironments by a factor of $\\sim2$, (ii) controlling for the stellar mass at a\nsnapshot in time, as is common in the literature, is not only insufficient to\ndisentangle nature and nurture but actually has an adverse effect, though (iii)\nstellar mass is an adequate proxy of the effects of nature. Finally, this work\nmay prove a useful blueprint for extracting causal insights in other fields\nthat deal with dynamical systems with closed feedback loops, such as the\nEarth's climate.\n","authors":["Sunil Mucesh","William G. Hartley","Ciarán M. 
Gilligan-Lee","Ofer Lahav"],"pdf_url":"https://arxiv.org/pdf/2412.02439v1.pdf","comment":"16 pages, 4 figures"},{"id":"http://arxiv.org/abs/2412.02432v1","updated":"2024-12-03T12:57:08Z","published":"2024-12-03T12:57:08Z","title":"Improved Localized Machine Unlearning Through the Lens of Memorization","summary":" Machine unlearning refers to removing the influence of a specified subset of\ntraining data from a machine learning model, efficiently, after it has already\nbeen trained. This is important for key applications, including making the\nmodel more accurate by removing outdated, mislabeled, or poisoned data. In this\nwork, we study localized unlearning, where the unlearning algorithm operates on\na (small) identified subset of parameters. Drawing inspiration from the\nmemorization literature, we propose an improved localization strategy that\nyields strong results when paired with existing unlearning algorithms. We also\npropose a new unlearning algorithm, Deletion by Example Localization (DEL),\nthat resets the parameters deemed-to-be most critical according to our\nlocalization strategy, and then finetunes them. Our extensive experiments on\ndifferent datasets, forget sets and metrics reveal that DEL sets a new\nstate-of-the-art for unlearning metrics, against both localized and\nfull-parameter methods, while modifying a small subset of parameters, and\noutperforms the state-of-the-art localized unlearning in terms of test accuracy\ntoo.\n","authors":["Reihaneh Torkzadehmahani","Reza Nasirigerdeh","Georgios Kaissis","Daniel Rueckert","Gintare Karolina Dziugaite","Eleni Triantafillou"],"pdf_url":"https://arxiv.org/pdf/2412.02432v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02430v1","updated":"2024-12-03T12:52:04Z","published":"2024-12-03T12:52:04Z","title":"Transformer-based Koopman Autoencoder for Linearizing Fisher's Equation","summary":" A Transformer-based Koopman autoencoder is proposed for linearizing Fisher's\nreaction-diffusion equation. 
The primary focus of this study is on using deep\nlearning techniques to find complex spatiotemporal patterns in the\nreaction-diffusion system. The emphasis is on not just solving the equation but\nalso transforming the system's dynamics into a more comprehensible, linear\nform. Global coordinate transformations are achieved through the autoencoder,\nwhich learns to capture the underlying dynamics by training on a dataset with\n60,000 initial conditions. Extensive testing on multiple datasets was used to\nassess the efficacy of the proposed model, demonstrating its ability to\naccurately predict the system's evolution as well as to generalize. We provide\na thorough comparison study, comparing our suggested design to a few other\ncomparable methods using experiments on various PDEs, such as the\nKuramoto-Sivashinsky equation and the Burger's equation. Results show improved\naccuracy, highlighting the capabilities of the Transformer-based Koopman\nautoencoder. The proposed architecture in is significantly ahead of other\narchitectures, in terms of solving different types of PDEs using a single\narchitecture. Our method relies entirely on the data, without requiring any\nknowledge of the underlying equations. This makes it applicable to even the\ndatasets where the governing equations are not known.\n","authors":["Kanav Singh Rana","Nitu Kumari"],"pdf_url":"https://arxiv.org/pdf/2412.02430v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02423v1","updated":"2024-12-03T12:38:53Z","published":"2024-12-03T12:38:53Z","title":"Time-Series-Informed Closed-loop Learning for Sequential Decision Making\n and Control","summary":" Closed-loop performance of sequential decision making algorithms, such as\nmodel predictive control, depends strongly on the parameters of cost functions,\nmodels, and constraints. Bayesian optimization is a common approach to learning\nthese parameters based on closed-loop experiments. 
However, traditional\nBayesian optimization approaches treat the learning problem as a black box,\nignoring valuable information and knowledge about the structure of the\nunderlying problem, resulting in slow convergence and high experimental\nresource use. We propose a time-series-informed optimization framework that\nincorporates intermediate performance evaluations from early iterations of each\nexperimental episode into the learning procedure. Additionally, probabilistic\nearly stopping criteria are proposed to terminate unpromising experiments,\nsignificantly reducing experimental time. Simulation results show that our\napproach achieves baseline performance with approximately half the resources.\nMoreover, with the same resource budget, our approach outperforms the baseline\nin terms of final closed-loop performance, highlighting its efficiency in\nsequential decision making scenarios.\n","authors":["Sebastian Hirt","Lukas Theiner","Rolf Findeisen"],"pdf_url":"https://arxiv.org/pdf/2412.02423v1.pdf","comment":"12 pages, 3 figures, submitted to L4DC 2025"},{"id":"http://arxiv.org/abs/2412.02412v1","updated":"2024-12-03T12:12:03Z","published":"2024-12-03T12:12:03Z","title":"VISTA: A Panoramic View of Neural Representations","summary":" We present VISTA (Visualization of Internal States and Their Associations), a\nnovel pipeline for visually exploring and interpreting neural network\nrepresentations. VISTA addresses the challenge of analyzing vast\nmultidimensional spaces in modern machine learning models by mapping\nrepresentations into a semantic 2D space. The resulting collages visually\nreveal patterns and relationships within internal representations. We\ndemonstrate VISTA's utility by applying it to sparse autoencoder latents\nuncovering new properties and interpretations. 
We review the VISTA methodology,\npresent findings from our case study ( https://got.drib.net/latents/ ), and\ndiscuss implications for neural network interpretability across various domains\nof machine learning.\n","authors":["Tom White"],"pdf_url":"https://arxiv.org/pdf/2412.02412v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02408v1","updated":"2024-12-03T12:03:13Z","published":"2024-12-03T12:03:13Z","title":"Leveraging Ensemble-Based Semi-Supervised Learning for Illicit Account\n Detection in Ethereum DeFi Transactions","summary":" The advent of smart contracts has enabled the rapid rise of Decentralized\nFinance (DeFi) on the Ethereum blockchain, offering substantial rewards in\nfinancial innovation and inclusivity. However, this growth has also introduced\nsignificant security risks, including the proliferation of illicit accounts\ninvolved in fraudulent activities. Traditional detection methods are limited by\nthe scarcity of labeled data and the evolving tactics of malicious actors. In\nthis paper, we propose a novel Self-Learning Ensemble-based Illicit account\nDetection (SLEID) framework to address these challenges. SLEID employs an\nIsolation Forest for initial outlier detection and a self-training mechanism to\niteratively generate pseudo-labels for unlabeled accounts, thereby enhancing\ndetection accuracy. Extensive experiments demonstrate that SLEID significantly\noutperforms traditional supervised approaches and recent semi-supervised\nmodels, achieving superior precision, recall, and F1-scores, particularly in\ndetecting illicit accounts. 
Compared to state-of-the-art methods, our approach\nachieves better detection performance while reducing reliance on labeled data.\nThe results affirm SLEID's efficacy as a robust solution for safeguarding the\nDeFi ecosystem and mitigating risks posed by malicious accounts.\n","authors":["Shabnam Fazliani","Mohammad Mowlavi Sorond","Arsalan Masoudifard"],"pdf_url":"https://arxiv.org/pdf/2412.02408v1.pdf","comment":"12 pages, 6 figures"},{"id":"http://arxiv.org/abs/2412.02403v1","updated":"2024-12-03T11:53:05Z","published":"2024-12-03T11:53:05Z","title":"3D Face Reconstruction From Radar Images","summary":" The 3D reconstruction of faces gains wide attention in computer vision and is\nused in many fields of application, for example, animation, virtual reality,\nand even forensics. This work is motivated by monitoring patients in sleep\nlaboratories. Due to their unique characteristics, sensors from the radar\ndomain have advantages compared to optical sensors, namely penetration of\nelectrically non-conductive materials and independence of light. These\nadvantages of radar signals unlock new applications and require adaptation of\n3D reconstruction frameworks. We propose a novel model-based method for 3D\nreconstruction from radar images. We generate a dataset of synthetic radar\nimages with a physics-based but non-differentiable radar renderer. This dataset\nis used to train a CNN-based encoder to estimate the parameters of a 3D\nmorphable face model. Whilst the encoder alone already leads to strong\nreconstructions of synthetic data, we extend our reconstruction in an\nAnalysis-by-Synthesis fashion to a model-based autoencoder. This is enabled by\nlearning the rendering process in the decoder, which acts as an object-specific\ndifferentiable radar renderer. Subsequently, the combination of both network\nparts is trained to minimize both, the loss of the parameters and the loss of\nthe resulting reconstructed radar image. 
This leads to the additional benefit,\nthat at test time the parameters can be further optimized by finetuning the\nautoencoder unsupervised on the image loss. We evaluated our framework on\ngenerated synthetic face images as well as on real radar images with 3D ground\ntruth of four individuals.\n","authors":["Valentin Braeutigam","Vanessa Wirth","Ingrid Ullmann","Christian Schüßler","Martin Vossiek","Matthias Berking","Bernhard Egger"],"pdf_url":"https://arxiv.org/pdf/2412.02403v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02399v1","updated":"2024-12-03T11:49:01Z","published":"2024-12-03T11:49:01Z","title":"OMENN: One Matrix to Explain Neural Networks","summary":" Deep Learning (DL) models are often black boxes, making their decision-making\nprocesses difficult to interpret. This lack of transparency has driven\nadvancements in eXplainable Artificial Intelligence (XAI), a field dedicated to\nclarifying the reasoning behind DL model predictions. Among these,\nattribution-based methods such as LRP and GradCAM are widely used, though they\nrely on approximations that can be imprecise.\n To address these limitations, we introduce One Matrix to Explain Neural\nNetworks (OMENN), a novel post-hoc method that represents a neural network as a\nsingle, interpretable matrix for each specific input. This matrix is\nconstructed through a series of linear transformations that represent the\nprocessing of the input by each successive layer in the neural network. As a\nresult, OMENN provides locally precise, attribution-based explanations of the\ninput across various modern models, including ViTs and CNNs. 
We present a\ntheoretical analysis of OMENN based on dynamic linearity property and validate\nits effectiveness with extensive tests on two XAI benchmarks, demonstrating\nthat OMENN is competitive with state-of-the-art methods.\n","authors":["Adam Wróbel","Mikołaj Janusz","Bartosz Zieliński","Dawid Rymarczyk"],"pdf_url":"https://arxiv.org/pdf/2412.02399v1.pdf","comment":"Under review, code will be released after acceptance"},{"id":"http://arxiv.org/abs/2303.04613v5","updated":"2024-12-03T11:48:24Z","published":"2023-03-08T14:32:59Z","title":"The Descriptive Complexity of Graph Neural Networks","summary":" We analyse the power of graph neural networks (GNNs) in terms of Boolean\ncircuit complexity and descriptive complexity.\n We prove that the graph queries that can be computed by a polynomial-size\nbounded-depth family of GNNs are exactly those definable in the guarded\nfragment GFO+C of first-order logic with counting and with built-in relations.\nThis puts GNNs in the circuit complexity class (non-uniform) $\\text{TC}^0$.\nRemarkably, the GNN families may use arbitrary real weights and a wide class of\nactivation functions that includes the standard ReLU, logistic \"sigmoid\", and\nhyperbolic tangent functions. If the GNNs are allowed to use random\ninitialisation and global readout (both standard features of GNNs widely used\nin practice), they can compute exactly the same queries as bounded depth\nBoolean circuits with threshold gates, that is, exactly the queries in\n$\\text{TC}^0$.\n Moreover, we show that queries computable by a single GNN with piecewise\nlinear activations and rational weights are definable in GFO+C without built-in\nrelations. 
Therefore, they are contained in uniform $\\text{TC}^0$.\n","authors":["Martin Grohe"],"pdf_url":"https://arxiv.org/pdf/2303.04613v5.pdf","comment":"Journal version for TheoretiCS"},{"id":"http://arxiv.org/abs/2003.12366v2","updated":"2024-12-03T11:13:27Z","published":"2020-03-22T11:21:29Z","title":"Training for Speech Recognition on Coprocessors","summary":" Automatic Speech Recognition (ASR) has increased in popularity in recent\nyears. The evolution of processor and storage technologies has enabled more\nadvanced ASR mechanisms, fueling the development of virtual assistants such as\nAmazon Alexa, Apple Siri, Microsoft Cortana, and Google Home. The interest in\nsuch assistants, in turn, has amplified the novel developments in ASR research.\nHowever, despite this popularity, there has not been a detailed training\nefficiency analysis of modern ASR systems. This mainly stems from: the\nproprietary nature of many modern applications that depend on ASR, like the\nones listed above; the relatively expensive co-processor hardware that is used\nto accelerate ASR by big vendors to enable such applications; and the absence\nof well-established benchmarks. The goal of this paper is to address the latter\ntwo of these challenges. The paper first describes an ASR model, based on a\ndeep neural network inspired by recent work in this domain, and our experiences\nbuilding it. Then we evaluate this model on three CPU-GPU co-processor\nplatforms that represent different budget categories. Our results demonstrate\nthat utilizing hardware acceleration yields good results even without high-end\nequipment. While the most expensive platform (10X price of the least expensive\none) converges to the initial accuracy target 10-30% and 60-70% faster than the\nother two, the differences among the platforms almost disappear at slightly\nhigher accuracy targets. 
In addition, our results further highlight both the\ndifficulty of evaluating ASR systems due to the complex, long, and resource\nintensive nature of the model training in this domain, and the importance of\nestablishing benchmarks for ASR.\n","authors":["Sebastian Baunsgaard","Sebastian B. Wrede","Pınar Tozun"],"pdf_url":"https://arxiv.org/pdf/2003.12366v2.pdf","comment":"published at ADMS 2020"},{"id":"http://arxiv.org/abs/2403.16970v3","updated":"2024-12-03T11:09:31Z","published":"2024-03-25T17:31:12Z","title":"Enhancing joint automatic chest X-ray diagnosis and clinical visual\n attention prediction with multi-stage cooperative learning","summary":" Purpose: As visual inspection is an inherent process during radiological\nscreening, the associated eye gaze data can provide valuable insights into\nrelevant clinical decisions. As deep learning has become the state-of-the-art\nfor computer-assisted diagnosis, integrating human behavior, such as eye gaze\ndata, into these systems is instrumental to help align machine predictions with\nclinical diagnostic criteria, thus enhancing the quality of automatic\nradiological diagnosis. Methods: We propose a novel deep learning framework for\njoint disease diagnosis and prediction of corresponding clinical visual\nattention maps for chest X-ray scans. Specifically, we introduce a new\ndual-encoder multi-task UNet, which leverages both a DenseNet201 backbone and a\nResidual and Squeeze-and-Excitation block-based encoder to extract diverse\nfeatures for visual attention map prediction, and a multi-scale feature-fusion\nclassifier to perform disease classification. To tackle the issue of\nasynchronous training schedules of individual tasks in multi-task learning, we\nproposed a multi-stage cooperative learning strategy, with contrastive learning\nfor feature encoder pretraining to boost performance. 
Results: Our proposed\nmethod is shown to significantly outperform existing techniques for chest X-ray\ndiagnosis (AUC=0.93) and the quality of visual attention map prediction\n(Correlation coefficient=0.58). Conclusion: Benefiting from the proposed\nmulti-task multi-stage cooperative learning, our technique demonstrates the\nbenefit of integrating clinicians' eye gaze into clinical AI systems to boost\nperformance and potentially explainability.\n","authors":["Zirui Qiu","Hassan Rivaz","Yiming Xiao"],"pdf_url":"https://arxiv.org/pdf/2403.16970v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08488v2","updated":"2024-12-03T11:06:03Z","published":"2024-08-16T02:17:21Z","title":"PITN: Physics-Informed Temporal Networks for Cuffless Blood Pressure\n Estimation","summary":" Monitoring blood pressure with non-invasive sensors has gained popularity for\nproviding comfortable user experiences, one of which is a significant function\nof smart wearables. Although providing a comfortable user experience, such\nmethods are suffering from the demand for a significant amount of realistic\ndata to train an individual model for each subject, especially considering the\ninvasive or obtrusive BP ground-truth measurements. To tackle this challenge,\nwe introduce a novel physics-informed temporal network~(PITN) with adversarial\ncontrastive learning to enable precise BP estimation with very limited data.\nSpecifically, we first enhance the physics-informed neural network~(PINN) with\nthe temporal block for investigating BP dynamics' multi-periodicity for\npersonal cardiovascular cycle modeling and temporal variation. We then employ\nadversarial training to generate extra physiological time series data,\nimproving PITN's robustness in the face of sparse subject-specific training\ndata. Furthermore, we utilize contrastive learning to capture the\ndiscriminative variations of cardiovascular physiologic phenomena. 
This\napproach aggregates physiological signals with similar blood pressure values in\nlatent space while separating clusters of samples with dissimilar blood\npressure values. Experiments on three widely-adopted datasets with different\nmodailties (\\emph{i.e.,} bioimpedance, PPG, millimeter-wave) demonstrate the\nsuperiority and effectiveness of the proposed methods over previous\nstate-of-the-art approaches. The code is available\nat~\\url{https://github.com/Zest86/ACL-PITN}.\n","authors":["Rui Wang","Mengshi Qi","Yingxia Shao","Anfu Zhou","Huadong Ma"],"pdf_url":"https://arxiv.org/pdf/2408.08488v2.pdf","comment":"12 pages, 6 figures"},{"id":"http://arxiv.org/abs/2412.02372v1","updated":"2024-12-03T10:58:34Z","published":"2024-12-03T10:58:34Z","title":"HERO: Hint-Based Efficient and Reliable Query Optimizer","summary":" We propose a novel model for learned query optimization which provides query\nhints leading to better execution plans. The model addresses the three key\nchallenges in learned hint-based query optimization: reliable hint\nrecommendation (ensuring non-degradation of query latency), efficient hint\nexploration, and fast inference. We provide an in-depth analysis of existing\nNN-based approaches to hint-based optimization and experimentally confirm the\nnamed challenges for them. Our alternative solution consists of a new inference\nschema based on an ensemble of context-aware models and a graph storage for\nreliable hint suggestion and fast inference, and a budget-controlled training\nprocedure with a local search algorithm that solves the issue of exponential\nsearch space exploration. In experiments on standard benchmarks, our model\ndemonstrates optimization capability close to the best achievable with\ncoarse-grained hints. Controlling the degree of parallelism (query dop) in\naddition to operator-related hints enables our model to achieve 3x latency\nimprovement on JOB benchmark which sets a new standard for optimization. 
Our\nmodel is interpretable and easy to debug, which is particularly important for\ndeployment in production.\n","authors":["Sergey Zinchenko","Sergey Iazov"],"pdf_url":"https://arxiv.org/pdf/2412.02372v1.pdf","comment":"Submitted to VLDB 2025; 13 pages; 13 figures"},{"id":"http://arxiv.org/abs/2412.02352v1","updated":"2024-12-03T10:17:15Z","published":"2024-12-03T10:17:15Z","title":"LoRA Diffusion: Zero-Shot LoRA Synthesis for Diffusion Model\n Personalization","summary":" Low-Rank Adaptation (LoRA) and other parameter-efficient fine-tuning (PEFT)\nmethods provide low-memory, storage-efficient solutions for personalizing\ntext-to-image models. However, these methods offer little to no improvement in\nwall-clock training time or the number of steps needed for convergence compared\nto full model fine-tuning. While PEFT methods assume that shifts in generated\ndistributions (from base to fine-tuned models) can be effectively modeled\nthrough weight changes in a low-rank subspace, they fail to leverage knowledge\nof common use cases, which typically focus on capturing specific styles or\nidentities. Observing that desired outputs often comprise only a small subset\nof the possible domain covered by LoRA training, we propose reducing the search\nspace by incorporating a prior over regions of interest. 
We demonstrate that\ntraining a hypernetwork model to generate LoRA weights can achieve competitive\nquality for specific domains while enabling near-instantaneous conditioning on\nuser input, in contrast to traditional training methods that require thousands\nof steps.\n","authors":["Ethan Smith","Rami Seid","Alberto Hojel","Paramita Mishra","Jianbo Wu"],"pdf_url":"https://arxiv.org/pdf/2412.02352v1.pdf","comment":"9 pages, 6 figures"},{"id":"http://arxiv.org/abs/2412.02340v1","updated":"2024-12-03T10:03:12Z","published":"2024-12-03T10:03:12Z","title":"Federated Analytics in Practice: Engineering for Privacy, Scalability\n and Practicality","summary":" Cross-device Federated Analytics (FA) is a distributed computation paradigm\ndesigned to answer analytics queries about and derive insights from data held\nlocally on users' devices. On-device computations combined with other privacy\nand security measures ensure that only minimal data is transmitted off-device,\nachieving a high standard of data protection. Despite FA's broad relevance, the\napplicability of existing FA systems is limited by compromised accuracy; lack\nof flexibility for data analytics; and an inability to scale effectively. In\nthis paper, we describe our approach to combine privacy, scalability, and\npracticality to build and deploy a system that overcomes these limitations. Our\nFA system leverages trusted execution environments (TEEs) and optimizes the use\nof on-device computing resources to facilitate federated data processing across\nlarge fleets of devices, while ensuring robust, defensible, and verifiable\nprivacy safeguards. 
We focus on federated analytics (statistics and\nmonitoring), in contrast to systems for federated learning (ML workloads), and\nwe flag the key differences.\n","authors":["Harish Srinivas","Graham Cormode","Mehrdad Honarkhah","Samuel Lurye","Jonathan Hehir","Lunwen He","George Hong","Ahmed Magdy","Dzmitry Huba","Kaikai Wang","Shen Guo","Shoubhik Bhattacharya"],"pdf_url":"https://arxiv.org/pdf/2412.02340v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.01464v2","updated":"2024-12-03T10:01:06Z","published":"2024-10-02T12:16:46Z","title":"Flow Matching for Accelerated Simulation of Atomic Transport in\n Materials","summary":" We introduce LiFlow, a generative framework to accelerate molecular dynamics\n(MD) simulations for crystalline materials that formulates the task as\nconditional generation of atomic displacements. The model uses flow matching,\nwith a Propagator submodel to generate atomic displacements and a Corrector to\nlocally correct unphysical geometries, and incorporates an adaptive prior based\non the Maxwell-Boltzmann distribution to account for chemical and thermal\nconditions. We benchmark LiFlow on a dataset comprising 25-ps trajectories of\nlithium diffusion across 4,186 solid-state electrolyte (SSE) candidates at four\ntemperatures. The model obtains a consistent Spearman rank correlation of\n0.7-0.8 for lithium mean squared displacement (MSD) predictions on unseen\ncompositions. 
Furthermore, LiFlow generalizes from short training trajectories\nto larger supercells and longer simulations while maintaining high accuracy.\nWith speed-ups of up to 600,000$\\times$ compared to first-principles methods,\nLiFlow enables scalable simulations at significantly larger length and time\nscales.\n","authors":["Juno Nam","Sulin Liu","Gavin Winter","KyuJung Jun","Soojung Yang","Rafael Gómez-Bombarelli"],"pdf_url":"https://arxiv.org/pdf/2410.01464v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02335v1","updated":"2024-12-03T09:55:00Z","published":"2024-12-03T09:55:00Z","title":"An Adaptive Grasping Force Tracking Strategy for Nonlinear and\n Time-Varying Object Behaviors","summary":" Accurate grasp force control is one of the key skills for ensuring successful\nand damage-free robotic grasping of objects. Although existing methods have\nconducted in-depth research on slip detection and grasping force planning, they\noften overlook the issue of adaptive tracking of the actual force to the target\nforce when handling objects with different material properties. The optimal\nparameters of a force tracking controller are significantly influenced by the\nobject's stiffness, and many adaptive force tracking algorithms rely on\nstiffness estimation. However, real-world objects often exhibit viscous,\nplastic, or other more complex nonlinear time-varying behaviors, and existing\nstudies provide insufficient support for these materials in terms of stiffness\ndefinition and estimation. To address this, this paper introduces the concept\nof generalized stiffness, extending the definition of stiffness to nonlinear\ntime-varying grasp system models, and proposes an online generalized stiffness\nestimator based on Long Short-Term Memory (LSTM) networks. Based on generalized\nstiffness, this paper proposes an adaptive parameter adjustment strategy using\na PI controller as an example, enabling dynamic force tracking for objects with\nvarying characteristics. 
Experimental results demonstrate that the proposed\nmethod achieves high precision and short probing time, while showing better\nadaptability to non-ideal objects compared to existing methods. The method\neffectively solves the problem of grasp force tracking in unknown, nonlinear,\nand time-varying grasp systems, enhancing the robotic grasping ability in\nunstructured environments.\n","authors":["Ziyang Cheng","Xiangyu Tian","Ruomin Sui","Tiemin Li","Yao Jiang"],"pdf_url":"https://arxiv.org/pdf/2412.02335v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04346v2","updated":"2024-12-03T09:54:40Z","published":"2023-12-07T15:06:06Z","title":"Detection and Imputation based Two-Stage Denoising Diffusion Power\n System Measurement Recovery under Cyber-Physical Uncertainties","summary":" Power system cyber-physical uncertainties, including measurement ambiguities\nstemming from cyber attacks and data losses, along with system uncertainties\nintroduced by massive renewables and complex dynamics, reduce the likelihood of\nenhancing the quality of measurements. Fortunately, denoising diffusion models\nexhibit powerful learning and generation abilities for the complex underlying\nphysics of the real world. To this end, this paper proposes an improved\ndetection and imputation based two-stage denoising diffusion model (TSDM) to\nidentify and reconstruct the measurements with various cyber-physical\nuncertainties. The first stage of the model comprises a classifier-guided\nconditional anomaly detection component, while the second stage involves\ndiffusion-based measurement imputation component. Moreover, the proposed TSDM\nadopts optimal variance to accelerate the diffusion generation process with\nsubsequence sampling. 
Extensive numerical case studies demonstrate that the\nproposed TSDM can accurately recover power system measurements despite\nrenewables-induced strong randomness and highly nonlinear dynamics.\nAdditionally, the proposed TSDM has stronger robustness compared to existing\nreconstruction networks and exhibits lower computational complexity than\ngeneral denoising diffusion models.\n","authors":["Jianhua Pei","Jingyu Wang","Dongyuan Shi","Ping Wang"],"pdf_url":"https://arxiv.org/pdf/2312.04346v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.06644v3","updated":"2024-12-03T09:49:07Z","published":"2024-06-09T23:39:31Z","title":"Latent Diffusion Model-Enabled Low-Latency Semantic Communication in the\n Presence of Semantic Ambiguities and Wireless Channel Noises","summary":" Deep learning (DL)-based Semantic Communications (SemCom) is becoming\ncritical to maximize overall efficiency of communication networks.\nNevertheless, SemCom is sensitive to wireless channel uncertainties, source\noutliers, and suffer from poor generalization bottlenecks. To address the\nmentioned challenges, this paper develops a latent diffusion model-enabled\nSemCom system with three key contributions, i.e., i) to handle potential\noutliers in the source data, semantic errors obtained by projected gradient\ndescent based on the vulnerabilities of DL models, are utilized to update the\nparameters and obtain an outlier-robust encoder, ii) a lightweight single-layer\nlatent space transformation adapter completes one-shot learning at the\ntransmitter and is placed before the decoder at the receiver, enabling\nadaptation for out-of-distribution data and enhancing human-perceptual quality,\nand iii) an end-to-end consistency distillation (EECD) strategy is used to\ndistill the diffusion models trained in latent space, enabling deterministic\nsingle or few-step low-latency denoising in various noisy channels while\nmaintaining high semantic quality. 
Extensive numerical experiments across\ndifferent datasets demonstrate the superiority of the proposed SemCom system,\nconsistently proving its robustness to outliers, the capability to transmit\ndata with unknown distributions, and the ability to perform real-time channel\ndenoising tasks while preserving high human perceptual quality, outperforming\nthe existing denoising approaches in semantic metrics like learned perceptual\nimage path similarity (LPIPS).\n","authors":["Jianhua Pei","Cheng Feng","Ping Wang","Hina Tabassum","Dongyuan Shi"],"pdf_url":"https://arxiv.org/pdf/2406.06644v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02331v1","updated":"2024-12-03T09:48:28Z","published":"2024-12-03T09:48:28Z","title":"Sample Efficient Robot Learning in Supervised Effect Prediction Tasks","summary":" In self-supervised robot learning, robots actively explore their environments\nand generate data by acting on entities in the environment. Therefore, an\nexploration policy is desired that ensures sample efficiency to minimize robot\nexecution costs while still providing accurate learning. For this purpose, the\nrobotic community has adopted Intrinsic Motivation (IM)-based approaches such\nas Learning Progress (LP). On the machine learning front, Active Learning (AL)\nhas been used successfully, especially for classification tasks. In this work,\nwe develop a novel AL framework geared towards robotics regression tasks, such\nas action-effect prediction and, more generally, for world model learning,\nwhich we call MUSEL - Model Uncertainty for Sample Efficient Learning. MUSEL\naims to extract model uncertainty from the total uncertainty estimate given by\na suitable learning engine by making use of earning progress and input\ndiversity and use it to improve sample efficiency beyond the state-of-the-art\naction-effect prediction methods. 
We demonstrate the feasibility of our model\nby using a Stochastic Variational Gaussian Process (SVGP) as the learning\nengine and testing the system on a set of robotic experiments in simulation.\nThe efficacy of MUSEL is demonstrated by comparing its performance to standard\nmethods used in robot action-effect learning. In a robotic tabletop environment\nin which a robot manipulator is tasked with learning the effect of its actions,\nthe experiments show that MUSEL facilitates higher accuracy in learning action\neffects while ensuring sample efficiency.\n","authors":["Mehmet Arda Eren","Erhan Oztop"],"pdf_url":"https://arxiv.org/pdf/2412.02331v1.pdf","comment":"18 pages, 18 figures"},{"id":"http://arxiv.org/abs/2404.10746v3","updated":"2024-12-03T09:47:45Z","published":"2024-04-16T17:24:22Z","title":"Interpolation and differentiation of alchemical degrees of freedom in\n machine learning interatomic potentials","summary":" Machine learning interatomic potentials (MLIPs) have become a workhorse of\nmodern atomistic simulations, and recently published universal MLIPs,\npre-trained on large datasets, have demonstrated remarkable accuracy and\ngeneralizability. However, the computational cost of MLIPs limits their\napplicability to chemically disordered systems requiring large simulation cells\nor to sample-intensive statistical methods. Here, we report the use of\ncontinuous and differentiable alchemical degrees of freedom in atomistic\nmaterials simulations, exploiting the fact that graph neural network MLIPs\nrepresent discrete elements as real-valued tensors. The proposed method\nintroduces alchemical atoms with corresponding weights into the input graph,\nalongside modifications to the message-passing and readout mechanisms of MLIPs,\nand allows smooth interpolation between the compositional states of materials.\nThe end-to-end differentiability of MLIPs enables efficient calculation of the\ngradient of energy with respect to the compositional weights. 
With this\nmodification, we propose methodologies for optimizing the composition of solid\nsolutions towards target macroscopic properties, characterizing order and\ndisorder in multicomponent oxides, and conducting alchemical free energy\nsimulations to quantify the free energy of vacancy formation and composition\nchanges. The approach offers an avenue for extending the capabilities of\nuniversal MLIPs in the modeling of compositional disorder and characterizing\nthe phase stability of complex materials systems.\n","authors":["Juno Nam","Jiayu Peng","Rafael Gómez-Bombarelli"],"pdf_url":"https://arxiv.org/pdf/2404.10746v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02328v1","updated":"2024-12-03T09:42:16Z","published":"2024-12-03T09:42:16Z","title":"Efficient Model Compression Techniques with FishLeg","summary":" In many domains, the most successful AI models tend to be the largest, indeed\noften too large to be handled by AI players with limited computational\nresources. To mitigate this, a number of compression methods have been\ndeveloped, including methods that prune the network down to high sparsity\nwhilst retaining performance. The best-performing pruning techniques are often\nthose that use second-order curvature information (such as an estimate of the\nFisher information matrix) to score the importance of each weight and to\npredict the optimal compensation for weight deletion. However, these methods\nare difficult to scale to high-dimensional parameter spaces without making\nheavy approximations. Here, we propose the FishLeg surgeon (FLS), a new\nsecond-order pruning method based on the Fisher-Legendre (FishLeg) optimizer.\nAt the heart of FishLeg is a meta-learning approach to amortising the action of\nthe inverse FIM, which brings a number of advantages. 
Firstly, the\nparameterisation enables the use of flexible tensor factorisation techniques to\nimprove computational and memory efficiency without sacrificing much accuracy,\nalleviating challenges associated with scalability of most second-order pruning\nmethods. Secondly, directly estimating the inverse FIM leads to less\nsensitivity to the amplification of stochasticity during inversion, thereby\nresulting in more precise estimates. Thirdly, our approach also allows for\nprogressive assimilation of the curvature into the parameterisation. In the\ngradual pruning regime, this results in a more efficient estimate refinement as\nopposed to re-estimation. We find that FishLeg achieves higher or comparable\nperformance against two common baselines in the area, most notably in the high\nsparsity regime when considering a ResNet18 model on CIFAR-10 (84% accuracy at\n95% sparsity vs 60% for OBS) and TinyIM (53% accuracy at 80% sparsity vs 48%\nfor OBS).\n","authors":["Jamie McGowan","Wei Sheng Lai","Weibin Chen","Henry Aldridge","Jools Clarke","Jezabel Garcia","Rui Xia","Yilei Liang","Guillaume Hennequin","Alberto Bernacchia"],"pdf_url":"https://arxiv.org/pdf/2412.02328v1.pdf","comment":"Published in NeurIPS 2024 - Neural Compression Workshop, 13 pages, 6\n figures"},{"id":"http://arxiv.org/abs/2412.02327v1","updated":"2024-12-03T09:40:59Z","published":"2024-12-03T09:40:59Z","title":"Switchable deep beamformer for high-quality and real-time passive\n acoustic mapping","summary":" Passive acoustic mapping (PAM) is a promising tool for monitoring acoustic\ncavitation activities in the applications of ultrasound therapy. Data-adaptive\nbeamformers for PAM have better image quality compared to the time exposure\nacoustics (TEA) algorithms. However, the computational cost of data-adaptive\nbeamformers is considerably expensive. 
In this work, we develop a deep\nbeamformer based on a generative adversarial network, which can switch between\ndifferent transducer arrays and reconstruct high-quality PAM images directly\nfrom radio frequency ultrasound signals with low computational cost. The deep\nbeamformer was trained on the dataset consisting of simulated and experimental\ncavitation signals of single and multiple microbubble clouds measured by\ndifferent (linear and phased) arrays covering 1-15 MHz. We compared the\nperformance of the deep beamformer to TEA and three different data-adaptive\nbeamformers using the simulated and experimental test dataset. Compared with\nTEA, the deep beamformer reduced the energy spread area by 18.9%-65.0% and\nimproved the image signal-to-noise ratio by 9.3-22.9 dB in average for the\ndifferent arrays in our data. Compared to the data-adaptive beamformers, the\ndeep beamformer reduced the computational cost by three orders of magnitude\nachieving 10.5 ms image reconstruction speed in our data, while the image\nquality was as good as that of the data-adaptive beamformers. These results\ndemonstrated the potential of the deep beamformer for high-resolution\nmonitoring of microbubble cavitation activities for ultrasound therapy.\n","authors":["Yi Zeng","Jinwei Li","Hui Zhu","Shukuan Lu","Jianfeng Li","Xiran Cai"],"pdf_url":"https://arxiv.org/pdf/2412.02327v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.08802v3","updated":"2024-12-03T09:39:57Z","published":"2024-02-05T14:20:19Z","title":"Governance of Generative Artificial Intelligence for Companies","summary":" Generative Artificial Intelligence (GenAI), specifically large language\nmodels like ChatGPT, has swiftly entered organizations without adequate\ngovernance, posing both opportunities and risks. Despite extensive debates on\nGenAI's transformative nature and regulatory measures, limited research\naddresses organizational governance, encompassing technical and business\nperspectives. 
Although numerous frameworks for governance of AI exist, it is\nnot clear to what extent they apply to GenAI. Our review paper fills this gap\nby surveying recent works with the purpose of better understanding fundamental\ncharacteristics of GenAI and adjusting prior frameworks specifically towards\nGenAI governance within companies. To do so, it extends Nickerson's framework\ndevelopment processes to include prior conceptualizations. Our framework\noutlines the scope, objectives, and governance mechanisms tailored to harness\nbusiness opportunities as well as mitigate risks associated with GenAI\nintegration. Our research contributes a focused approach to GenAI governance,\noffering practical insights for companies navigating the challenges of GenAI\nadoption and highlighting research gaps.\n","authors":["Johannes Schneider","Pauline Kuss","Rene Abraham","Christian Meske"],"pdf_url":"https://arxiv.org/pdf/2403.08802v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02316v1","updated":"2024-12-03T09:32:02Z","published":"2024-12-03T09:32:02Z","title":"Optimizing Plastic Waste Collection in Water Bodies Using Heterogeneous\n Autonomous Surface Vehicles with Deep Reinforcement Learning","summary":" This paper presents a model-free deep reinforcement learning framework for\ninformative path planning with heterogeneous fleets of autonomous surface\nvehicles to locate and collect plastic waste. The system employs two teams of\nvehicles: scouts and cleaners. Coordination between these teams is achieved\nthrough a deep reinforcement approach, allowing agents to learn strategies to\nmaximize cleaning efficiency. The primary objective is for the scout team to\nprovide an up-to-date contamination model, while the cleaner team collects as\nmuch waste as possible following this model. This strategy leads to\nheterogeneous teams that optimize fleet efficiency through inter-team\ncooperation supported by a tailored reward function. 
Different trainings of the\nproposed algorithm are compared with other state-of-the-art heuristics in two\ndistinct scenarios, one with high convexity and another with narrow corridors\nand challenging access. According to the obtained results, it is demonstrated\nthat deep reinforcement learning based algorithms outperform other benchmark\nheuristics, exhibiting superior adaptability. In addition, training with greedy\nactions further enhances performance, particularly in scenarios with intricate\nlayouts.\n","authors":["Alejandro Mendoza Barrionuevo","Samuel Yanes Luis","Daniel Gutiérrez Reina","Sergio L. Toral Marín"],"pdf_url":"https://arxiv.org/pdf/2412.02316v1.pdf","comment":"This article is currently under revision for the Robotics and\n Automation Letters (IEEE)"},{"id":"http://arxiv.org/abs/2412.02313v1","updated":"2024-12-03T09:30:57Z","published":"2024-12-03T09:30:57Z","title":"Noisy Ostracods: A Fine-Grained, Imbalanced Real-World Dataset for\n Benchmarking Robust Machine Learning and Label Correction Methods","summary":" We present the Noisy Ostracods, a noisy dataset for genus and species\nclassification of crustacean ostracods with specialists' annotations. Over the\n71466 specimens collected, 5.58% of them are estimated to be noisy (possibly\nproblematic) at genus level. The dataset is created to addressing a real-world\nchallenge: creating a clean fine-grained taxonomy dataset. The Noisy Ostracods\ndataset has diverse noises from multiple sources. Firstly, the noise is\nopen-set, including new classes discovered during curation that were not part\nof the original annotation. The dataset has pseudo-classes, where annotators\nmisclassified samples that should belong to an existing class into a new\npseudo-class. The Noisy Ostracods dataset is highly imbalanced with a imbalance\nfactor $\\rho$ = 22429. 
This presents a unique challenge for robust machine\nlearning methods, as existing approaches have not been extensively evaluated on\nfine-grained classification tasks with such diverse real-world noise. Initial\nexperiments using current robust learning techniques have not yielded\nsignificant performance improvements on the Noisy Ostracods dataset compared to\ncross-entropy training on the raw, noisy data. On the other hand, noise\ndetection methods have underperformed in error hit rate compared to naive\ncross-validation ensembling for identifying problematic labels. These findings\nsuggest that the fine-grained, imbalanced nature, and complex noise\ncharacteristics of the dataset present considerable challenges for existing\nnoise-robust algorithms. By openly releasing the Noisy Ostracods dataset, our\ngoal is to encourage further research into the development of noise-resilient\nmachine learning methods capable of effectively handling diverse, real-world\nnoise in fine-grained classification tasks. The dataset, along with its\nevaluation protocols, can be accessed at\nhttps://github.com/H-Jamieu/Noisy_ostracods.\n","authors":["Jiamian Hu","Yuanyuan Hong","Yihua Chen","He Wang","Moriaki Yasuhara"],"pdf_url":"https://arxiv.org/pdf/2412.02313v1.pdf","comment":"Initial submit"},{"id":"http://arxiv.org/abs/2411.18506v2","updated":"2024-12-03T09:25:11Z","published":"2024-11-27T16:48:24Z","title":"LLM-ABBA: Understanding time series via symbolic approximation","summary":" The success of large language models (LLMs) for time series has been\ndemonstrated in previous work. Utilizing a symbolic time series representation,\none can efficiently bridge the gap between LLMs and time series. However, the\nremaining challenge is to exploit the semantic information hidden in time\nseries by using symbols or existing tokens of LLMs, while aligning the\nembedding space of LLMs according to the hidden information of time series. 
The\nsymbolic time series approximation (STSA) method called adaptive Brownian\nbridge-based symbolic aggregation (ABBA) shows outstanding efficacy in\npreserving salient time series features by modeling time series patterns in\nterms of amplitude and period while using existing tokens of LLMs.\n In this paper, we introduce a method, called LLM-ABBA, that integrates ABBA\ninto large language models for various downstream time series tasks. By\nsymbolizing time series, LLM-ABBA compares favorably to the recent\nstate-of-the-art (SOTA) in UCR and three medical time series classification\ntasks. Meanwhile, a fixed-polygonal chain trick in ABBA is introduced to\n\\kc{avoid obvious drifting} during prediction tasks by significantly mitigating\nthe effects of cumulative error arising from misused symbols during the\ntransition from symbols to numerical values. In time series regression tasks,\nLLM-ABBA achieves the new SOTA on Time Series Extrinsic Regression (TSER)\nbenchmarks. LLM-ABBA also shows competitive prediction capability compared to\nrecent SOTA time series prediction results. We believe this framework can also\nseamlessly extend to other time series tasks.\n","authors":["Erin Carson","Xinye Chen","Cheng Kang"],"pdf_url":"https://arxiv.org/pdf/2411.18506v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.16861v2","updated":"2024-12-03T09:17:43Z","published":"2024-05-27T06:26:55Z","title":"BInD: Bond and Interaction-generating Diffusion Model for\n Multi-objective Structure-based Drug Design","summary":" A remarkable advance in geometric deep generative models with accumulated\nstructural data enables structure-based drug design (SBDD) with target protein\ninformation only. However, most existing models struggle to address\nmulti-objectives simultaneously while performing well only in their specialized\ntasks. Here, we present BInD, a diffusion model with knowledge-based guidance\nfor multi-objective SBDD. 
BInD is designed to co-generate molecules and their\ninteractions with a target protein to consider all key objectives equally well,\nincluding target-specific interactions, molecular properties, and local\ngeometry. Comprehensive evaluations show that BInD achieves robust performance\nfor all objectives while outperforming or matching state-of-the-art methods for\neach. Finally, we propose a train-free optimization method empowered by\nretrieving target-specific interactions, highlighting the role of non-covalent\ninteractions in achieving higher selectivity and binding affinities to a target\nprotein.\n","authors":["Joongwon Lee","Wonho Zhung","Jisu Seo","Woo Youn Kim"],"pdf_url":"https://arxiv.org/pdf/2405.16861v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02302v1","updated":"2024-12-03T09:16:13Z","published":"2024-12-03T09:16:13Z","title":"Enhanced Photovoltaic Power Forecasting: An iTransformer and LSTM-Based\n Model Integrating Temporal and Covariate Interactions","summary":" Accurate photovoltaic (PV) power forecasting is critical for integrating\nrenewable energy sources into the grid, optimizing real-time energy management,\nand ensuring energy reliability amidst increasing demand. However, existing\nmodels often struggle with effectively capturing the complex relationships\nbetween target variables and covariates, as well as the interactions between\ntemporal dynamics and multivariate data, leading to suboptimal forecasting\naccuracy. To address these challenges, we propose a novel model architecture\nthat leverages the iTransformer for feature extraction from target variables\nand employs long short-term memory (LSTM) to extract features from covariates.\nA cross-attention mechanism is integrated to fuse the outputs of both models,\nfollowed by a Kolmogorov-Arnold network (KAN) mapping for enhanced\nrepresentation. 
The effectiveness of the proposed model is validated using\npublicly available datasets from Australia, with experiments conducted across\nfour seasons. Results demonstrate that the proposed model effectively capture\nseasonal variations in PV power generation and improve forecasting accuracy.\n","authors":["Guang Wu","Yun Wang","Qian Zhou","Ziyang Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.02302v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02295v1","updated":"2024-12-03T09:09:52Z","published":"2024-12-03T09:09:52Z","title":"CADMR: Cross-Attention and Disentangled Learning for Multimodal\n Recommender Systems","summary":" The increasing availability and diversity of multimodal data in recommender\nsystems offer new avenues for enhancing recommendation accuracy and user\nsatisfaction. However, these systems must contend with high-dimensional, sparse\nuser-item rating matrices, where reconstructing the matrix with only small\nsubsets of preferred items for each user poses a significant challenge. To\naddress this, we propose CADMR, a novel autoencoder-based multimodal\nrecommender system framework. CADMR leverages multi-head cross-attention\nmechanisms and Disentangled Learning to effectively integrate and utilize\nheterogeneous multimodal data in reconstructing the rating matrix. Our approach\nfirst disentangles modality-specific features while preserving their\ninterdependence, thereby learning a joint latent representation. The multi-head\ncross-attention mechanism is then applied to enhance user-item interaction\nrepresentations with respect to the learned multimodal item latent\nrepresentations. 
We evaluate CADMR on three benchmark datasets, demonstrating\nsignificant performance improvements over state-of-the-art methods.\n","authors":["Yasser Khalafaoui","Martino Lovisetto","Basarab Matei","Nistor Grozavu"],"pdf_url":"https://arxiv.org/pdf/2412.02295v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02294v1","updated":"2024-12-03T09:08:38Z","published":"2024-12-03T09:08:38Z","title":"Initial Study On Improving Segmentation By Combining Preoperative CT And\n Intraoperative CBCT Using Synthetic Data","summary":" Computer-Assisted Interventions enable clinicians to perform precise,\nminimally invasive procedures, often relying on advanced imaging methods.\nCone-beam computed tomography (CBCT) can be used to facilitate\ncomputer-assisted interventions, despite often suffering from artifacts that\npose challenges for accurate interpretation. While the degraded image quality\ncan affect image analysis, the availability of high quality, preoperative scans\noffers potential for improvements. Here we consider a setting where\npreoperative CT and intraoperative CBCT scans are available, however, the\nalignment (registration) between the scans is imperfect to simulate a real\nworld scenario. We propose a multimodal learning method that fuses roughly\naligned CBCT and CT scans and investigate the effect on segmentation\nperformance. For this experiment we use synthetically generated data containing\nreal CT and synthetic CBCT volumes with corresponding voxel annotations. We\nshow that this fusion setup improves segmentation performance in $18$ out of\n$20$ investigated setups.\n","authors":["Maximilian E. Tschuchnig","Philipp Steininger","Michael Gadermayr"],"pdf_url":"https://arxiv.org/pdf/2412.02294v1.pdf","comment":"Accepted at BVM 2025. 
arXiv admin note: text overlap with\n arXiv:2406.11650"},{"id":"http://arxiv.org/abs/2412.02292v1","updated":"2024-12-03T09:08:27Z","published":"2024-12-03T09:08:27Z","title":"Deep Matrix Factorization with Adaptive Weights for Multi-View\n Clustering","summary":" Recently, deep matrix factorization has been established as a powerful model\nfor unsupervised tasks, achieving promising results, especially for multi-view\nclustering. However, existing methods often lack effective feature selection\nmechanisms and rely on empirical hyperparameter selection. To address these\nissues, we introduce a novel Deep Matrix Factorization with Adaptive Weights\nfor Multi-View Clustering (DMFAW). Our method simultaneously incorporates\nfeature selection and generates local partitions, enhancing clustering results.\nNotably, the features weights are controlled and adjusted by a parameter that\nis dynamically updated using Control Theory inspired mechanism, which not only\nimproves the model's stability and adaptability to diverse datasets but also\naccelerates convergence. A late fusion approach is then proposed to align the\nweighted local partitions with the consensus partition. Finally, the\noptimization problem is solved via an alternating optimization algorithm with\ntheoretically guaranteed convergence. Extensive experiments on benchmark\ndatasets highlight that DMFAW outperforms state-of-the-art methods in terms of\nclustering performance.\n","authors":["Yasser Khalafaoui","Basarab Matei","Martino Lovisetto","Nistor Grozavu"],"pdf_url":"https://arxiv.org/pdf/2412.02292v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02291v1","updated":"2024-12-03T09:07:31Z","published":"2024-12-03T09:07:31Z","title":"Conformal Symplectic Optimization for Stable Reinforcement Learning","summary":" Training deep reinforcement learning (RL) agents necessitates overcoming the\nhighly unstable nonconvex stochastic optimization inherent in the\ntrial-and-error mechanism. 
To tackle this challenge, we propose a\nphysics-inspired optimization algorithm called relativistic adaptive gradient\ndescent (RAD), which enhances long-term training stability. By conceptualizing\nneural network (NN) training as the evolution of a conformal Hamiltonian\nsystem, we present a universal framework for transferring long-term stability\nfrom conformal symplectic integrators to iterative NN updating rules, where the\nchoice of kinetic energy governs the dynamical properties of resulting\noptimization algorithms. By utilizing relativistic kinetic energy, RAD\nincorporates principles from special relativity and limits parameter updates\nbelow a finite speed, effectively mitigating abnormal gradient influences.\nAdditionally, RAD models NN optimization as the evolution of a multi-particle\nsystem where each trainable parameter acts as an independent particle with an\nindividual adaptive learning rate. We prove RAD's sublinear convergence under\ngeneral nonconvex settings, where smaller gradient variance and larger batch\nsizes contribute to tighter convergence. Notably, RAD degrades to the\nwell-known adaptive moment estimation (ADAM) algorithm when its speed\ncoefficient is chosen as one and symplectic factor as a small positive value.\nExperimental results show RAD outperforming nine baseline optimizers with five\nRL algorithms across twelve environments, including standard benchmarks and\nchallenging scenarios. 
Notably, RAD achieves up to a 155.1% performance\nimprovement over ADAM in Atari games, showcasing its efficacy in stabilizing\nand accelerating RL training.\n","authors":["Yao Lyu","Xiangteng Zhang","Shengbo Eben Li","Jingliang Duan","Letian Tao","Qing Xu","Lei He","Keqiang Li"],"pdf_url":"https://arxiv.org/pdf/2412.02291v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02289v1","updated":"2024-12-03T09:06:57Z","published":"2024-12-03T09:06:57Z","title":"Learn More by Using Less: Distributed Learning with Energy-Constrained\n Devices","summary":" Federated Learning (FL) has emerged as a solution for distributed model\ntraining across decentralized, privacy-preserving devices, but the different\nenergy capacities of participating devices (system heterogeneity) constrain\nreal-world implementations. These energy limitations not only reduce model\naccuracy but also increase dropout rates, impacting on convergence in practical\nFL deployments. In this work, we propose LeanFed, an energy-aware FL framework\ndesigned to optimize client selection and training workloads on\nbattery-constrained devices. LeanFed leverages adaptive data usage by\ndynamically adjusting the fraction of local data each device utilizes during\ntraining, thereby maximizing device participation across communication rounds\nwhile ensuring they do not run out of battery during the process. We rigorously\nevaluate LeanFed against traditional FedAvg on CIFAR-10 and CIFAR-100 datasets,\nsimulating various levels of data heterogeneity and device participation rates.\nResults show that LeanFed consistently enhances model accuracy and stability,\nparticularly in settings with high data heterogeneity and limited battery life,\nby mitigating client dropout and extending device availability. 
This approach\ndemonstrates the potential of energy-efficient, privacy-preserving FL in\nreal-world, large-scale applications, setting a foundation for robust and\nsustainable pervasive AI on resource-constrained networks.\n","authors":["Roberto Pereira","Cristian J. Vaca-Rubio","Luis Blanco"],"pdf_url":"https://arxiv.org/pdf/2412.02289v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.19146v2","updated":"2024-12-03T09:06:33Z","published":"2024-11-28T13:45:42Z","title":"Puzzle: Distillation-Based NAS for Inference-Optimized LLMs","summary":" Large language models (LLMs) have demonstrated remarkable capabilities, but\ntheir adoption is limited by high computational costs during inference. While\nincreasing parameter counts enhances accuracy, it also widens the gap between\nstate-of-the-art capabilities and practical deployability. We present Puzzle, a\nframework to accelerate LLM inference on specific hardware while preserving\ntheir capabilities. Through an innovative application of neural architecture\nsearch (NAS) at an unprecedented scale, Puzzle systematically optimizes models\nwith tens of billions of parameters under hardware constraints. Our approach\nutilizes blockwise local knowledge distillation (BLD) for parallel architecture\nexploration and employs mixed-integer programming for precise constraint\noptimization.\n We demonstrate the real-world impact of our framework through\nLlama-3.1-Nemotron-51B-Instruct (Nemotron-51B), a publicly available model\nderived from Llama-3.1-70B-Instruct. Nemotron-51B achieves a 2.17x inference\nthroughput speedup, fitting on a single NVIDIA H100 GPU while preserving 98.4%\nof the original model's capabilities. Nemotron-51B currently stands as the most\naccurate language model capable of inference on a single GPU with large batch\nsizes. Remarkably, this transformation required just 45B training tokens,\ncompared to over 15T tokens used for the 70B model it was derived from. 
This\nestablishes a new paradigm where powerful models can be optimized for efficient\ndeployment with only negligible compromise of their capabilities, demonstrating\nthat inference performance, not parameter count alone, should guide model\nselection. With the release of Nemotron-51B and the presentation of the Puzzle\nframework, we provide practitioners immediate access to state-of-the-art\nlanguage modeling capabilities at significantly reduced computational costs.\n","authors":["Akhiad Bercovich","Tomer Ronen","Talor Abramovich","Nir Ailon","Nave Assaf","Mohammad Dabbah","Ido Galil","Amnon Geifman","Yonatan Geifman","Izhak Golan","Netanel Haber","Ehud Karpas","Roi Koren","Itay Levy","Pavlo Molchanov","Shahar Mor","Zach Moshe","Najeeb Nabwani","Omri Puny","Ran Rubin","Itamar Schen","Ido Shahaf","Oren Tropp","Omer Ullman Argov","Ran Zilberstein","Ran El-Yaniv"],"pdf_url":"https://arxiv.org/pdf/2411.19146v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11465v2","updated":"2024-12-03T09:04:35Z","published":"2024-11-18T10:58:46Z","title":"Re-examining learning linear functions in context","summary":" In context learning (ICL) is an attractive method of solving a wide range of\nproblems. Inspired by Garg et al. (2022), we look closely at ICL in a variety\nof train and test settings for several transformer models of different sizes\ntrained from scratch. Our study complements prior work by pointing out several\nsystematic failures of these models to generalize to data not in the training\ndistribution, thereby showing some limitations of ICL. 
We find that models\nadopt a strategy for this task that is very different from standard solutions.\n","authors":["Omar Naim","Guilhem Fouilhé","Nicholas Asher"],"pdf_url":"https://arxiv.org/pdf/2411.11465v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02285v1","updated":"2024-12-03T09:03:04Z","published":"2024-12-03T09:03:04Z","title":"GQWformer: A Quantum-based Transformer for Graph Representation Learning","summary":" Graph Transformers (GTs) have demonstrated significant advantages in graph\nrepresentation learning through their global attention mechanisms. However, the\nself-attention mechanism in GTs tends to neglect the inductive biases inherent\nin graph structures, making it chanllenging to effectively capture essential\nstructural information. To address this issue, we propose a novel approach that\nintegrate graph inductive bias into self-attention mechanisms by leveraging\nquantum technology for structural encoding. In this paper, we introduce the\nGraph Quantum Walk Transformer (GQWformer), a groundbreaking GNN framework that\nutilizes quantum walks on attributed graphs to generate node quantum states.\nThese quantum states encapsulate rich structural attributes and serve as\ninductive biases for the transformer, thereby enabling the generation of more\nmeaningful attention scores. By subsequently incorporating a recurrent neural\nnetwork, our design amplifies the model's ability to focus on both local and\nglobal information. We conducted comprehensive experiments across five publicly\navailable datasets to evaluate the effectiveness of our model. These results\nclearly indicate that GQWformer outperforms existing state-of-the-art graph\nclassification algorithms. 
These findings highlight the significant potential\nof integrating quantum computing methodologies with traditional GNNs to advance\nthe field of graph representation learning, providing a promising direction for\nfuture research and applications.\n","authors":["Lei Yu","Hongyang Chen","Jingsong Lv","Linyao Yang"],"pdf_url":"https://arxiv.org/pdf/2412.02285v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.05099v6","updated":"2024-12-03T08:58:22Z","published":"2023-04-11T09:51:13Z","title":"Feudal Graph Reinforcement Learning","summary":" Graph-based representations and message-passing modular policies constitute\nprominent approaches to tackling composable control problems in reinforcement\nlearning (RL). However, as shown by recent graph deep learning literature, such\nlocal message-passing operators can create information bottlenecks and hinder\nglobal coordination. The issue becomes more serious in tasks requiring\nhigh-level planning. In this work, we propose a novel methodology, named Feudal\nGraph Reinforcement Learning (FGRL), that addresses such challenges by relying\non hierarchical RL and a pyramidal message-passing architecture. In particular,\nFGRL defines a hierarchy of policies where high-level commands are propagated\nfrom the top of the hierarchy down through a layered graph structure. The\nbottom layers mimic the morphology of the physical system, while the upper\nlayers correspond to higher-order sub-modules. The resulting agents are then\ncharacterized by a committee of policies where actions at a certain level set\ngoals for the level below, thus implementing a hierarchical decision-making\nstructure that can naturally implement task decomposition. We evaluate the\nproposed framework on a graph clustering problem and MuJoCo locomotion tasks;\nsimulation results show that FGRL compares favorably against relevant\nbaselines. 
Furthermore, an in-depth analysis of the command propagation\nmechanism provides evidence that the introduced message-passing scheme favors\nlearning hierarchical decision-making policies.\n","authors":["Tommaso Marzi","Arshjot Khehra","Andrea Cini","Cesare Alippi"],"pdf_url":"https://arxiv.org/pdf/2304.05099v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03848v3","updated":"2024-12-03T08:54:30Z","published":"2024-06-06T08:29:29Z","title":"OceanCastNet: A Deep Learning Ocean Wave Model with Energy Conservation","summary":" Traditional wave forecasting models, although based on energy conservation\nequations, are computationally expensive. On the other hand, existing deep\nlearning geophysical fluid models, while computationally efficient, often\nsuffer from issues such as energy dissipation in long-term forecasts. This\npaper proposes a novel energy-balanced deep learning wave forecasting model\ncalled OceanCastNet (OCN). By incorporating wind fields at the current,\nprevious, and future time steps, as well as wave fields at the current and\nprevious time steps as input variables, OCN maintains energy balance within the\nmodel. Furthermore, the model employs adaptive Fourier operators as its core\ncomponents and designs a masked loss function to better handle the impact of\nland-sea boundaries. A series of experiments on the ERA5 dataset demonstrate\nthat OCN can achieve short-term forecast accuracy comparable to traditional\nmodels while exhibiting an understanding of the wave generation process. 
In\ncomparative experiments under both normal and extreme conditions, OCN\nconsistently outperforms the widely used WaveWatch III model in the industry.\nEven after long-term forecasting, OCN maintains a stable and energy-rich state.\nBy further constructing a simple meteorological model, OCN-wind, which\nconsiders energy balance, this paper confirms the importance of energy\nconstraints for improving the long-term forecast performance of deep learning\nmeteorological models. This finding provides new ideas for future research on\ndeep learning geophysical fluid models.\n","authors":["Ziliang Zhang","Huaming Yu","Danqin Ren"],"pdf_url":"https://arxiv.org/pdf/2406.03848v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.01585v2","updated":"2024-12-03T08:54:27Z","published":"2024-12-02T15:04:51Z","title":"FairML: A Julia Package for Fair Classification","summary":" In this paper, we propose FairML.jl, a Julia package providing a framework\nfor fair classification in machine learning. In this framework, the fair\nlearning process is divided into three stages. Each stage aims to reduce\nunfairness, such as disparate impact and disparate mistreatment, in the final\nprediction. For the preprocessing stage, we present a resampling method that\naddresses unfairness coming from data imbalances. The in-processing phase\nconsist of a classification method. This can be either one coming from the\nMLJ.jl package, or a user defined one. For this phase, we incorporate fair ML\nmethods that can handle unfairness to a certain degree through their\noptimization process. In the post-processing, we discuss the choice of the\ncut-off value for fair prediction. 
With simulations, we show the performance of\nthe single phases and their combinations.\n","authors":["Jan Pablo Burgard","João Vitor Pamplona"],"pdf_url":"https://arxiv.org/pdf/2412.01585v2.pdf","comment":"25 pages, 8 figures"},{"id":"http://arxiv.org/abs/2410.10929v5","updated":"2024-12-03T08:48:21Z","published":"2024-10-14T16:35:27Z","title":"ASTM :Autonomous Smart Traffic Management System Using Artificial\n Intelligence CNN and LSTM","summary":" In the modern world, the development of Artificial Intelligence (AI) has\ncontributed to improvements in various areas, including automation, computer\nvision, fraud detection, and more. AI can be leveraged to enhance the\nefficiency of Autonomous Smart Traffic Management (ASTM) systems and reduce\ntraffic congestion rates. This paper presents an Autonomous Smart Traffic\nManagement (STM) system that uses AI to improve traffic flow rates. The system\nemploys the YOLO V5 Convolutional Neural Network to detect vehicles in traffic\nmanagement images. Additionally, it predicts the number of vehicles for the\nnext 12 hours using a Recurrent Neural Network with Long Short-Term Memory\n(RNN-LSTM). The Smart Traffic Management Cycle Length Analysis manages the\ntraffic cycle length based on these vehicle predictions, aided by AI. From the\nresults of the RNN-LSTM model for predicting vehicle numbers over the next 12\nhours, we observe that the model predicts traffic with a Mean Squared Error\n(MSE) of 4.521 vehicles and a Root Mean Squared Error (RMSE) of 2.232 vehicles.\nAfter simulating the STM system in the CARLA simulation environment, we found\nthat the Traffic Management Congestion Flow Rate with ASTM (21 vehicles per\nminute) is 50\\% higher than the rate without STM (around 15 vehicles per\nminute). Additionally, the Traffic Management Vehicle Pass Delay with STM (5\nseconds per vehicle) is 70\\% lower than without STM (around 12 seconds per\nvehicle). 
These results demonstrate that the STM system using AI can increase\ntraffic flow by 50\\% and reduce vehicle pass delays by 70\\%.\n","authors":["Christofel Rio Goenawan"],"pdf_url":"https://arxiv.org/pdf/2410.10929v5.pdf","comment":"In process to IEEE Intelligent Vehicle Symposium 2025"},{"id":"http://arxiv.org/abs/2411.09545v2","updated":"2024-12-03T08:48:06Z","published":"2024-11-14T15:59:41Z","title":"Equation-informed data-driven identification of flow budgets and\n dynamics","summary":" Computational Fluid Dynamics (CFD) is an indispensable method of fluid\nmodelling in engineering applications, reducing the need for physical\nprototypes and testing for tasks such as design optimisation and performance\nanalysis. Depending on the complexity of the system under consideration, models\nranging from low to high fidelity can be used for prediction, allowing\nsignificant speed-up. However, the choice of model requires information about\nthe actual dynamics of the flow regime. Correctly identifying the\nregions/clusters of flow that share the same dynamics has been a challenging\nresearch topic to date. In this study, we propose a novel hybrid approach to\nflow clustering. It consists of characterising each sample point of the system\nwith equation-based features, i.e. features are budgets that represent the\ncontribution of each term from the original governing equation to the local\ndynamics at each sample point. This was achieved by applying the Sparse\nIdentification of Nonlinear Dynamical systems (SINDy) method pointwise to time\nevolution data. The method proceeds with equation-based clustering using the\nGirvan-Newman algorithm. This allows the detection of communities that share\nthe same physical dynamics. The algorithm is implemented in both Eulerian and\nLagrangian frameworks. In the Lagrangian, i.e. dynamic approach, the clustering\nis performed on the trajectory of each point, allowing the change of clusters\nto be represented also in time. 
The performance of the algorithm is first\ntested on a flow around a cylinder. The construction of the dynamic clusters in\nthis test case clearly shows the evolution of the wake from the steady state\nsolution through the transient to the oscillatory solution. Dynamic clustering\nwas then successfully tested on turbulent flow data. Two distinct and\nwell-defined clusters were identified and their temporal evolution was\nreconstructed.\n","authors":["Nataliya Sevryugina","Serena Costanzo","Stephen de Bruyn Kops","Colm-cille Caulfield","Iraj Mortazavi","Taraneh Sayadi"],"pdf_url":"https://arxiv.org/pdf/2411.09545v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02273v1","updated":"2024-12-03T08:45:50Z","published":"2024-12-03T08:45:50Z","title":"Step-by-Step Guidance to Differential Anemia Diagnosis with Real-World\n Data and Deep Reinforcement Learning","summary":" Clinical diagnostic guidelines outline the key questions to answer to reach a\ndiagnosis. Inspired by guidelines, we aim to develop a model that learns from\nelectronic health records to determine the optimal sequence of actions for\naccurate diagnosis. Focusing on anemia and its sub-types, we employ deep\nreinforcement learning (DRL) algorithms and evaluate their performance on both\na synthetic dataset, which is based on expert-defined diagnostic pathways, and\na real-world dataset. We investigate the performance of these algorithms across\nvarious scenarios. 
Our experimental results demonstrate that DRL algorithms\nperform competitively with state-of-the-art methods while offering the\nsignificant advantage of progressively generating pathways to the suggested\ndiagnosis, providing a transparent decision-making process that can guide and\nexplain diagnostic reasoning.\n","authors":["Lillian Muyama","Estelle Lu","Geoffrey Cheminet","Jacques Pouchot","Bastien Rance","Anne-Isabelle Tropeano","Antoine Neuraz","Adrien Coulet"],"pdf_url":"https://arxiv.org/pdf/2412.02273v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2404.05913"},{"id":"http://arxiv.org/abs/2405.16158v3","updated":"2024-12-03T08:42:49Z","published":"2024-05-25T09:53:25Z","title":"Bigger, Regularized, Optimistic: scaling for compute and\n sample-efficient continuous control","summary":" Sample efficiency in Reinforcement Learning (RL) has traditionally been\ndriven by algorithmic enhancements. In this work, we demonstrate that scaling\ncan also lead to substantial improvements. We conduct a thorough investigation\ninto the interplay of scaling model capacity and domain-specific RL\nenhancements. These empirical findings inform the design choices underlying our\nproposed BRO (Bigger, Regularized, Optimistic) algorithm. The key innovation\nbehind BRO is that strong regularization allows for effective scaling of the\ncritic networks, which, paired with optimistic exploration, leads to superior\nperformance. BRO achieves state-of-the-art results, significantly outperforming\nthe leading model-based and model-free algorithms across 40 complex tasks from\nthe DeepMind Control, MetaWorld, and MyoSuite benchmarks. 
BRO is the first\nmodel-free algorithm to achieve near-optimal policies in the notoriously\nchallenging Dog and Humanoid tasks.\n","authors":["Michal Nauman","Mateusz Ostaszewski","Krzysztof Jankowski","Piotr Miłoś","Marek Cygan"],"pdf_url":"https://arxiv.org/pdf/2405.16158v3.pdf","comment":"NeurIPS 2024 Spotlight"},{"id":"http://arxiv.org/abs/2412.01566v2","updated":"2024-12-03T08:42:37Z","published":"2024-12-02T14:51:21Z","title":"Multi-objective Deep Learning: Taxonomy and Survey of the State of the\n Art","summary":" Simultaneously considering multiple objectives in machine learning has been a\npopular approach for several decades, with various benefits for multi-task\nlearning, the consideration of secondary goals such as sparsity, or\nmulticriteria hyperparameter tuning. However - as multi-objective optimization\nis significantly more costly than single-objective optimization - the recent\nfocus on deep learning architectures poses considerable additional challenges\ndue to the very large number of parameters, strong nonlinearities and\nstochasticity. This survey covers recent advancements in the area of\nmulti-objective deep learning. We introduce a taxonomy of existing methods -\nbased on the type of training algorithm as well as the decision maker's needs -\nbefore listing recent advancements, and also successful applications. All three\nmain learning paradigms supervised learning, unsupervised learning and\nreinforcement learning are covered, and we also address the recently very\npopular area of generative modeling.\n","authors":["Sebastian Peitz","Sedjro Salomon Hotegni"],"pdf_url":"https://arxiv.org/pdf/2412.01566v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02266v1","updated":"2024-12-03T08:38:30Z","published":"2024-12-03T08:38:30Z","title":"BOTracle: A framework for Discriminating Bots and Humans","summary":" Bots constitute a significant portion of Internet traffic and are a source of\nvarious issues across multiple domains. 
Modern bots often become\nindistinguishable from real users, as they employ similar methods to browse the\nweb, including using real browsers. We address the challenge of bot detection\nin high-traffic scenarios by analyzing three distinct detection methods. The\nfirst method operates on heuristics, allowing for rapid detection. The second\nmethod utilizes, well known, technical features, such as IP address, window\nsize, and user agent. It serves primarily for comparison with the third method.\nIn the third method, we rely solely on browsing behavior, omitting all static\nfeatures and focusing exclusively on how clients behave on a website. In\ncontrast to related work, we evaluate our approaches using real-world\ne-commerce traffic data, comprising 40 million monthly page visits. We further\ncompare our methods against another bot detection approach, Botcha, on the same\ndataset. Our performance metrics, including precision, recall, and AUC, reach\n98 percent or higher, surpassing Botcha.\n","authors":["Jan Kadel","August See","Ritwik Sinha","Mathias Fischer"],"pdf_url":"https://arxiv.org/pdf/2412.02266v1.pdf","comment":"Bot Detection; User Behaviour Analysis; Published at ESORICS\n International Workshops 2024"},{"id":"http://arxiv.org/abs/2412.02265v1","updated":"2024-12-03T08:37:28Z","published":"2024-12-03T08:37:28Z","title":"Diabetic Retinopathy Classification from Retinal Images using Machine\n Learning Approaches","summary":" Diabetic Retinopathy is one of the most familiar diseases and is a diabetes\ncomplication that affects eyes. Initially, diabetic retinopathy may cause no\nsymptoms or only mild vision problems. Eventually, it can cause blindness. So\nearly detection of symptoms could help to avoid blindness. In this paper, we\npresent some experiments on some features of diabetic retinopathy, like\nproperties of exudates, properties of blood vessels and properties of\nmicroaneurysm. 
Using the features, we can classify healthy, mild\nnon-proliferative, moderate non-proliferative, severe non-proliferative and\nproliferative stages of DR. Support Vector Machine, Random Forest and Naive\nBayes classifiers are used to classify the stages. Finally, Random Forest is\nfound to be the best for higher accuracy, sensitivity and specificity of 76.5%,\n77.2% and 93.3% respectively.\n","authors":["Indronil Bhattacharjee"," Al-Mahmud","Tareq Mahmud"],"pdf_url":"https://arxiv.org/pdf/2412.02265v1.pdf","comment":"5 pages, 9 figures, 2 tables. International Conference on Advanced\n Engineering, Technology and Applications (ICAETA-2021), Istanbul, Turkey"},{"id":"http://arxiv.org/abs/2412.02264v1","updated":"2024-12-03T08:37:27Z","published":"2024-12-03T08:37:27Z","title":"Technical Report on Reinforcement Learning Control on the Lucas-Nülle\n Inverted Pendulum","summary":" The discipline of automatic control is making increased use of concepts that\noriginate from the domain of machine learning. Herein, reinforcement learning\n(RL) takes an elevated role, as it is inherently designed for sequential\ndecision making, and can be applied to optimal control problems without the\nneed for a plant system model. To advance education of control engineers and\noperators in this field, this contribution targets an RL framework that can be\napplied to educational hardware provided by the Lucas-N\\\"ulle company.\nSpecifically, the goal of inverted pendulum control is pursued by means of RL,\nincluding both, swing-up and stabilization within a single holistic design\napproach. Herein, the actual learning is enabled by separating corresponding\ncomputations from the real-time control computer and outsourcing them to a\ndifferent hardware. This distributed architecture, however, necessitates\ncommunication of the involved components, which is realized via CAN bus. 
The\nexperimental proof of concept is presented with an applied safeguarding\nalgorithm that prevents the plant from being operated harmfully during the\ntrial-and-error training phase.\n","authors":["Maximilian Schenke","Shalbus Bukarov"],"pdf_url":"https://arxiv.org/pdf/2412.02264v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02262v1","updated":"2024-12-03T08:34:42Z","published":"2024-12-03T08:34:42Z","title":"Composing Open-domain Vision with RAG for Ocean Monitoring and\n Conservation","summary":" Climate change's destruction of marine biodiversity is threatening\ncommunities and economies around the world which rely on healthy oceans for\ntheir livelihoods. The challenge of applying computer vision to niche,\nreal-world domains such as ocean conservation lies in the dynamic and diverse\nenvironments where traditional top-down learning struggle with long-tailed\ndistributions, generalization, and domain transfer. Scalable species\nidentification for ocean monitoring is particularly difficult due to the need\nto adapt models to new environments and identify rare or unseen species. To\novercome these limitations, we propose leveraging bottom-up, open-domain\nlearning frameworks as a resilient, scalable solution for image and video\nanalysis in marine applications. Our preliminary demonstration uses pretrained\nvision-language models (VLMs) combined with retrieval-augmented generation\n(RAG) as grounding, leaving the door open for numerous architectural, training\nand engineering optimizations. We validate this approach through a preliminary\napplication in classifying fish from video onboard fishing vessels,\ndemonstrating impressive emergent retrieval and prediction capabilities without\ndomain-specific training or knowledge of the task itself.\n","authors":["Sepand Dyanatkar","Angran Li","Alexander Dungate"],"pdf_url":"https://arxiv.org/pdf/2412.02262v1.pdf","comment":"Accepted to Climate Change AI Workshop at NeurIPS 2024. 
9 pages, 6\n figures, 1 table"},{"id":"http://arxiv.org/abs/2410.13637v2","updated":"2024-12-03T08:29:54Z","published":"2024-10-17T15:07:56Z","title":"Normalizing self-supervised learning for provably reliable Change Point\n Detection","summary":" Change point detection (CPD) methods aim to identify abrupt shifts in the\ndistribution of input data streams. Accurate estimators for this task are\ncrucial across various real-world scenarios. Yet, traditional unsupervised CPD\ntechniques face significant limitations, often relying on strong assumptions or\nsuffering from low expressive power due to inherent model simplicity. In\ncontrast, representation learning methods overcome these drawbacks by offering\nflexibility and the ability to capture the full complexity of the data without\nimposing restrictive assumptions. However, these approaches are still emerging\nin the CPD field and lack robust theoretical foundations to ensure their\nreliability. Our work addresses this gap by integrating the expressive power of\nrepresentation learning with the groundedness of traditional CPD techniques. We\nadopt spectral normalization (SN) for deep representation learning in CPD tasks\nand prove that the embeddings after SN are highly informative for CPD. Our\nmethod significantly outperforms current state-of-the-art methods during the\ncomprehensive evaluation via three standard CPD datasets.\n","authors":["Alexandra Bazarova","Evgenia Romanenkova","Alexey Zaytsev"],"pdf_url":"https://arxiv.org/pdf/2410.13637v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02251v1","updated":"2024-12-03T08:28:47Z","published":"2024-12-03T08:28:47Z","title":"Selective Reviews of Bandit Problems in AI via a Statistical View","summary":" Reinforcement Learning (RL) is a widely researched area in artificial\nintelligence that focuses on teaching agents decision-making through\ninteractions with their environment. 
A key subset includes stochastic\nmulti-armed bandit (MAB) and continuum-armed bandit (SCAB) problems, which\nmodel sequential decision-making under uncertainty. This review outlines the\nfoundational models and assumptions of bandit problems, explores non-asymptotic\ntheoretical tools like concentration inequalities and minimax regret bounds,\nand compares frequentist and Bayesian algorithms for managing\nexploration-exploitation trade-offs. We also extend the discussion to $K$-armed\ncontextual bandits and SCAB, examining their methodologies, regret analyses,\nand discussing the relation between the SCAB problems and the functional data\nanalysis. Finally, we highlight recent advances and ongoing challenges in the\nfield.\n","authors":["Pengjie Zhou","Haoyu Wei","Huiming Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.02251v1.pdf","comment":"46 pages, 5 figures,"},{"id":"http://arxiv.org/abs/2406.07522v2","updated":"2024-12-03T08:27:49Z","published":"2024-06-11T17:50:51Z","title":"Samba: Simple Hybrid State Space Models for Efficient Unlimited Context\n Language Modeling","summary":" Efficiently modeling sequences with infinite context length has long been a\nchallenging problem. Previous approaches have either suffered from quadratic\ncomputational complexity or limited extrapolation ability in length\ngeneralization. In this work, we present Samba, a simple hybrid architecture\nthat layer-wise combines Mamba, a selective State Space Model (SSM), with\nSliding Window Attention (SWA). Samba selectively compresses a given sequence\ninto recurrent hidden states while still maintaining the ability to precisely\nrecall recent memories with the attention mechanism. We scale Samba up to 3.8B\nparameters with 3.2T training tokens and demonstrate that it significantly\noutperforms state-of-the-art models across a variety of benchmarks. Pretrained\non sequences of 4K length, Samba shows improved perplexity in context lengths\nof up to 1M in zero-shot. 
When finetuned on 4K-length sequences, Samba\nefficiently extrapolates to a 256K context length with perfect memory recall on\nthe Passkey Retrieval task, and exhibits superior retrieval extrapolation on\nthe challenging Phonebook task compared to full-attention models. As a\nlinear-time sequence model, Samba achieves a 3.73x higher throughput compared\nto Transformers with grouped-query attention for user prompts of 128K length,\nand a 3.64x speedup when generating 64K tokens with unlimited streaming. Our\ncode for training on open source data is publicly available at\nhttps://github.com/microsoft/Samba.\n","authors":["Liliang Ren","Yang Liu","Yadong Lu","Yelong Shen","Chen Liang","Weizhu Chen"],"pdf_url":"https://arxiv.org/pdf/2406.07522v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.16469v2","updated":"2024-12-03T08:18:17Z","published":"2024-03-25T06:50:25Z","title":"Learning from Reduced Labels for Long-Tailed Data","summary":" Long-tailed data is prevalent in real-world classification tasks and heavily\nrelies on supervised information, which makes the annotation process\nexceptionally labor-intensive and time-consuming. Unfortunately, despite being\na common approach to mitigate labeling costs, existing weakly supervised\nlearning methods struggle to adequately preserve supervised information for\ntail samples, resulting in a decline in accuracy for the tail classes. To\nalleviate this problem, we introduce a novel weakly supervised labeling setting\ncalled Reduced Label. The proposed labeling setting not only avoids the decline\nof supervised information for the tail samples, but also decreases the labeling\ncosts associated with long-tailed data. Additionally, we propose an\nstraightforward and highly efficient unbiased framework with strong theoretical\nguarantees to learn from these Reduced Labels. 
Extensive experiments conducted\non benchmark datasets including ImageNet validate the effectiveness of our\napproach, surpassing the performance of state-of-the-art weakly supervised\nmethods.\n","authors":["Meng Wei","Zhongnian Li","Yong Zhou","Xinzheng Xu"],"pdf_url":"https://arxiv.org/pdf/2403.16469v2.pdf","comment":"11 pages, 3 figures"},{"id":"http://arxiv.org/abs/2412.02244v1","updated":"2024-12-03T08:16:59Z","published":"2024-12-03T08:16:59Z","title":"On Simplifying Large-Scale Spatial Vectors: Fast, Memory-Efficient, and\n Cost-Predictable k-means","summary":" The k-means algorithm can simplify large-scale spatial vectors, such as 2D\ngeo-locations and 3D point clouds, to support fast analytics and learning.\nHowever, when processing large-scale datasets, existing k-means algorithms have\nbeen developed to achieve high performance with significant computational\nresources, such as memory and CPU usage time. These algorithms, though\neffective, are not well-suited for resource-constrained devices. In this paper,\nwe propose a fast, memory-efficient, and cost-predictable k-means called\nDask-means. We first accelerate k-means by designing a memory-efficient\naccelerator, which utilizes an optimized nearest neighbor search over a\nmemory-tunable index to assign spatial vectors to clusters in batches. We then\ndesign a lightweight cost estimator to predict the memory cost and runtime of\nthe k-means task, allowing it to request appropriate memory from devices or\nadjust the accelerator's required space to meet memory constraints, and ensure\nsufficient CPU time for running k-means. Experiments show that when simplifying\ndatasets with scale such as $10^6$, Dask-means uses less than $30$MB of memory,\nachieves over $168$ times speedup compared to the widely-used Lloyd's\nalgorithm. We also validate Dask-means on mobile devices, where it demonstrates\nsignificant speedup and low memory cost compared to other state-of-the-art\n(SOTA) k-means algorithms. 
Our cost estimator estimates the memory cost with a\ndifference of less than $3\\%$ from the actual ones and predicts runtime with an\nMSE up to $33.3\\%$ lower than SOTA methods.\n","authors":["Yushuai Ji","Zepeng Liu","Sheng Wang","Yuan Sun","Zhiyong Peng"],"pdf_url":"https://arxiv.org/pdf/2412.02244v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02242v1","updated":"2024-12-03T08:11:06Z","published":"2024-12-03T08:11:06Z","title":"U-Net in Medical Image Segmentation: A Review of Its Applications Across\n Modalities","summary":" Medical imaging is essential in healthcare to provide key insights into\npatient anatomy and pathology, aiding in diagnosis and treatment. Non-invasive\ntechniques such as X-ray, Magnetic Resonance Imaging (MRI), Computed Tomography\n(CT), and Ultrasound (US), capture detailed images of organs, tissues, and\nabnormalities. Effective analysis of these images requires precise segmentation\nto delineate regions of interest (ROI), such as organs or lesions. Traditional\nsegmentation methods, relying on manual feature-extraction, are labor-intensive\nand vary across experts. Recent advancements in Artificial Intelligence (AI)\nand Deep Learning (DL), particularly convolutional models such as U-Net and its\nvariants (U-Net++ and U-Net 3+), have transformed medical image segmentation\n(MIS) by automating the process and enhancing accuracy. These models enable\nefficient, precise pixel-wise classification across various imaging modalities,\novercoming the limitations of manual segmentation. This review explores various\nmedical imaging techniques, examines the U-Net architectures and their\nadaptations, and discusses their application across different modalities. 
It\nalso identifies common challenges in MIS and proposes potential solutions.\n","authors":["Fnu Neha","Deepshikha Bhati","Deepak Kumar Shukla","Sonavi Makarand Dalvi","Nikolaos Mantzou","Safa Shubbar"],"pdf_url":"https://arxiv.org/pdf/2412.02242v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02240v1","updated":"2024-12-03T08:09:06Z","published":"2024-12-03T08:09:06Z","title":"ESA: Example Sieve Approach for Multi-Positive and Unlabeled Learning","summary":" Learning from Multi-Positive and Unlabeled (MPU) data has gradually attracted\nsignificant attention from practical applications. Unfortunately, the risk of\nMPU also suffer from the shift of minimum risk, particularly when the models\nare very flexible as shown in Fig.\\ref{moti}. In this paper, to alleviate the\nshifting of minimum risk problem, we propose an Example Sieve Approach (ESA) to\nselect examples for training a multi-class classifier. Specifically, we sieve\nout some examples by utilizing the Certain Loss (CL) value of each example in\nthe training stage and analyze the consistency of the proposed risk estimator.\nBesides, we show that the estimation error of proposed ESA obtains the optimal\nparametric convergence rate. Extensive experiments on various real-world\ndatasets show the proposed approach outperforms previous methods.\n","authors":["Zhongnian Li","Meng Wei","Peng Ying","Xinzheng Xu"],"pdf_url":"https://arxiv.org/pdf/2412.02240v1.pdf","comment":"12 pages, 6 figures"},{"id":"http://arxiv.org/abs/2406.14026v3","updated":"2024-12-03T08:03:25Z","published":"2024-06-20T06:46:23Z","title":"Demystifying Language Model Forgetting with Low-rank Example\n Associations","summary":" Large Language models (LLMs) suffer from forgetting of upstream data when\nfine-tuned. Despite efforts on mitigating forgetting, few have investigated\nwhether, and how forgotten upstream examples are dependent on and associated\nwith newly learned tasks. 
Insights on such associations enable efficient and\ntargeted mitigation of forgetting. In this paper, we empirically analyze\nforgetting (measured in log-perplexity increase) that occurs in $N$ upstream\nexamples of language modeling or instruction-tuning after fine-tuning LLMs on\none of $M$ new tasks, visualized in $M\\times N$ matrices. We demonstrate that\nthe matrices display simple low-rank patterns, often well-approximated with\nmultiplicative scalar effects of upstream examples and newly learned tasks. We\nalso examine fine-grained associations with visualization and statistics.\nLeveraging the low-rank nature of the associations, we predict forgetting of\nupstream examples when fine-tuning on unseen tasks with matrix completion over\nthe empirical associations. This enables fast identification of most forgotten\nexamples without expensive inference on the entire upstream data. The approach,\ndespite simplicity, outperforms prior approaches that learn semantic\nrelationships of learned tasks and upstream examples with LMs for predicting\nforgetting. We demonstrate the practical utility of our analysis by showing\nstatistically significantly reduced forgetting as we upweight predicted\nexamples for replay at fine-tuning. Project page:\nhttps://inklab.usc.edu/lm-forgetting-prediction/\n","authors":["Xisen Jin","Xiang Ren"],"pdf_url":"https://arxiv.org/pdf/2406.14026v3.pdf","comment":"10 pages; preprint"},{"id":"http://arxiv.org/abs/2412.02230v1","updated":"2024-12-03T08:00:19Z","published":"2024-12-03T08:00:19Z","title":"Learning from Concealed Labels","summary":" Annotating data for sensitive labels (e.g., disease, smoking) poses a\npotential threats to individual privacy in many real-world scenarios. 
To cope\nwith this problem, we propose a novel setting to protect privacy of each\ninstance, namely learning from concealed labels for multi-class classification.\nConcealed labels prevent sensitive labels from appearing in the label set\nduring the label collection stage, which specifies none and some random sampled\ninsensitive labels as concealed labels set to annotate sensitive data. In this\npaper, an unbiased estimator can be established from concealed data under mild\nassumptions, and the learned multi-class classifier can not only classify the\ninstance from insensitive labels accurately but also recognize the instance\nfrom the sensitive labels. Moreover, we bound the estimation error and show\nthat the multi-class classifier achieves the optimal parametric convergence\nrate. Experiments demonstrate the significance and effectiveness of the\nproposed method for concealed labels in synthetic and real-world datasets.\n","authors":["Zhongnian Li","Meng Wei","Peng Ying","Tongfeng Sun","Xinzheng Xu"],"pdf_url":"https://arxiv.org/pdf/2412.02230v1.pdf","comment":"12 pages, 2 figures"},{"id":"http://arxiv.org/abs/2412.02228v1","updated":"2024-12-03T07:51:14Z","published":"2024-12-03T07:51:14Z","title":"BANER: Boundary-Aware LLMs for Few-Shot Named Entity Recognition","summary":" Despite the recent success of two-stage prototypical networks in few-shot\nnamed entity recognition (NER), challenges such as over/under-detected false\nspans in the span detection stage and unaligned entity prototypes in the type\nclassification stage persist. Additionally, LLMs have not proven to be\neffective few-shot information extractors in general. In this paper, we propose\nan approach called Boundary-Aware LLMs for Few-Shot Named Entity Recognition to\naddress these issues. We introduce a boundary-aware contrastive learning\nstrategy to enhance the LLM's ability to perceive entity boundaries for\ngeneralized entity spans. 
Additionally, we utilize LoRAHub to align information\nfrom the target domain to the source domain, thereby enhancing adaptive\ncross-domain classification capabilities. Extensive experiments across various\nbenchmarks demonstrate that our framework outperforms prior methods, validating\nits effectiveness. In particular, the proposed strategies demonstrate\neffectiveness across a range of LLM architectures. The code and data are\nreleased on https://github.com/UESTC-GQJ/BANER.\n","authors":["Quanjiang Guo","Yihong Dong","Ling Tian","Zhao Kang","Yu Zhang","Sijie Wang"],"pdf_url":"https://arxiv.org/pdf/2412.02228v1.pdf","comment":"Appear on COLING 2025"},{"id":"http://arxiv.org/abs/2403.08978v2","updated":"2024-12-03T07:36:47Z","published":"2024-03-13T22:06:03Z","title":"AutoGuide: Automated Generation and Selection of Context-Aware\n Guidelines for Large Language Model Agents","summary":" Recent advances in large language models (LLMs) have empowered AI agents\ncapable of performing various sequential decision-making tasks. However,\neffectively guiding LLMs to perform well in unfamiliar domains like web\nnavigation, where they lack sufficient knowledge, has proven to be difficult\nwith the demonstration-based in-context learning paradigm. In this paper, we\nintroduce a novel framework, called AutoGuide, which addresses this limitation\nby automatically generating context-aware guidelines from offline experiences.\nImportantly, each context-aware guideline is expressed in concise natural\nlanguage and follows a conditional structure, clearly describing the context\nwhere it is applicable. As a result, our guidelines facilitate the provision of\nrelevant knowledge for the agent's current decision-making process, overcoming\nthe limitations of the conventional demonstration-based learning paradigm. 
Our\nevaluation demonstrates that AutoGuide significantly outperforms competitive\nbaselines in complex benchmark domains, including real-world web navigation.\n","authors":["Yao Fu","Dong-Ki Kim","Jaekyeom Kim","Sungryull Sohn","Lajanugen Logeswaran","Kyunghoon Bae","Honglak Lee"],"pdf_url":"https://arxiv.org/pdf/2403.08978v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02220v1","updated":"2024-12-03T07:25:30Z","published":"2024-12-03T07:25:30Z","title":"Unlocking Tuning-Free Few-Shot Adaptability in Visual Foundation Models\n by Recycling Pre-Tuned LoRAs","summary":" Large Language Models (LLMs) such as ChatGPT demonstrate strong few-shot\nadaptability without requiring fine-tuning, positioning them ideal for\ndata-limited and real-time applications. However, this adaptability has not yet\nbeen replicated in current Visual Foundation Models (VFMs), which require\nexplicit fine-tuning with sufficient tuning data. Besides, the\npretraining-finetuning paradigm has led to the surge of numerous task-specific\nmodular components, such as Low-Rank Adaptation (LoRA). For the first time, we\nexplore the potential of reusing diverse pre-tuned LoRAs without accessing\ntheir original training data, to achieve tuning-free few-shot adaptation in\nVFMs. Our framework, LoRA Recycle, distills a meta-LoRA from diverse pre-tuned\nLoRAs with a meta-learning objective, using surrogate data generated inversely\nfrom pre-tuned LoRAs themselves. The VFM, once equipped with the meta-LoRA, is\nempowered to solve new few-shot tasks in a single forward pass, akin to the\nin-context learning of LLMs. 
Additionally, we incorporate a double-efficient\nmechanism tailored to our framework, significantly accelerating the\nmeta-training process while maintaining or even improving performance.\nExtensive experiments across various few-shot classification benchmarks across\nboth in- and cross-domain scenarios demonstrate the superiority of our\nframework.\n","authors":["Zixuan Hu","Yongxian Wei","Li Shen","Chun Yuan","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2412.02220v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.08830v2","updated":"2024-12-03T07:23:25Z","published":"2024-06-13T05:49:29Z","title":"Center-Sensitive Kernel Optimization for Efficient On-Device Incremental\n Learning","summary":" To facilitate the evolution of edge intelligence in ever-changing\nenvironments, we study on-device incremental learning constrained in limited\ncomputation resource in this paper. Current on-device training methods just\nfocus on efficient training without considering the catastrophic forgetting,\npreventing the model getting stronger when continually exploring the world. To\nsolve this problem, a direct solution is to involve the existing incremental\nlearning mechanisms into the on-device training framework. Unfortunately, such\na manner cannot work well as those mechanisms usually introduce large\nadditional computational cost to the network optimization process, which would\ninevitably exceed the memory capacity of the edge devices. To address this\nissue, this paper makes an early effort to propose a simple but effective\nedge-friendly incremental learning framework. Based on an empirical study on\nthe knowledge intensity of the kernel elements of the neural network, we find\nthat the center kernel is the key for maximizing the knowledge intensity for\nlearning new data, while freezing the other kernel elements would get a good\nbalance on the model's capacity for overcoming catastrophic forgetting. 
Upon\nthis finding, we further design a center-sensitive kernel optimization\nframework to largely alleviate the cost of the gradient computation and\nback-propagation. Besides, a dynamic channel element selection strategy is also\nproposed to facilitate a sparse orthogonal gradient projection for further\nreducing the optimization complexity, upon the knowledge explored from the new\ntask data. Extensive experiments validate our method is efficient and\neffective, e.g., our method achieves average accuracy boost of 38.08% with even\nless memory and approximate computation compared to existing on-device training\nmethods, indicating its significant potential for on-device incremental\nlearning.\n","authors":["Dingwen Zhang","Yan Li","De Cheng","Nannan Wang","Junwei Han"],"pdf_url":"https://arxiv.org/pdf/2406.08830v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.00156v2","updated":"2024-12-03T07:18:25Z","published":"2024-11-29T08:10:49Z","title":"VISION-XL: High Definition Video Inverse Problem Solver using Latent\n Image Diffusion Models","summary":" In this paper, we propose a novel framework for solving high-definition video\ninverse problems using latent image diffusion models. Building on recent\nadvancements in spatio-temporal optimization for video inverse problems using\nimage diffusion models, our approach leverages latent-space diffusion models to\nachieve enhanced video quality and resolution. To address the high\ncomputational demands of processing high-resolution frames, we introduce a\npseudo-batch consistent sampling strategy, allowing efficient operation on a\nsingle GPU. Additionally, to improve temporal consistency, we present\nbatch-consistent inversion, an initialization technique that incorporates\ninformative latents from the measurement frame. 
By integrating with SDXL, our\nframework achieves state-of-the-art video reconstruction across a wide range of\nspatio-temporal inverse problems, including complex combinations of frame\naveraging and various spatial degradations, such as deblurring,\nsuper-resolution, and inpainting. Unlike previous methods, our approach\nsupports multiple aspect ratios (landscape, vertical, and square) and delivers\nHD-resolution reconstructions (exceeding 1280x720) in under 2.5 minutes on a\nsingle NVIDIA 4090 GPU.\n","authors":["Taesung Kwon","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2412.00156v2.pdf","comment":"Project page: https://vision-xl.github.io/"},{"id":"http://arxiv.org/abs/2412.02215v1","updated":"2024-12-03T07:11:21Z","published":"2024-12-03T07:11:21Z","title":"Recovering implicit physics model under real-world constraints","summary":" Recovering a physics-driven model, i.e. a governing set of equations of the\nunderlying dynamical systems, from the real-world data has been of recent\ninterest. Most existing methods either operate on simulation data with\nunrealistically high sampling rates or require explicit measurements of all\nsystem variables, which is not amenable in real-world deployments. Moreover,\nthey assume the timestamps of external perturbations to the physical system are\nknown a priori, without uncertainty, implicitly discounting any sensor\ntime-synchronization or human reporting errors. In this paper, we propose a\nnovel liquid time constant neural network (LTC-NN) based architecture to\nrecover underlying model of physical dynamics from real-world data. 
The\nautomatic differentiation property of LTC-NN nodes overcomes problems\nassociated with low sampling rates, the input dependent time constant in the\nforward pass of the hidden layer of LTC-NN nodes creates a massive search space\nof implicit physical dynamics, the physics model solver based data\nreconstruction loss guides the search for the correct set of implicit dynamics,\nand the use of the dropout regularization in the dense layer ensures extraction\nof the sparsest model. Further, to account for the perturbation timing error,\nwe utilize dense layer nodes to search through input shifts that results in the\nlowest reconstruction loss. Experiments on four benchmark dynamical systems,\nthree with simulation data and one with the real-world data show that the\nLTC-NN architecture is more accurate in recovering implicit physics model\ncoefficients than the state-of-the-art sparse model recovery approaches. We\nalso introduce four additional case studies (total eight) on real-life medical\nexamples in simulation and with real-world clinical data to show effectiveness\nof our approach in recovering underlying model in practice.\n","authors":["Ayan Banerjee","Sandeep K. S. Gupta"],"pdf_url":"https://arxiv.org/pdf/2412.02215v1.pdf","comment":"This paper is published in ECAI 2024,\n https://ebooks.iospress.nl/volumearticle/69651"},{"id":"http://arxiv.org/abs/2404.18247v2","updated":"2024-12-03T07:07:45Z","published":"2024-04-28T17:02:24Z","title":"Classical integrability in the presence of a cosmological constant:\n analytic and machine learning results","summary":" We study the integrability of two-dimensional theories that are obtained by a\ndimensional reduction of certain four-dimensional gravitational theories\ndescribing the coupling of Maxwell fields and neutral scalar fields to gravity\nin the presence of a potential for the neutral scalar fields. 
For a certain\nsolution subspace, we demonstrate partial integrability by showing that a\nsubset of the equations of motion in two dimensions are the compatibility\nconditions for a linear system. Subsequently, we study the integrability of\nthese two-dimensional models from a complementary one-dimensional point of\nview, framed in terms of Liouville integrability. In this endeavour, we employ\nvarious machine learning techniques to systematise our search for numerical Lax\npair matrices for these models, as well as conserved currents expressed as\nfunctions of phase space variables.\n","authors":["Gabriel Lopes Cardoso","Damián Mayorga Peña","Suresh Nampuri"],"pdf_url":"https://arxiv.org/pdf/2404.18247v2.pdf","comment":"38 pages, 9 figures, typographical corrections and assorted\n improvements"},{"id":"http://arxiv.org/abs/2412.02211v1","updated":"2024-12-03T07:04:10Z","published":"2024-12-03T07:04:10Z","title":"An Automated Data Mining Framework Using Autoencoders for Feature\n Extraction and Dimensionality Reduction","summary":" This study proposes an automated data mining framework based on autoencoders\nand experimentally verifies its effectiveness in feature extraction and data\ndimensionality reduction. Through the encoding-decoding structure, the\nautoencoder can capture the data's potential characteristics and achieve noise\nreduction and anomaly detection, providing an efficient and stable solution for\nthe data mining process. The experiment compared the performance of the\nautoencoder with traditional dimensionality reduction methods (such as PCA, FA,\nT-SNE, and UMAP). The results showed that the autoencoder performed best in\nterms of reconstruction error and root mean square error and could better\nretain data structure and enhance the generalization ability of the model. The\nautoencoder-based framework not only reduces manual intervention but also\nsignificantly improves the automation of data processing. 
In the future, with\nthe advancement of deep learning and big data technology, the autoencoder\nmethod combined with a generative adversarial network (GAN) or graph neural\nnetwork (GNN) is expected to be more widely used in the fields of complex data\nprocessing, real-time data analysis and intelligent decision-making.\n","authors":["Yaxin Liang","Xinshi Li","Xin Huang","Ziqi Zhang","Yue Yao"],"pdf_url":"https://arxiv.org/pdf/2412.02211v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10656v2","updated":"2024-12-03T07:02:05Z","published":"2023-08-21T11:48:34Z","title":"Practical Parallel Algorithms for Non-Monotone Submodular Maximization","summary":" Submodular maximization has found extensive applications in various domains\nwithin the field of artificial intelligence, including but not limited to\nmachine learning, computer vision, and natural language processing. With the\nincreasing size of datasets in these domains, there is a pressing need to\ndevelop efficient and parallelizable algorithms for submodular maximization.\nOne measure of the parallelizability of a submodular maximization algorithm is\nits adaptive complexity, which indicates the number of sequential rounds where\na polynomial number of queries to the objective function can be executed in\nparallel. In this paper, we study the problem of non-monotone submodular\nmaximization subject to a knapsack constraint, and propose the first\ncombinatorial algorithm achieving an $(8+\\epsilon)$-approximation under\n$\\mathcal{O}(\\log n)$ adaptive complexity, which is \\textit{optimal} up to a\nfactor of $\\mathcal{O}(\\log\\log n)$. Moreover, we also propose the first\nalgorithm with both provable approximation ratio and sublinear adaptive\ncomplexity for the problem of non-monotone submodular maximization subject to a\n$k$-system constraint. 
As a by-product, we show that our two algorithms can\nalso be applied to the special case of submodular maximization subject to a\ncardinality constraint, and achieve performance bounds comparable with those of\nstate-of-the-art algorithms. Finally, the effectiveness of our approach is\ndemonstrated by extensive experiments on real-world applications.\n","authors":["Shuang Cui","Kai Han","Jing Tang","Xueying Li","Aakas Zhiyuli","Hanxiao Li"],"pdf_url":"https://arxiv.org/pdf/2308.10656v2.pdf","comment":"Part of the contribution appears in AAAI-2023"},{"id":"http://arxiv.org/abs/2112.04948v2","updated":"2024-12-03T07:00:13Z","published":"2021-12-09T14:26:13Z","title":"Guardian of the Ensembles: Introducing Pairwise Adversarially Robust\n Loss for Resisting Adversarial Attacks in DNN Ensembles","summary":" Adversarial attacks rely on transferability, where an adversarial example\n(AE) crafted on a surrogate classifier tends to mislead a target classifier.\nRecent ensemble methods demonstrate that AEs are less likely to mislead\nmultiple classifiers in an ensemble. This paper proposes a new ensemble\ntraining using a Pairwise Adversarially Robust Loss (PARL) that by construction\nproduces an ensemble of classifiers with diverse decision boundaries. PARL\nutilizes outputs and gradients of each layer with respect to network parameters\nin every classifier within the ensemble simultaneously. PARL is demonstrated to\nachieve higher robustness against black-box transfer attacks than previous\nensemble methods as well as adversarial training without adversely affecting\nclean example accuracy. Extensive experiments using standard Resnet20,\nWideResnet28-10 classifiers demonstrate the robustness of PARL against\nstate-of-the-art adversarial attacks. 
While maintaining similar clean accuracy\nand lesser training time, the proposed architecture has a 24.8% increase in\nrobust accuracy ($\\epsilon$ = 0.07) from the state-of-the art method.\n","authors":["Shubhi Shukla","Subhadeep Dalui","Manaar Alam","Shubhajit Datta","Arijit Mondal","Debdeep Mukhopadhyay","Partha Pratim Chakrabarti"],"pdf_url":"https://arxiv.org/pdf/2112.04948v2.pdf","comment":"Accepted at IEEE/CVF Winter Conference on Applications of Computer\n Vision (WACV 2025)"},{"id":"http://arxiv.org/abs/2408.17355v3","updated":"2024-12-03T06:53:58Z","published":"2024-08-30T15:39:34Z","title":"Bidirectional Decoding: Improving Action Chunking via Closed-Loop\n Resampling","summary":" Predicting and executing a sequence of actions without intermediate\nreplanning, known as action chunking, is increasingly used in robot learning\nfrom human demonstrations. Yet, its reported effects on the learned policy are\ninconsistent: some studies find it crucial for achieving strong results, while\nothers observe decreased performance. In this paper, we first dissect how\naction chunking impacts the divergence between a learner and a demonstrator. We\nfind that action chunking allows the learner to better capture the temporal\ndependencies in demonstrations but at the cost of reduced reactivity in\nstochastic environments. To address this tradeoff, we propose Bidirectional\nDecoding (BID), a test-time inference algorithm that bridges action chunking\nwith closed-loop operations. BID samples multiple predictions at each time step\nand searches for the optimal one based on two criteria: (i) backward coherence,\nwhich favors samples that align with previous decisions; (ii) forward contrast,\nwhich seeks samples of high likelihood for future plans. By coupling decisions\nwithin and across action chunks, BID promotes consistency over time while\nmaintaining reactivity to unexpected changes. 
Experimental results show that\nBID boosts the performance of two state-of-the-art generative policies across\nseven simulation benchmarks and two real-world tasks. Code and videos are\navailable at https://bid-robot.github.io.\n","authors":["Yuejiang Liu","Jubayer Ibn Hamid","Annie Xie","Yoonho Lee","Maximilian Du","Chelsea Finn"],"pdf_url":"https://arxiv.org/pdf/2408.17355v3.pdf","comment":"Project website: https://bid-robot.github.io/"},{"id":"http://arxiv.org/abs/2402.10946v3","updated":"2024-12-03T06:52:34Z","published":"2024-02-09T04:02:43Z","title":"CultureLLM: Incorporating Cultural Differences into Large Language\n Models","summary":" Large language models (LLMs) are reported to be partial to certain cultures\nowing to the training data dominance from the English corpora. Since\nmultilingual cultural data are often expensive to collect, existing efforts\nhandle this by prompt engineering or culture-specific pre-training. However,\nthey might overlook the knowledge deficiency of low-resource culture and\nrequire extensive computing resources. In this paper, we propose CultureLLM, a\ncost-effective solution to incorporate cultural differences into LLMs.\nCultureLLM adopts World Value Survey (WVS) as seed data and generates\nsemantically equivalent training data via the proposed semantic data\naugmentation. Using only 50 seed samples from WVS with augmented data, we\nfine-tune culture-specific LLMs and one unified model (CultureLLM-One) for 9\ncultures covering rich and low-resource languages. Extensive experiments on 60\nculture-related datasets demonstrate that CultureLLM significantly outperforms\nvarious counterparts such as GPT-3.5 (by 8.1%) and Gemini Pro (by 9.5%) with\ncomparable performance to GPT-4 or even better. Our human study shows that the\ngenerated samples are semantically equivalent to the original samples,\nproviding an effective solution for LLMs augmentation. 
Code is released at\nhttps://github.com/Scarelette/CultureLLM.\n","authors":["Cheng Li","Mengzhou Chen","Jindong Wang","Sunayana Sitaram","Xing Xie"],"pdf_url":"https://arxiv.org/pdf/2402.10946v3.pdf","comment":"NeurIPS 2024; Code is at https://github.com/Scarelette/CultureLLM"},{"id":"http://arxiv.org/abs/2409.18169v5","updated":"2024-12-03T06:52:11Z","published":"2024-09-26T17:55:22Z","title":"Harmful Fine-tuning Attacks and Defenses for Large Language Models: A\n Survey","summary":" Recent research demonstrates that the nascent fine-tuning-as-a-service\nbusiness model exposes serious safety concerns -- fine-tuning over a few\nharmful data uploaded by the users can compromise the safety alignment of the\nmodel. The attack, known as harmful fine-tuning attack, has raised a broad\nresearch interest among the community. However, as the attack is still new,\n\\textbf{we observe that there are general misunderstandings within the research\ncommunity.} To clear up concern, this paper provide a comprehensive overview to\nthree aspects of harmful fine-tuning: attacks setting, defense design and\nevaluation methodology. Specifically, we first present the threat model of the\nproblem, and introduce the harmful fine-tuning attack and its variants. Then we\nsystematically survey the existing literature on attacks/defenses/mechanical\nanalysis of the problem. Finally, we introduce the evaluation methodology and\noutline future research directions that might contribute to the development of\nthe field. Additionally, we present a list of questions of interest, which\nmight be useful to refer to when reviewers in the peer review process question\nthe realism of the experiment/attack/defense setting. 
A curated list of\nrelevant papers is maintained and made accessible at:\nhttps://github.com/git-disl/awesome_LLM-harmful-fine-tuning-papers.\n","authors":["Tiansheng Huang","Sihao Hu","Fatih Ilhan","Selim Furkan Tekin","Ling Liu"],"pdf_url":"https://arxiv.org/pdf/2409.18169v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.15143v3","updated":"2024-12-03T06:43:39Z","published":"2024-05-24T01:45:27Z","title":"Intelligent Go-Explore: Standing on the Shoulders of Giant Foundation\n Models","summary":" Go-Explore is a powerful family of algorithms designed to solve\nhard-exploration problems built on the principle of archiving discovered\nstates, and iteratively returning to and exploring from the most promising\nstates. This approach has led to superhuman performance across a wide variety\nof challenging problems including Atari games and robotic control, but requires\nmanually designing heuristics to guide exploration (i.e., determine which\nstates to save and explore from, and what actions to consider next), which is\ntime-consuming and infeasible in general. To resolve this, we propose\nIntelligent Go-Explore (IGE) which greatly extends the scope of the original\nGo-Explore by replacing these handcrafted heuristics with the intelligence and\ninternalized human notions of interestingness captured by giant pretrained\nfoundation models (FMs). This provides IGE with a human-like ability to\ninstinctively identify how interesting or promising any new state is (e.g.,\ndiscovering new objects, locations, or behaviors), even in complex environments\nwhere heuristics are hard to define. Moreover, IGE offers the exciting\nopportunity to recognize and capitalize on serendipitous discoveries-states\nencountered during exploration that are valuable in terms of exploration, yet\nwhere what makes them interesting was not anticipated by the human user. We\nevaluate our algorithm on a diverse range of language and vision-based tasks\nthat require search and exploration. 
Across these tasks, IGE strongly exceeds\nclassic reinforcement learning and graph search baselines, and also succeeds\nwhere prior state-of-the-art FM agents like Reflexion completely fail. Overall,\nIntelligent Go-Explore combines the tremendous strengths of FMs and the\npowerful Go-Explore algorithm, opening up a new frontier of research into\ncreating more generally capable agents with impressive exploration\ncapabilities.\n","authors":["Cong Lu","Shengran Hu","Jeff Clune"],"pdf_url":"https://arxiv.org/pdf/2405.15143v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02196v1","updated":"2024-12-03T06:21:35Z","published":"2024-12-03T06:21:35Z","title":"SA-GNAS: Seed Architecture Expansion for Efficient Large-scale Graph\n Neural Architecture Search","summary":" GNAS (Graph Neural Architecture Search) has demonstrated great effectiveness\nin automatically designing the optimal graph neural architectures for multiple\ndownstream tasks, such as node classification and link prediction. However,\nmost existing GNAS methods cannot efficiently handle large-scale graphs\ncontaining more than million-scale nodes and edges due to the expensive\ncomputational and memory overhead. To scale GNAS on large graphs while\nachieving better performance, we propose SA-GNAS, a novel framework based on\nseed architecture expansion for efficient large-scale GNAS. Similar to the cell\nexpansion in biotechnology, we first construct a seed architecture and then\nexpand the seed architecture iteratively. Specifically, we first propose a\nperformance ranking consistency-based seed architecture selection method, which\nselects the architecture searched on the subgraph that best matches the\noriginal large-scale graph. Then, we propose an entropy minimization-based seed\narchitecture expansion method to further improve the performance of the seed\narchitecture. 
Extensive experimental results on five large-scale graphs\ndemonstrate that the proposed SA-GNAS outperforms human-designed\nstate-of-the-art GNN architectures and existing graph NAS methods. Moreover,\nSA-GNAS can significantly reduce the search time, showing better search\nefficiency. For the largest graph with billion edges, SA-GNAS can achieve 2.8\ntimes speedup compared to the SOTA large-scale GNAS method GAUSS. Additionally,\nsince SA-GNAS is inherently parallelized, the search efficiency can be further\nimproved with more GPUs. SA-GNAS is available at\nhttps://github.com/PasaLab/SAGNAS.\n","authors":["Guanghui Zhu","Zipeng Ji","Jingyan Chen","Limin Wang","Chunfeng Yuan","Yihua Huang"],"pdf_url":"https://arxiv.org/pdf/2412.02196v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02187v1","updated":"2024-12-03T05:59:34Z","published":"2024-12-03T05:59:34Z","title":"Deep Learning, Machine Learning, Advancing Big Data Analytics and\n Management","summary":" Advancements in artificial intelligence, machine learning, and deep learning\nhave catalyzed the transformation of big data analytics and management into\npivotal domains for research and application. This work explores the\ntheoretical foundations, methodological advancements, and practical\nimplementations of these technologies, emphasizing their role in uncovering\nactionable insights from massive, high-dimensional datasets. The study presents\na systematic overview of data preprocessing techniques, including data\ncleaning, normalization, integration, and dimensionality reduction, to prepare\nraw data for analysis. Core analytics methodologies such as classification,\nclustering, regression, and anomaly detection are examined, with a focus on\nalgorithmic innovation and scalability. 
Furthermore, the text delves into\nstate-of-the-art frameworks for data mining and predictive modeling,\nhighlighting the role of neural networks, support vector machines, and ensemble\nmethods in tackling complex analytical challenges. Special emphasis is placed\non the convergence of big data with distributed computing paradigms, including\ncloud and edge computing, to address challenges in storage, computation, and\nreal-time analytics. The integration of ethical considerations, including data\nprivacy and compliance with global standards, ensures a holistic perspective on\ndata management. Practical applications across healthcare, finance, marketing,\nand policy-making illustrate the real-world impact of these technologies.\nThrough comprehensive case studies and Python-based implementations, this work\nequips researchers, practitioners, and data enthusiasts with the tools to\nnavigate the complexities of modern data analytics. It bridges the gap between\ntheory and practice, fostering the development of innovative solutions for\nmanaging and leveraging data in the era of artificial intelligence.\n","authors":["Weiche Hsieh","Ziqian Bi","Keyu Chen","Benji Peng","Sen Zhang","Jiawei Xu","Jinlang Wang","Caitlyn Heqi Yin","Yichao Zhang","Pohsun Feng","Yizhu Wen","Tianyang Wang","Ming Li","Chia Xin Liang","Jintao Ren","Qian Niu","Silin Chen","Lawrence K. Q. 
Yan","Han Xu","Hong-Ming Tseng","Xinyuan Song","Bowen Jing","Junjie Yang","Junhao Song","Junyu Liu","Ming Liu"],"pdf_url":"https://arxiv.org/pdf/2412.02187v1.pdf","comment":"174 pages"},{"id":"http://arxiv.org/abs/2410.15876v3","updated":"2024-12-03T05:59:09Z","published":"2024-10-21T10:57:45Z","title":"FlickerFusion: Intra-trajectory Domain Generalizing Multi-Agent RL","summary":" Multi-agent reinforcement learning has demonstrated significant potential in\naddressing complex cooperative tasks across various real-world applications.\nHowever, existing MARL approaches often rely on the restrictive assumption that\nthe number of entities (e.g., agents, obstacles) remains constant between\ntraining and inference. This overlooks scenarios where entities are dynamically\nremoved or added during the inference trajectory -- a common occurrence in\nreal-world environments like search and rescue missions and dynamic combat\nsituations. In this paper, we tackle the challenge of intra-trajectory dynamic\nentity composition under zero-shot out-of-domain (OOD) generalization, where\nsuch dynamic changes cannot be anticipated beforehand. Our empirical studies\nreveal that existing MARL methods suffer significant performance degradation\nand increased uncertainty in these scenarios. In response, we propose\nFlickerFusion, a novel OOD generalization method that acts as a universally\napplicable augmentation technique for MARL backbone methods. FlickerFusion\nstochastically drops out parts of the observation space, emulating being\nin-domain when inferenced OOD. The results show that FlickerFusion not only\nachieves superior inference rewards but also uniquely reduces uncertainty\nvis-\\`a-vis the backbone, compared to existing methods. 
Benchmarks,\nimplementations, and model weights are organized and open-sourced at\nflickerfusion305.github.io, accompanied by ample demo video renderings.\n","authors":["Woosung Koh","Wonbeen Oh","Siyeol Kim","Suhin Shin","Hyeongjin Kim","Jaein Jang","Junghyun Lee","Se-Young Yun"],"pdf_url":"https://arxiv.org/pdf/2410.15876v3.pdf","comment":"NeurIPS '24 Open-World Agents Workshop"},{"id":"http://arxiv.org/abs/2412.01650v2","updated":"2024-12-03T05:46:35Z","published":"2024-12-02T15:59:35Z","title":"Privacy-Preserving Federated Learning via Homomorphic Adversarial\n Networks","summary":" Privacy-preserving federated learning (PPFL) aims to train a global model for\nmultiple clients while maintaining their data privacy. However, current PPFL\nprotocols exhibit one or more of the following insufficiencies: considerable\ndegradation in accuracy, the requirement for sharing keys, and cooperation\nduring the key generation or decryption processes. As a mitigation, we develop\nthe first protocol that utilizes neural networks to implement PPFL, as well as\nincorporating an Aggregatable Hybrid Encryption scheme tailored to the needs of\nPPFL. We name these networks as Homomorphic Adversarial Networks (HANs) which\ndemonstrate that neural networks are capable of performing tasks similar to\nmulti-key homomorphic encryption (MK-HE) while solving the problems of key\ndistribution and collaborative decryption. Our experiments show that HANs are\nrobust against privacy attacks. Compared with non-private federated learning,\nexperiments conducted on multiple datasets demonstrate that HANs exhibit a\nnegligible accuracy loss (at most 1.35%). 
Compared to traditional MK-HE\nschemes, HANs increase encryption aggregation speed by 6,075 times while\nincurring a 29.2 times increase in communication overhead.\n","authors":["Wenhan Dong","Chao Lin","Xinlei He","Xinyi Huang","Shengmin Xu"],"pdf_url":"https://arxiv.org/pdf/2412.01650v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02181v1","updated":"2024-12-03T05:35:44Z","published":"2024-12-03T05:35:44Z","title":"Generalizing Weisfeiler-Lehman Kernels to Subgraphs","summary":" Subgraph representation learning has been effective in solving various\nreal-world problems. However, current graph neural networks (GNNs) produce\nsuboptimal results for subgraph-level tasks due to their inability to capture\ncomplex interactions within and between subgraphs. To provide a more expressive\nand efficient alternative, we propose WLKS, a Weisfeiler-Lehman (WL) kernel\ngeneralized for subgraphs by applying the WL algorithm on induced $k$-hop\nneighborhoods. We combine kernels across different $k$-hop levels to capture\nricher structural information that is not fully encoded in existing models. Our\napproach can balance expressiveness and efficiency by eliminating the need for\nneighborhood sampling. In experiments on eight real-world and synthetic\nbenchmarks, WLKS significantly outperforms leading approaches on five datasets\nwhile reducing training time, ranging from 0.01x to 0.25x compared to the\nstate-of-the-art.\n","authors":["Dongkwan Kim","Alice Oh"],"pdf_url":"https://arxiv.org/pdf/2412.02181v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2408.17151v2","updated":"2024-12-03T05:27:59Z","published":"2024-08-30T09:40:52Z","title":"Investigating Privacy Leakage in Dimensionality Reduction Methods via\n Reconstruction Attack","summary":" This study investigates privacy leakage in dimensionality reduction methods\nthrough a novel machine learning-based reconstruction attack. 
Employing an\ninformed adversary threat model, we develop a neural network capable of\nreconstructing high-dimensional data from low-dimensional embeddings.\n We evaluate six popular dimensionality reduction techniques: PCA, sparse\nrandom projection (SRP), multidimensional scaling (MDS), Isomap, t-SNE, and\nUMAP. Using both MNIST and NIH Chest X-ray datasets, we perform a qualitative\nanalysis to identify key factors affecting reconstruction quality. Furthermore,\nwe assess the effectiveness of an additive noise mechanism in mitigating these\nreconstruction attacks. Our experimental results on both datasets reveal that\nthe attack is effective against deterministic methods (PCA and Isomap), but\nineffective against methods that employ random initialization (SRP, MDS, t-SNE\nand UMAP). When adding the images with large noises before performing PCA or\nIsomap, the attack produced severely distorted reconstructions. In contrast,\nfor the other four methods, the reconstructions still show some recognizable\nfeatures, though they bear little resemblance to the original images.\n","authors":["Chayadon Lumbut","Donlapark Ponnoprat"],"pdf_url":"https://arxiv.org/pdf/2408.17151v2.pdf","comment":"Major revision"},{"id":"http://arxiv.org/abs/2412.02175v1","updated":"2024-12-03T05:20:05Z","published":"2024-12-03T05:20:05Z","title":"Improved Complexity for Smooth Nonconvex Optimization: A Two-Level\n Online Learning Approach with Quasi-Newton Methods","summary":" We study the problem of finding an $\\epsilon$-first-order stationary point\n(FOSP) of a smooth function, given access only to gradient information. The\nbest-known gradient query complexity for this task, assuming both the gradient\nand Hessian of the objective function are Lipschitz continuous, is\n${O}(\\epsilon^{-7/4})$. 
In this work, we propose a method with a gradient\ncomplexity of ${O}(d^{1/4}\\epsilon^{-13/8})$, where $d$ is the problem\ndimension, leading to an improved complexity when $d = {O}(\\epsilon^{-1/2})$.\nTo achieve this result, we design an optimization algorithm that, underneath,\ninvolves solving two online learning problems. Specifically, we first\nreformulate the task of finding a stationary point for a nonconvex problem as\nminimizing the regret in an online convex optimization problem, where the loss\nis determined by the gradient of the objective function. Then, we introduce a\nnovel optimistic quasi-Newton method to solve this online learning problem,\nwith the Hessian approximation update itself framed as an online learning\nproblem in the space of matrices. Beyond improving the complexity bound for\nachieving an $\\epsilon$-FOSP using a gradient oracle, our result provides the\nfirst guarantee suggesting that quasi-Newton methods can potentially outperform\ngradient descent-type methods in nonconvex settings.\n","authors":["Ruichen Jiang","Aryan Mokhtari","Francisco Patitucci"],"pdf_url":"https://arxiv.org/pdf/2412.02175v1.pdf","comment":"35 pages"},{"id":"http://arxiv.org/abs/2403.12820v3","updated":"2024-12-03T05:15:15Z","published":"2024-03-19T15:21:00Z","title":"A Physics-embedded Deep Learning Framework for Cloth Simulation","summary":" Delicate cloth simulations have long been desired in computer graphics.\nVarious methods were proposed to improve engaged force interactions, collision\nhandling, and numerical integrations. Deep learning has the potential to\nachieve fast and real-time simulation, but common neural network structures\noften demand many parameters to capture cloth dynamics. This paper proposes a\nphysics-embedded learning framework that directly encodes physical features of\ncloth simulation. 
The convolutional neural network is used to represent spatial\ncorrelations of the mass-spring system, after which three branches are designed\nto learn linear, nonlinear, and time derivate features of cloth physics. The\nframework can also integrate with other external forces and collision handling\nthrough either traditional simulators or sub neural networks. The model is\ntested across different cloth animation cases, without training with new data.\nAgreement with baselines and predictive realism successfully validate its\ngeneralization ability. Inference efficiency of the proposed model also defeats\ntraditional physics simulation. This framework is also designed to easily\nintegrate with other visual refinement techniques like wrinkle carving, which\nleaves significant chances to incorporate prevailing macing learning techniques\nin 3D cloth amination.\n","authors":["Zhiwei Zhao"],"pdf_url":"https://arxiv.org/pdf/2403.12820v3.pdf","comment":"updated version"},{"id":"http://arxiv.org/abs/2412.01253v2","updated":"2024-12-03T04:51:10Z","published":"2024-12-02T08:22:56Z","title":"Yi-Lightning Technical Report","summary":" This technical report presents Yi-Lightning, our latest flagship large\nlanguage model (LLM). It achieves exceptional performance, ranking 6th overall\non Chatbot Arena, with particularly strong results (2nd to 4th place) in\nspecialized categories including Chinese, Math, Coding, and Hard Prompts.\nYi-Lightning leverages an enhanced Mixture-of-Experts (MoE) architecture,\nfeaturing advanced expert segmentation and routing mechanisms coupled with\noptimized KV-caching techniques. 
Our development process encompasses\ncomprehensive pre-training, supervised fine-tuning (SFT), and reinforcement\nlearning from human feedback (RLHF), where we devise deliberate strategies for\nmulti-stage training, synthetic data construction, and reward modeling.\nFurthermore, we implement RAISE (Responsible AI Safety Engine), a\nfour-component framework to address safety issues across pre-training,\npost-training, and serving phases. Empowered by our scalable super-computing\ninfrastructure, all these innovations substantially reduce training, deployment\nand inference costs while maintaining high-performance standards. With further\nevaluations on public academic benchmarks, Yi-Lightning demonstrates\ncompetitive performance against top-tier LLMs, while we observe a notable\ndisparity between traditional, static benchmark results and real-world, dynamic\nhuman preferences. This observation prompts a critical reassessment of\nconventional benchmarks' utility in guiding the development of more intelligent\nand powerful AI systems for practical applications. Yi-Lightning is now\navailable through our developer platform at https://platform.lingyiwanwu.com.\n","authors":["01. AI"," :","Alan Wake","Albert Wang","Bei Chen","C. X. 
Lv","Chao Li","Chengen Huang","Chenglin Cai","Chujie Zheng","Daniel Cooper","Ethan Dai","Fan Zhou","Feng Hu","Heng Ji","Howard Qiu","Jiangcheng Zhu","Jun Tian","Katherine Su","Lihuan Zhang","Liying Li","Ming Song","Mou Li","Peng Liu","Qichen Hu","Shawn Wang","Shijun Zhou","Shiyong Li","Tianhang Zhu","Wen Xie","Xiang He","Xiaobo Chen","Xiaohui Hu","Xiaoyi Ren","Xinyao Niu","Yanpeng Li","Yongke Zhao","Yongzhen Luo","Yuchi Xu","Yuxuan Sha","Zhaodong Yan","Zhiyuan Liu","Zirui Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.01253v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.01460v2","updated":"2024-12-03T04:48:22Z","published":"2024-12-02T12:54:11Z","title":"A Comprehensive Study of Shapley Value in Data Analytics","summary":" Over the recent years, Shapley value (SV), a solution concept from\ncooperative game theory, has found numerous applications in data analytics\n(DA). This paper provides the first comprehensive study of SV used throughout\nthe DA workflow, which involves three main steps: data fabric, data\nexploration, and result reporting. We summarize existing versatile forms of SV\nused in these steps by a unified definition and clarify the essential\nfunctionalities that SV can provide for data scientists. We categorize the arts\nin this field based on the technical challenges they tackled, which include\ncomputation efficiency, approximation error, privacy preservation, and\nappropriate interpretations. We discuss these challenges and analyze the\ncorresponding solutions. We also implement SVBench, the first open-sourced\nbenchmark for developing SV applications, and conduct experiments on six DA\ntasks to validate our analysis and discussions. 
Based on the qualitative and\nquantitative results, we identify the limitations of current efforts for\napplying SV to DA and highlight the directions of future research and\nengineering.\n","authors":["Hong Lin","Shixin Wan","Zhongle Xie","Ke Chen","Meihui Zhang","Lidan Shou","Gang Chen"],"pdf_url":"https://arxiv.org/pdf/2412.01460v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.01654v2","updated":"2024-12-03T04:40:13Z","published":"2024-12-02T16:04:15Z","title":"FSMLP: Modelling Channel Dependencies With Simplex Theory Based\n Multi-Layer Perceptions In Frequency Domain","summary":" Time series forecasting (TSF) plays a crucial role in various domains,\nincluding web data analysis, energy consumption prediction, and weather\nforecasting. While Multi-Layer Perceptrons (MLPs) are lightweight and effective\nfor capturing temporal dependencies, they are prone to overfitting when used to\nmodel inter-channel dependencies. In this paper, we investigate the overfitting\nproblem in channel-wise MLPs using Rademacher complexity theory, revealing that\nextreme values in time series data exacerbate this issue. To mitigate this\nissue, we introduce a novel Simplex-MLP layer, where the weights are\nconstrained within a standard simplex. This strategy encourages the model to\nlearn simpler patterns and thereby reducing overfitting to extreme values.\nBased on the Simplex-MLP layer, we propose a novel \\textbf{F}requency\n\\textbf{S}implex \\textbf{MLP} (FSMLP) framework for time series forecasting,\ncomprising of two kinds of modules: \\textbf{S}implex\n\\textbf{C}hannel-\\textbf{W}ise MLP (SCWM) and \\textbf{F}requency\n\\textbf{T}emporal \\textbf{M}LP (FTM). The SCWM effectively leverages the\nSimplex-MLP to capture inter-channel dependencies, while the FTM is a simple\nyet efficient temporal MLP designed to extract temporal information from the\ndata. 
Our theoretical analysis shows that the upper bound of the Rademacher\nComplexity for Simplex-MLP is lower than that for standard MLPs. Moreover, we\nvalidate our proposed method on seven benchmark datasets, demonstrating\nsignificant improvements in forecasting accuracy and efficiency, while also\nshowcasing superior scalability. Additionally, we demonstrate that Simplex-MLP\ncan improve other methods that use channel-wise MLP to achieve less overfitting\nand improved performance. Code are available\n\\href{https://github.com/FMLYD/FSMLP}{\\textcolor{red}{here}}.\n","authors":["Zhengnan Li","Haoxuan Li","Hao Wang","Jun Fang","Duoyin Li Yunxiao Qin"],"pdf_url":"https://arxiv.org/pdf/2412.01654v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.00218v2","updated":"2024-12-03T04:38:31Z","published":"2024-11-29T19:25:00Z","title":"NüshuRescue: Revitalization of the endangered Nüshu Language with AI","summary":" The preservation and revitalization of endangered and extinct languages is a\nmeaningful endeavor, conserving cultural heritage while enriching fields like\nlinguistics and anthropology. However, these languages are typically\nlow-resource, making their reconstruction labor-intensive and costly. This\nchallenge is exemplified by N\\\"ushu, a rare script historically used by Yao\nwomen in China for self-expression within a patriarchal society. To address\nthis challenge, we introduce N\\\"ushuRescue, an AI-driven framework designed to\ntrain large language models (LLMs) on endangered languages with minimal data.\nN\\\"ushuRescue automates evaluation and expands target corpora to accelerate\nlinguistic revitalization. As a foundational component, we developed NCGold, a\n500-sentence N\\\"ushu-Chinese parallel corpus, the first publicly available\ndataset of its kind. 
Leveraging GPT-4-Turbo, with no prior exposure to N\\\"ushu\nand only 35 short examples from NCGold, N\\\"ushuRescue achieved 48.69\\%\ntranslation accuracy on 50 withheld sentences and generated NCSilver, a set of\n98 newly translated modern Chinese sentences of varying lengths. A sample of\nboth NCGold and NCSilver is included in the Supplementary Materials.\nAdditionally, we developed FastText-based and Seq2Seq models to further support\nresearch on N\\\"ushu. N\\\"ushuRescue provides a versatile and scalable tool for\nthe revitalization of endangered languages, minimizing the need for extensive\nhuman input.\n","authors":["Ivory Yang","Weicheng Ma","Soroush Vosoughi"],"pdf_url":"https://arxiv.org/pdf/2412.00218v2.pdf","comment":"Accepted to COLING 2025"},{"id":"http://arxiv.org/abs/2412.02161v1","updated":"2024-12-03T04:37:28Z","published":"2024-12-03T04:37:28Z","title":"Towards the efficacy of federated prediction for epidemics on networks","summary":" Epidemic prediction is of practical significance in public health, enabling\nearly intervention, resource allocation, and strategic planning. However,\nprivacy concerns often hinder the sharing of health data among institutions,\nlimiting the development of accurate prediction models. In this paper, we\ndevelop a general privacy-preserving framework for node-level epidemic\nprediction on networks based on federated learning (FL). We frame the\nspatio-temporal spread of epidemics across multiple data-isolated subnetworks,\nwhere each node state represents the aggregate epidemic severity within a\ncommunity. Then, both the pure temporal LSTM model and the spatio-temporal\nmodel i.e., Spatio-Temporal Graph Attention Network (STGAT) are proposed to\naddress the federated epidemic prediction. Extensive experiments are conducted\non various epidemic processes using a practical airline network, offering a\ncomprehensive assessment of FL efficacy under diverse scenarios. 
By introducing\nthe efficacy energy metric to measure system robustness under various client\nconfigurations, we systematically explore key factors influencing FL\nperformance, including client numbers, aggregation strategies, graph\npartitioning, missing infectious reports. Numerical results manifest that STGAT\nexcels in capturing spatio-temporal dependencies in dynamic processes whereas\nLSTM performs well in simpler pattern. Moreover, our findings highlight the\nimportance of balancing feature consistency and volume uniformity among\nclients, as well as the prediction dilemma between information richness and\nintrinsic stochasticity of dynamic processes. This study offers practical\ninsights into the efficacy of FL scenario in epidemic management, demonstrates\nthe potential of FL to address broader collective dynamics.\n","authors":["Chengpeng Fu","Tong Li","Hao Chen","Wen Du","Zhidong He"],"pdf_url":"https://arxiv.org/pdf/2412.02161v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.01269v2","updated":"2024-12-03T04:37:03Z","published":"2024-12-02T08:35:54Z","title":"CPRM: A LLM-based Continual Pre-training Framework for Relevance\n Modeling in Commercial Search","summary":" Relevance modeling between queries and items stands as a pivotal component in\ncommercial search engines, directly affecting the user experience. Given the\nremarkable achievements of large language models (LLMs) in various natural\nlanguage processing (NLP) tasks, LLM-based relevance modeling is gradually\nbeing adopted within industrial search systems. Nevertheless, foundational LLMs\nlack domain-specific knowledge and do not fully exploit the potential of\nin-context learning. Furthermore, structured item text remains underutilized,\nand there is a shortage in the supply of corresponding queries and background\nknowledge. We thereby propose CPRM (Continual Pre-training for Relevance\nModeling), a framework designed for the continual pre-training of LLMs to\naddress these issues. 
Our CPRM framework includes three modules: 1) employing\nboth queries and multi-field item to jointly pre-train for enhancing domain\nknowledge, 2) applying in-context pre-training, a novel approach where LLMs are\npre-trained on a sequence of related queries or items, and 3) conducting\nreading comprehension on items to produce associated domain knowledge and\nbackground information (e.g., generating summaries and corresponding queries)\nto further strengthen LLMs. Results on offline experiments and online A/B\ntesting demonstrate that our model achieves convincing performance compared to\nstrong baselines.\n","authors":["Kaixin Wu","Yixin Ji","Zeyuan Chen","Qiang Wang","Cunxiang Wang","Hong Liu","Baijun Ji","Jia Xu","Zhongyi Liu","Jinjie Gu","Yuan Zhou","Linjian Mo"],"pdf_url":"https://arxiv.org/pdf/2412.01269v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02159v1","updated":"2024-12-03T04:34:58Z","published":"2024-12-03T04:34:58Z","title":"Jailbreak Defense in a Narrow Domain: Limitations of Existing Methods\n and a New Transcript-Classifier Approach","summary":" Defending large language models against jailbreaks so that they never engage\nin a broadly-defined set of forbidden behaviors is an open problem. In this\npaper, we investigate the difficulty of jailbreak-defense when we only want to\nforbid a narrowly-defined set of behaviors. As a case study, we focus on\npreventing an LLM from helping a user make a bomb. We find that popular\ndefenses such as safety training, adversarial training, and input/output\nclassifiers are unable to fully solve this problem. In pursuit of a better\nsolution, we develop a transcript-classifier defense which outperforms the\nbaseline defenses we test. However, our classifier defense still fails in some\ncircumstances, which highlights the difficulty of jailbreak-defense even in a\nnarrow domain.\n","authors":["Tony T. 
Wang","John Hughes","Henry Sleight","Rylan Schaeffer","Rajashree Agrawal","Fazl Barez","Mrinank Sharma","Jesse Mu","Nir Shavit","Ethan Perez"],"pdf_url":"https://arxiv.org/pdf/2412.02159v1.pdf","comment":"Accepted to the AdvML-Frontiers and SoLaR workshops at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2412.02155v1","updated":"2024-12-03T04:29:27Z","published":"2024-12-03T04:29:27Z","title":"CausalMob: Causal Human Mobility Prediction with LLMs-derived Human\n Intentions toward Public Events","summary":" Large-scale human mobility exhibits spatial and temporal patterns that can\nassist policymakers in decision making. Although traditional prediction models\nattempt to capture these patterns, they often interfered by non-periodic public\nevents, such as disasters and occasional celebrations. Since regular human\nmobility patterns are heavily affected by these events, estimating their causal\neffects is critical to accurate mobility predictions. Although news articles\nprovide unique perspectives on these events in an unstructured format,\nprocessing is a challenge. In this study, we propose a causality-augmented\nprediction model, called \\textbf{CausalMob}, to analyze the causal effects of\npublic events. We first utilize large language models (LLMs) to extract human\nintentions from news articles and transform them into features that act as\ncausal treatments. Next, the model learns representations of spatio-temporal\nregional covariates from multiple data sources to serve as confounders for\ncausal inference. 
Finally, we present a causal effect estimation framework to\nensure event features remain independent of confounders during prediction.\nBased on large-scale real-world data, the experimental results show that the\nproposed model excels in human mobility prediction, outperforming\nstate-of-the-art models.\n","authors":["Xiaojie Yang","Hangli Ge","Jiawei Wang","Zipei Fan","Renhe Jiang","Ryosuke Shibasaki","Noboru Koshizuka"],"pdf_url":"https://arxiv.org/pdf/2412.02155v1.pdf","comment":"Accepted by KDD 2025"},{"id":"http://arxiv.org/abs/2412.02154v1","updated":"2024-12-03T04:28:58Z","published":"2024-12-03T04:28:58Z","title":"Failure Probability Estimation for Black-Box Autonomous Systems using\n State-Dependent Importance Sampling Proposals","summary":" Estimating the probability of failure is a critical step in developing\nsafety-critical autonomous systems. Direct estimation methods such as Monte\nCarlo sampling are often impractical due to the rarity of failures in these\nsystems. Existing importance sampling approaches do not scale to sequential\ndecision-making systems with large state spaces and long horizons. We propose\nan adaptive importance sampling algorithm to address these limitations. Our\nmethod minimizes the forward Kullback-Leibler divergence between a\nstate-dependent proposal distribution and a relaxed form of the optimal\nimportance sampling distribution. Our method uses Markov score ascent methods\nto estimate this objective. We evaluate our approach on four sequential systems\nand show that it provides more accurate failure probability estimates than\nbaseline Monte Carlo and importance sampling techniques. This work is open\nsourced.\n","authors":["Harrison Delecki","Sydney M. Katz","Mykel J. 
Kochenderfer"],"pdf_url":"https://arxiv.org/pdf/2412.02154v1.pdf","comment":"Submitted to L4DC 2025"},{"id":"http://arxiv.org/abs/2412.02153v1","updated":"2024-12-03T04:28:14Z","published":"2024-12-03T04:28:14Z","title":"Revisiting the Initial Steps in Adaptive Gradient Descent Optimization","summary":" Adaptive gradient optimization methods, such as Adam, are prevalent in\ntraining deep neural networks across diverse machine learning tasks due to\ntheir ability to achieve faster convergence. However, these methods often\nsuffer from suboptimal generalization compared to stochastic gradient descent\n(SGD) and exhibit instability, particularly when training Transformer models.\nIn this work, we show the standard initialization of the second-order moment\nestimation ($v_0 =0$) as a significant factor contributing to these\nlimitations. We introduce simple yet effective solutions: initializing the\nsecond-order moment estimation with non-zero values, using either data-driven\nor random initialization strategies. Empirical evaluations demonstrate that our\napproach not only stabilizes convergence but also enhances the final\nperformance of adaptive gradient optimizers. Furthermore, by adopting the\nproposed initialization strategies, Adam achieves performance comparable to\nmany recently proposed variants of adaptive gradient optimization methods,\nhighlighting the practical impact of this straightforward modification.\n","authors":["Abulikemu Abuduweili","Changliu Liu"],"pdf_url":"https://arxiv.org/pdf/2412.02153v1.pdf","comment":"OPT workshop at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2406.08666v2","updated":"2024-12-03T04:22:40Z","published":"2024-06-12T22:12:03Z","title":"Interventional Causal Discovery in a Mixture of DAGs","summary":" Causal interactions among a group of variables are often modeled by a single\ncausal graph. 
In some domains, however, these interactions are best described\nby multiple co-existing causal graphs, e.g., in dynamical systems or genomics.\nThis paper addresses the hitherto unknown role of interventions in learning\ncausal interactions among variables governed by a mixture of causal systems,\neach modeled by one directed acyclic graph (DAG). Causal discovery from\nmixtures is fundamentally more challenging than single-DAG causal discovery.\nTwo major difficulties stem from (i)~an inherent uncertainty about the\nskeletons of the component DAGs that constitute the mixture and (ii)~possibly\ncyclic relationships across these component DAGs. This paper addresses these\nchallenges and aims to identify edges that exist in at least one component DAG\nof the mixture, referred to as the true edges. First, it establishes matching\nnecessary and sufficient conditions on the size of interventions required to\nidentify the true edges. Next, guided by the necessity results, an adaptive\nalgorithm is designed that learns all true edges using $O(n^2)$ interventions,\nwhere $n$ is the number of nodes. Remarkably, the size of the interventions is\noptimal if the underlying mixture model does not contain cycles across its\ncomponents. More generally, the gap between the intervention size used by the\nalgorithm and the optimal size is quantified. 
It is shown to be bounded by the\ncyclic complexity number of the mixture model, defined as the size of the\nminimal intervention that can break the cycles in the mixture, which is upper\nbounded by the number of cycles among the ancestors of a node.\n","authors":["Burak Varıcı","Dmitriy Katz-Rogozhnikov","Dennis Wei","Prasanna Sattigeri","Ali Tajer"],"pdf_url":"https://arxiv.org/pdf/2406.08666v2.pdf","comment":"NeurIPS 2024 camera-ready version"},{"id":"http://arxiv.org/abs/2412.00648v2","updated":"2024-12-03T04:14:31Z","published":"2024-12-01T02:55:08Z","title":"DFRot: Achieving Outlier-Free and Massive Activation-Free for Rotated\n LLMs with Refined Rotation","summary":" Rotating the activation and weight matrices to reduce the influence of\noutliers in large language models (LLMs) has recently attracted significant\nattention, particularly in the context of model quantization. Prior studies\nhave shown that in low-precision quantization scenarios, such as 4-bit weights\nand 4-bit activations (W4A4), randomized Hadamard transforms can achieve\nsignificantly higher accuracy than randomized orthogonal transforms. Notably,\nthe reason behind this phenomena remains unknown. In this paper, we find that\nthese transformations show substantial improvement in eliminating outliers for\ncommon tokens and achieve similar quantization error. The primary reason for\nthe accuracy difference lies in the fact that randomized Hadamard transforms\ncan slightly reduce the quantization error for tokens with massive activations\nwhile randomized orthogonal transforms increase the quantization error. Due to\nthe extreme rarity of these tokens and their critical impact on model accuracy,\nwe consider this a long-tail optimization problem, and therefore construct a\nsimple yet effective method: a weighted loss function. 
Additionally, we propose\nan optimization strategy for the rotation matrix that involves alternating\noptimization of quantization parameters while employing orthogonal Procrustes\ntransforms to refine the rotation matrix. This makes the distribution of the\nrotated activation values more conducive to quantization, especially for tokens\nwith massive activations. Our method enhances the Rotated LLMs by achieving\ndual free, Outlier-Free and Massive Activation-Free, dubbed as DFRot. Extensive\nexperiments demonstrate the effectiveness and efficiency of DFRot. By tuning\nthe rotation matrix using just a single sample, DFRot achieves a perplexity\nimprovement of 0.25 and 0.21 on W4A4KV4 and W4A4KV16, respectively, for\nLLaMA3-8B, a model known for its quantization challenges.\n","authors":["Jingyang Xiang","Sai Qian Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.00648v2.pdf","comment":"24 pages, 38 figures, source code\n \\url{https://github.com/JingyangXiang/DFRot}"},{"id":"http://arxiv.org/abs/2407.00382v4","updated":"2024-12-03T04:07:32Z","published":"2024-06-29T09:35:12Z","title":"Towards Universal Mesh Movement Networks","summary":" Solving complex Partial Differential Equations (PDEs) accurately and\nefficiently is an essential and challenging problem in all scientific and\nengineering disciplines. Mesh movement methods provide the capability to\nimprove the accuracy of the numerical solution without increasing the overall\nmesh degree of freedom count. Conventional sophisticated mesh movement methods\nare extremely expensive and struggle to handle scenarios with complex boundary\ngeometries. However, existing learning-based methods require re-training from\nscratch given a different PDE type or boundary geometry, which limits their\napplicability, and also often suffer from robustness issues in the form of\ninverted elements. 
In this paper, we introduce the Universal Mesh Movement\nNetwork (UM2N), which -- once trained -- can be applied in a non-intrusive,\nzero-shot manner to move meshes with different size distributions and\nstructures, for solvers applicable to different PDE types and boundary\ngeometries. UM2N consists of a Graph Transformer (GT) encoder for extracting\nfeatures and a Graph Attention Network (GAT) based decoder for moving the mesh.\nWe evaluate our method on advection and Navier-Stokes based examples, as well\nas a real-world tsunami simulation case. Our method outperforms existing\nlearning-based mesh movement methods in terms of the benchmarks described\nabove. In comparison to the conventional sophisticated Monge-Amp\\`ere\nPDE-solver based method, our approach not only significantly accelerates mesh\nmovement, but also proves effective in scenarios where the conventional method\nfails. Our project page is at https://erizmr.github.io/UM2N/.\n","authors":["Mingrui Zhang","Chunyang Wang","Stephan Kramer","Joseph G. Wallwork","Siyi Li","Jiancheng Liu","Xiang Chen","Matthew D. Piggott"],"pdf_url":"https://arxiv.org/pdf/2407.00382v4.pdf","comment":"Accepted at NeurIPS 2024 as a spotlight paper"},{"id":"http://arxiv.org/abs/2412.02140v1","updated":"2024-12-03T03:56:01Z","published":"2024-12-03T03:56:01Z","title":"SparseGrasp: Robotic Grasping via 3D Semantic Gaussian Splatting from\n Sparse Multi-View RGB Images","summary":" Language-guided robotic grasping is a rapidly advancing field where robots\nare instructed using human language to grasp specific objects. However,\nexisting methods often depend on dense camera views and struggle to quickly\nupdate scenes, limiting their effectiveness in changeable environments.\n In contrast, we propose SparseGrasp, a novel open-vocabulary robotic grasping\nsystem that operates efficiently with sparse-view RGB images and handles scene\nupdates fastly. 
Our system builds upon and significantly enhances existing\ncomputer vision modules in robotic learning. Specifically, SparseGrasp utilizes\nDUSt3R to generate a dense point cloud as the initialization for 3D Gaussian\nSplatting (3DGS), maintaining high fidelity even under sparse supervision.\nImportantly, SparseGrasp incorporates semantic awareness from recent vision\nfoundation models. To further improve processing efficiency, we repurpose\nPrincipal Component Analysis (PCA) to compress features from 2D models.\nAdditionally, we introduce a novel render-and-compare strategy that ensures\nrapid scene updates, enabling multi-turn grasping in changeable environments.\n Experimental results show that SparseGrasp significantly outperforms\nstate-of-the-art methods in terms of both speed and adaptability, providing a\nrobust solution for multi-turn grasping in changeable environment.\n","authors":["Junqiu Yu","Xinlin Ren","Yongchong Gu","Haitao Lin","Tianyu Wang","Yi Zhu","Hang Xu","Yu-Gang Jiang","Xiangyang Xue","Yanwei Fu"],"pdf_url":"https://arxiv.org/pdf/2412.02140v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02912v1","updated":"2024-12-03T23:37:47Z","published":"2024-12-03T23:37:47Z","title":"ShapeWords: Guiding Text-to-Image Synthesis with 3D Shape-Aware Prompts","summary":" We introduce ShapeWords, an approach for synthesizing images based on 3D\nshape guidance and text prompts. ShapeWords incorporates target 3D shape\ninformation within specialized tokens embedded together with the input text,\neffectively blending 3D shape awareness with textual context to guide the image\nsynthesis process. 
Unlike conventional shape guidance methods that rely on\ndepth maps restricted to fixed viewpoints and often overlook full 3D structure\nor textual context, ShapeWords generates diverse yet consistent images that\nreflect both the target shape's geometry and the textual description.\nExperimental results show that ShapeWords produces images that are more\ntext-compliant, aesthetically plausible, while also maintaining 3D shape\nawareness.\n","authors":["Dmitry Petrov","Pradyumn Goyal","Divyansh Shivashok","Yuanming Tao","Melinos Averkiou","Evangelos Kalogerakis"],"pdf_url":"https://arxiv.org/pdf/2412.02912v1.pdf","comment":"Project webpage: https://lodurality.github.io/shapewords/"},{"id":"http://arxiv.org/abs/2405.00820v3","updated":"2024-12-03T23:30:43Z","published":"2024-05-01T19:02:18Z","title":"HLSFactory: A Framework Empowering High-Level Synthesis Datasets for\n Machine Learning and Beyond","summary":" Machine learning (ML) techniques have been applied to high-level synthesis\n(HLS) flows for quality-of-result (QoR) prediction and design space exploration\n(DSE). Nevertheless, the scarcity of accessible high-quality HLS datasets and\nthe complexity of building such datasets present challenges. Existing datasets\nhave limitations in terms of benchmark coverage, design space enumeration,\nvendor extensibility, or lack of reproducible and extensible software for\ndataset construction. Many works also lack user-friendly ways to add more\ndesigns, limiting wider adoption of such datasets. 
In response to these\nchallenges, we introduce HLSFactory, a comprehensive framework designed to\nfacilitate the curation and generation of high-quality HLS design datasets.\nHLSFactory has three main stages: 1) a design space expansion stage to\nelaborate single HLS designs into large design spaces using various\noptimization directives across multiple vendor tools, 2) a design synthesis\nstage to execute HLS and FPGA tool flows concurrently across designs, and 3) a\ndata aggregation stage for extracting standardized data into packaged datasets\nfor ML usage. This tripartite architecture ensures broad design space coverage\nvia design space expansion and supports multiple vendor tools. Users can\ncontribute to each stage with their own HLS designs and synthesis results and\nextend the framework itself with custom frontends and tool flows. We also\ninclude an initial set of built-in designs from common HLS benchmarks curated\nopen-source HLS designs. We showcase the versatility and multi-functionality of\nour framework through seven case studies: I) ML model for QoR prediction; II)\nDesign space sampling; III) Fine-grained parallelism backend speedup; IV)\nTargeting Intel's HLS flow; V) Adding new auxiliary designs; VI) Integrating\npublished HLS data; VII) HLS tool version regression benchmarking.\n","authors":["Stefan Abi-Karam","Rishov Sarkar","Allison Seigler","Sean Lowe","Zhigang Wei","Hanqiu Chen","Nanditha Rao","Lizy John","Aman Arora","Cong Hao"],"pdf_url":"https://arxiv.org/pdf/2405.00820v3.pdf","comment":"MLCAD 2024 version of the paper. New case study with ML QoR\n prediction. 
Artifact evaluation details included"}],"Multimedia":[{"id":"http://arxiv.org/abs/2412.02611v1","updated":"2024-12-03T17:41:23Z","published":"2024-12-03T17:41:23Z","title":"AV-Odyssey Bench: Can Your Multimodal LLMs Really Understand\n Audio-Visual Information?","summary":" Recently, multimodal large language models (MLLMs), such as GPT-4o, Gemini\n1.5 Pro, and Reka Core, have expanded their capabilities to include vision and\naudio modalities. While these models demonstrate impressive performance across\na wide range of audio-visual applications, our proposed DeafTest reveals that\nMLLMs often struggle with simple tasks humans find trivial: 1) determining\nwhich of two sounds is louder, and 2) determining which of two sounds has a\nhigher pitch. Motivated by these observations, we introduce AV-Odyssey Bench, a\ncomprehensive audio-visual benchmark designed to assess whether those MLLMs can\ntruly understand the audio-visual information. This benchmark encompasses 4,555\ncarefully crafted problems, each incorporating text, visual, and audio\ncomponents. To successfully infer answers, models must effectively leverage\nclues from both visual and audio inputs. To ensure precise and objective\nevaluation of MLLM responses, we have structured the questions as\nmultiple-choice, eliminating the need for human evaluation or LLM-assisted\nassessment. We benchmark a series of closed-source and open-source models and\nsummarize the observations. 
By revealing the limitations of current models, we\naim to provide useful insight for future dataset collection and model\ndevelopment.\n","authors":["Kaixiong Gong","Kaituo Feng","Bohao Li","Yibing Wang","Mofan Cheng","Shijia Yang","Jiaming Han","Benyou Wang","Yutong Bai","Zhuoran Yang","Xiangyu Yue"],"pdf_url":"https://arxiv.org/pdf/2412.02611v1.pdf","comment":"Project page: https://av-odyssey.github.io/"},{"id":"http://arxiv.org/abs/2412.02575v1","updated":"2024-12-03T17:02:40Z","published":"2024-12-03T17:02:40Z","title":"Copy-Move Forgery Detection and Question Answering for Remote Sensing\n Image","summary":" This paper introduces the task of Remote Sensing Copy-Move Question Answering\n(RSCMQA). Unlike traditional Remote Sensing Visual Question Answering (RSVQA),\nRSCMQA focuses on interpreting complex tampering scenarios and inferring\nrelationships between objects. Based on the practical needs of national defense\nsecurity and land resource monitoring, we have developed an accurate and\ncomprehensive global dataset for remote sensing image copy-move question\nanswering, named RS-CMQA-2.1M. These images were collected from 29 different\nregions across 14 countries. Additionally, we have refined a balanced dataset,\nRS-CMQA-B, to address the long-standing issue of long-tail data in the remote\nsensing field. Furthermore, we propose a region-discriminative guided\nmultimodal CMQA model, which enhances the accuracy of answering questions about\ntampered images by leveraging prompt about the differences and connections\nbetween the source and tampered domains. Extensive experiments demonstrate that\nour method provides a stronger benchmark for RS-CMQA compared to general VQA\nand RSVQA models. 
Our dataset and code are available at\nhttps://github.com/shenyedepisa/RSCMQA.\n","authors":["Ze Zhang","Enyuan Zhao","Ziyi Wan","Jie Nie","Xinyue Liang","Lei Huang"],"pdf_url":"https://arxiv.org/pdf/2412.02575v1.pdf","comment":"7 figs, 7 tables"},{"id":"http://arxiv.org/abs/2412.02419v1","updated":"2024-12-03T12:31:44Z","published":"2024-12-03T12:31:44Z","title":"It Takes Two: Real-time Co-Speech Two-person's Interaction Generation\n via Reactive Auto-regressive Diffusion Model","summary":" Conversational scenarios are very common in real-world settings, yet existing\nco-speech motion synthesis approaches often fall short in these contexts, where\none person's audio and gestures will influence the other's responses.\nAdditionally, most existing methods rely on offline sequence-to-sequence\nframeworks, which are unsuitable for online applications. In this work, we\nintroduce an audio-driven, auto-regressive system designed to synthesize\ndynamic movements for two characters during a conversation. At the core of our\napproach is a diffusion-based full-body motion synthesis model, which is\nconditioned on the past states of both characters, speech audio, and a\ntask-oriented motion trajectory input, allowing for flexible spatial control.\nTo enhance the model's ability to learn diverse interactions, we have enriched\nexisting two-person conversational motion datasets with more dynamic and\ninteractive motions. We evaluate our system through multiple experiments to\nshow it outperforms across a variety of tasks, including single and two-person\nco-speech motion generation, as well as interactive motion generation. 
To the\nbest of our knowledge, this is the first system capable of generating\ninteractive full-body motions for two characters from speech in an online\nmanner.\n","authors":["Mingyi Shi","Dafei Qin","Leo Ho","Zhouyingcheng Liao","Yinghao Huang","Junichi Yamagishi","Taku Komura"],"pdf_url":"https://arxiv.org/pdf/2412.02419v1.pdf","comment":"15 pages, 10 figures"},{"id":"http://arxiv.org/abs/2409.08489v2","updated":"2024-12-03T23:17:44Z","published":"2024-09-13T02:32:10Z","title":"Resource-Efficient Reference-Free Evaluation of Audio Captions","summary":" To establish the trustworthiness of systems that automatically generate text\ncaptions for audio, images and video, existing reference-free metrics rely on\nlarge pretrained models which are impractical to accommodate in\nresource-constrained settings. To address this, we propose some metrics to\nelicit the model's confidence in its own generation. To assess how well these\nmetrics replace correctness measures that leverage reference captions, we test\ntheir calibration with correctness measures. We discuss why some of these\nconfidence metrics align better with certain correctness measures. Further, we\nprovide insight into why temperature scaling of confidence metrics is\neffective. Our main contribution is a suite of well-calibrated lightweight\nconfidence metrics for reference-free evaluation of captions in\nresource-constrained settings.\n","authors":["Rehana Mahfuz","Yinyi Guo","Erik Visser"],"pdf_url":"https://arxiv.org/pdf/2409.08489v2.pdf","comment":null}]},"2024-12-04T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2406.09400v2","updated":"2024-12-04T18:59:56Z","published":"2024-06-13T17:59:29Z","title":"Yo'LLaVA: Your Personalized Language and Vision Assistant","summary":" Large Multimodal Models (LMMs) have shown remarkable capabilities across a\nvariety of tasks (e.g., image captioning, visual question answering). 
While\nbroad, their knowledge remains generic (e.g., recognizing a dog), and they are\nunable to handle personalized subjects (e.g., recognizing a user's pet dog).\nHuman reasoning, in contrast, typically operates within the context of specific\nsubjects in our surroundings. For example, one might ask, \"What should I buy\nfor my dog's birthday?\"; as opposed to a generic inquiry about \"What should I\nbuy for a dog's birthday?\". Similarly, when looking at a friend's image, the\ninterest lies in seeing their activities (e.g., \"my friend is holding a cat\"),\nrather than merely observing generic human actions (e.g., \"a man is holding a\ncat\"). In this paper, we introduce the novel task of personalizing LMMs, so\nthat they can have conversations about a specific subject. We propose Yo'LLaVA,\nwhich learns to embed a personalized subject into a set of latent tokens given\na handful of example images of the subject. Our qualitative and quantitative\nanalyses reveal that Yo'LLaVA can learn the concept more efficiently using\nfewer tokens and more effectively encode the visual attributes compared to\nstrong prompting baselines (e.g., LLaVA).\n","authors":["Thao Nguyen","Haotian Liu","Yuheng Li","Mu Cai","Utkarsh Ojha","Yong Jae Lee"],"pdf_url":"https://arxiv.org/pdf/2406.09400v2.pdf","comment":"NeurIPS 2024; Project page: https://thaoshibe.github.io/YoLLaVA"},{"id":"http://arxiv.org/abs/2412.03572v1","updated":"2024-12-04T18:59:45Z","published":"2024-12-04T18:59:45Z","title":"Navigation World Models","summary":" Navigation is a fundamental skill of agents with visual-motor capabilities.\nWe introduce a Navigation World Model (NWM), a controllable video generation\nmodel that predicts future visual observations based on past observations and\nnavigation actions. 
To capture complex environment dynamics, NWM employs a\nConditional Diffusion Transformer (CDiT), trained on a diverse collection of\negocentric videos of both human and robotic agents, and scaled up to 1 billion\nparameters. In familiar environments, NWM can plan navigation trajectories by\nsimulating them and evaluating whether they achieve the desired goal. Unlike\nsupervised navigation policies with fixed behavior, NWM can dynamically\nincorporate constraints during planning. Experiments demonstrate its\neffectiveness in planning trajectories from scratch or by ranking trajectories\nsampled from an external policy. Furthermore, NWM leverages its learned visual\npriors to imagine trajectories in unfamiliar environments from a single input\nimage, making it a flexible and powerful tool for next-generation navigation\nsystems.\n","authors":["Amir Bar","Gaoyue Zhou","Danny Tran","Trevor Darrell","Yann LeCun"],"pdf_url":"https://arxiv.org/pdf/2412.03572v1.pdf","comment":"project page: https://www.amirbar.net/nwm/"},{"id":"http://arxiv.org/abs/2412.03571v1","updated":"2024-12-04T18:59:38Z","published":"2024-12-04T18:59:38Z","title":"Style3D: Attention-guided Multi-view Style Transfer for 3D Object\n Generation","summary":" We present Style3D, a novel approach for generating stylized 3D objects from\na content image and a style image. Unlike most previous methods that require\ncase- or style-specific training, Style3D supports instant 3D object\nstylization. Our key insight is that 3D object stylization can be decomposed\ninto two interconnected processes: multi-view dual-feature alignment and\nsparse-view spatial reconstruction. We introduce MultiFusion Attention, an\nattention-guided technique to achieve multi-view stylization from the\ncontent-style pair. Specifically, the query features from the content image\npreserve geometric consistency across multiple views, while the key and value\nfeatures from the style image are used to guide the stylistic transfer. 
This\ndual-feature alignment ensures that spatial coherence and stylistic fidelity\nare maintained across multi-view images. Finally, a large 3D reconstruction\nmodel is introduced to generate coherent stylized 3D objects. By establishing\nan interplay between structural and stylistic features across multiple views,\nour approach enables a holistic 3D stylization process. Extensive experiments\ndemonstrate that Style3D offers a more flexible and scalable solution for\ngenerating style-consistent 3D assets, surpassing existing methods in both\ncomputational efficiency and visual quality.\n","authors":["Bingjie Song","Xin Huang","Ruting Xie","Xue Wang","Qing Wang"],"pdf_url":"https://arxiv.org/pdf/2412.03571v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03570v1","updated":"2024-12-04T18:59:24Z","published":"2024-12-04T18:59:24Z","title":"Sparse-view Pose Estimation and Reconstruction via Analysis by\n Generative Synthesis","summary":" Inferring the 3D structure underlying a set of multi-view images typically\nrequires solving two co-dependent tasks -- accurate 3D reconstruction requires\nprecise camera poses, and predicting camera poses relies on (implicitly or\nexplicitly) modeling the underlying 3D. The classical framework of analysis by\nsynthesis casts this inference as a joint optimization seeking to explain the\nobserved pixels, and recent instantiations learn expressive 3D representations\n(e.g., Neural Fields) with gradient-descent-based pose refinement of initial\npose estimates. However, given a sparse set of observed views, the observations\nmay not provide sufficient direct evidence to obtain complete and accurate 3D.\nMoreover, large errors in pose estimation may not be easily corrected and can\nfurther degrade the inferred 3D. 
To allow robust 3D reconstruction and pose\nestimation in this challenging setup, we propose SparseAGS, a method that\nadapts this analysis-by-synthesis approach by: a) including\nnovel-view-synthesis-based generative priors in conjunction with photometric\nobjectives to improve the quality of the inferred 3D, and b) explicitly\nreasoning about outliers and using a discrete search with a continuous\noptimization-based strategy to correct them. We validate our framework across\nreal-world and synthetic datasets in combination with several off-the-shelf\npose estimation systems as initialization. We find that it significantly\nimproves the base systems' pose accuracy while yielding high-quality 3D\nreconstructions that outperform the results from current multi-view\nreconstruction baselines.\n","authors":["Qitao Zhao","Shubham Tulsiani"],"pdf_url":"https://arxiv.org/pdf/2412.03570v1.pdf","comment":"NeurIPS 2024. Project website: https://qitaozhao.github.io/SparseAGS"},{"id":"http://arxiv.org/abs/2412.03567v1","updated":"2024-12-04T18:58:27Z","published":"2024-12-04T18:58:27Z","title":"Streaming Detection of Queried Event Start","summary":" Robotics, autonomous driving, augmented reality, and many embodied computer\nvision applications must quickly react to user-defined events unfolding in real\ntime. We address this setting by proposing a novel task for multimodal video\nunderstanding-Streaming Detection of Queried Event Start (SDQES). The goal of\nSDQES is to identify the beginning of a complex event as described by a natural\nlanguage query, with high accuracy and low latency. We introduce a new\nbenchmark based on the Ego4D dataset, as well as new task-specific metrics to\nstudy streaming multimodal detection of diverse events in an egocentric video\nsetting. Inspired by parameter-efficient fine-tuning methods in NLP and for\nvideo tasks, we propose adapter-based baselines that enable image-to-video\ntransfer learning, allowing for efficient online video modeling. 
We evaluate\nthree vision-language backbones and three adapter architectures on both\nshort-clip and untrimmed video settings.\n","authors":["Cristobal Eyzaguirre","Eric Tang","Shyamal Buch","Adrien Gaidon","Jiajun Wu","Juan Carlos Niebles"],"pdf_url":"https://arxiv.org/pdf/2412.03567v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03566v1","updated":"2024-12-04T18:58:21Z","published":"2024-12-04T18:58:21Z","title":"FreeSim: Toward Free-viewpoint Camera Simulation in Driving Scenes","summary":" We propose FreeSim, a camera simulation method for autonomous driving.\nFreeSim emphasizes high-quality rendering from viewpoints beyond the recorded\nego trajectories. In such viewpoints, previous methods have unacceptable\ndegradation because the training data of these viewpoints is unavailable. To\naddress such data scarcity, we first propose a generative enhancement model\nwith a matched data construction strategy. The resulting model can generate\nhigh-quality images in a viewpoint slightly deviated from the recorded\ntrajectories, conditioned on the degraded rendering of this viewpoint. 
We then\npropose a progressive reconstruction strategy, which progressively adds\ngenerated images of unrecorded views into the reconstruction process, starting\nfrom slightly off-trajectory viewpoints and moving progressively farther away.\nWith this progressive generation-reconstruction pipeline, FreeSim supports\nhigh-quality off-trajectory view synthesis under large deviations of more than\n3 meters.\n","authors":["Lue Fan","Hao Zhang","Qitai Wang","Hongsheng Li","Zhaoxiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.03566v1.pdf","comment":"Project page: https://drive-sim.github.io/freesim"},{"id":"http://arxiv.org/abs/2412.03565v1","updated":"2024-12-04T18:58:10Z","published":"2024-12-04T18:58:10Z","title":"Inst-IT: Boosting Multimodal Instance Understanding via Explicit Visual\n Prompt Instruction Tuning","summary":" Large Multimodal Models (LMMs) have made significant breakthroughs with the\nadvancement of instruction tuning. However, while existing models can\nunderstand images and videos at a holistic level, they still struggle with\ninstance-level understanding that requires a more nuanced comprehension and\nalignment. Instance-level understanding is crucial, as it focuses on the\nspecific elements that we are most interested in. Excitingly, existing works\nfind that the state-of-the-art LMMs exhibit strong instance understanding\ncapabilities when provided with explicit visual cues. Motivated by this, we\nintroduce an automated annotation pipeline assisted by GPT-4o to extract\ninstance-level information from images and videos through explicit visual\nprompting for instance guidance. Building upon this pipeline, we proposed\nInst-IT, a solution to enhance LMMs in Instance understanding via explicit\nvisual prompt Instruction Tuning. 
Inst-IT consists of a benchmark to diagnose\nmultimodal instance-level understanding, a large-scale instruction-tuning\ndataset, and a continuous instruction-tuning training paradigm to effectively\nenhance spatial-temporal instance understanding capabilities of existing LMMs.\nExperimental results show that, with the boost of Inst-IT, our models not only\nachieve outstanding performance on Inst-IT Bench but also demonstrate\nsignificant improvements across various generic image and video understanding\nbenchmarks. This highlights that our dataset not only boosts instance-level\nunderstanding but also strengthens the overall capabilities of generic image\nand video comprehension.\n","authors":["Wujian Peng","Lingchen Meng","Yitong Chen","Yiweng Xie","Yang Liu","Tao Gui","Hang Xu","Xipeng Qiu","Zuxuan Wu","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2412.03565v1.pdf","comment":"Project page at https://inst-it.github.io"},{"id":"http://arxiv.org/abs/2412.03561v1","updated":"2024-12-04T18:56:04Z","published":"2024-12-04T18:56:04Z","title":"FLAIR: VLM with Fine-grained Language-informed Image Representations","summary":" CLIP has shown impressive results in aligning images and texts at scale.\nHowever, its ability to capture detailed visual features remains limited\nbecause CLIP matches images and texts at a global level. To address this issue,\nwe propose FLAIR, Fine-grained Language-informed Image Representations, an\napproach that utilizes long and detailed image descriptions to learn localized\nimage embeddings. By sampling diverse sub-captions that describe fine-grained\ndetails about an image, we train our vision-language model to produce not only\nglobal embeddings but also text-specific image representations. Our model\nintroduces text-conditioned attention pooling on top of local image tokens to\nproduce fine-grained image representations that excel at retrieving detailed\nimage content. 
We achieve state-of-the-art performance on both, existing\nmultimodal retrieval benchmarks, as well as, our newly introduced fine-grained\nretrieval task which evaluates vision-language models' ability to retrieve\npartial image content. Furthermore, our experiments demonstrate the\neffectiveness of FLAIR trained on 30M image-text pairs in capturing\nfine-grained visual information, including zero-shot semantic segmentation,\noutperforming models trained on billions of pairs. Code is available at\nhttps://github.com/ExplainableML/flair .\n","authors":["Rui Xiao","Sanghwan Kim","Mariana-Iuliana Georgescu","Zeynep Akata","Stephan Alaniz"],"pdf_url":"https://arxiv.org/pdf/2412.03561v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03558v1","updated":"2024-12-04T18:52:40Z","published":"2024-12-04T18:52:40Z","title":"MIDI: Multi-Instance Diffusion for Single Image to 3D Scene Generation","summary":" This paper introduces MIDI, a novel paradigm for compositional 3D scene\ngeneration from a single image. Unlike existing methods that rely on\nreconstruction or retrieval techniques or recent approaches that employ\nmulti-stage object-by-object generation, MIDI extends pre-trained image-to-3D\nobject generation models to multi-instance diffusion models, enabling the\nsimultaneous generation of multiple 3D instances with accurate spatial\nrelationships and high generalizability. At its core, MIDI incorporates a novel\nmulti-instance attention mechanism, that effectively captures inter-object\ninteractions and spatial coherence directly within the generation process,\nwithout the need for complex multi-step processes. The method utilizes partial\nobject images and global scene context as inputs, directly modeling object\ncompletion during 3D generation. 
During training, we effectively supervise the\ninteractions between 3D instances using a limited amount of scene-level data,\nwhile incorporating single-object data for regularization, thereby maintaining\nthe pre-trained generalization ability. MIDI demonstrates state-of-the-art\nperformance in image-to-scene generation, validated through evaluations on\nsynthetic data, real-world scene data, and stylized scene images generated by\ntext-to-image diffusion models.\n","authors":["Zehuan Huang","Yuan-Chen Guo","Xingqiao An","Yunhan Yang","Yangguang Li","Zi-Xin Zou","Ding Liang","Xihui Liu","Yan-Pei Cao","Lu Sheng"],"pdf_url":"https://arxiv.org/pdf/2412.03558v1.pdf","comment":"Project page: https://huanngzh.github.io/MIDI-Page/"},{"id":"http://arxiv.org/abs/2412.03555v1","updated":"2024-12-04T18:50:42Z","published":"2024-12-04T18:50:42Z","title":"PaliGemma 2: A Family of Versatile VLMs for Transfer","summary":" PaliGemma 2 is an upgrade of the PaliGemma open Vision-Language Model (VLM)\nbased on the Gemma 2 family of language models. We combine the SigLIP-So400m\nvision encoder that was also used by PaliGemma with the whole range of Gemma 2\nmodels, from the 2B one all the way up to the 27B model. We train these models\nat three resolutions (224px, 448px, and 896px) in multiple stages to equip them\nwith broad knowledge for transfer via fine-tuning. The resulting family of base\nmodels covering different model sizes and resolutions allows us to investigate\nfactors impacting transfer performance (such as learning rate) and to analyze\nthe interplay between the type of task, model size, and resolution. 
We further\nincrease the number and breadth of transfer tasks beyond the scope of PaliGemma\nincluding different OCR-related tasks such as table structure recognition,\nmolecular structure recognition, music score recognition, as well as long\nfine-grained captioning and radiography report generation, on which PaliGemma 2\nobtains state-of-the-art results.\n","authors":["Andreas Steiner","André Susano Pinto","Michael Tschannen","Daniel Keysers","Xiao Wang","Yonatan Bitton","Alexey Gritsenko","Matthias Minderer","Anthony Sherbondy","Shangbang Long","Siyang Qin","Reeve Ingle","Emanuele Bugliarello","Sahar Kazemzadeh","Thomas Mesnard","Ibrahim Alabdulmohsin","Lucas Beyer","Xiaohua Zhai"],"pdf_url":"https://arxiv.org/pdf/2412.03555v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03552v1","updated":"2024-12-04T18:50:08Z","published":"2024-12-04T18:50:08Z","title":"Imagine360: Immersive 360 Video Generation from Perspective Anchor","summary":" $360^\\circ$ videos offer a hyper-immersive experience that allows the viewers\nto explore a dynamic scene from full 360 degrees. To achieve more user-friendly\nand personalized content creation in $360^\\circ$ video format, we seek to lift\nstandard perspective videos into $360^\\circ$ equirectangular videos. To this\nend, we introduce Imagine360, the first perspective-to-$360^\\circ$ video\ngeneration framework that creates high-quality $360^\\circ$ videos with rich and\ndiverse motion patterns from video anchors. Imagine360 learns fine-grained\nspherical visual and motion patterns from limited $360^\\circ$ video data with\nseveral key designs. 1) Firstly we adopt the dual-branch design, including a\nperspective and a panorama video denoising branch to provide local and global\nconstraints for $360^\\circ$ video generation, with motion module and spatial\nLoRA layers fine-tuned on extended web $360^\\circ$ videos. 
2) Additionally, an\nantipodal mask is devised to capture long-range motion dependencies, enhancing\nthe reversed camera motion between antipodal pixels across hemispheres. 3) To\nhandle diverse perspective video inputs, we propose elevation-aware designs\nthat adapt to varying video masking due to changing elevations across frames.\nExtensive experiments show Imagine360 achieves superior graphics quality and\nmotion coherence among state-of-the-art $360^\\circ$ video generation methods.\nWe believe Imagine360 holds promise for advancing personalized, immersive\n$360^\\circ$ video creation.\n","authors":["Jing Tan","Shuai Yang","Tong Wu","Jingwen He","Yuwei Guo","Ziwei Liu","Dahua Lin"],"pdf_url":"https://arxiv.org/pdf/2412.03552v1.pdf","comment":"Project page: https://ys-imtech.github.io/projects/Imagine360"},{"id":"http://arxiv.org/abs/2412.03548v1","updated":"2024-12-04T18:45:35Z","published":"2024-12-04T18:45:35Z","title":"Perception Tokens Enhance Visual Reasoning in Multimodal Language Models","summary":" Multimodal language models (MLMs) still face challenges in fundamental visual\nperception tasks where specialized models excel. Tasks requiring reasoning\nabout 3D structures benefit from depth estimation, and reasoning about 2D\nobject instances benefits from object detection. Yet, MLMs can not produce\nintermediate depth or boxes to reason over. Finetuning MLMs on relevant data\ndoesn't generalize well and outsourcing computation to specialized vision tools\nis too compute-intensive and memory-inefficient. To address this, we introduce\nPerception Tokens, intrinsic image representations designed to assist reasoning\ntasks where language is insufficient. Perception tokens act as auxiliary\nreasoning tokens, akin to chain-of-thought prompts in language models. For\nexample, in a depth-related task, an MLM augmented with perception tokens can\nreason by generating a depth map as tokens, enabling it to solve the problem\neffectively. 
We propose AURORA, a training method that augments MLMs with\nperception tokens for improved reasoning over visual inputs. AURORA leverages a\nVQVAE to transform intermediate image representations, such as depth maps into\na tokenized format and bounding box tokens, which is then used in a multi-task\ntraining framework. AURORA achieves notable improvements across counting\nbenchmarks: +10.8% on BLINK, +11.3% on CVBench, and +8.3% on SEED-Bench,\noutperforming finetuning approaches in generalization across datasets. It also\nimproves on relative depth: over +6% on BLINK. With perception tokens, AURORA\nexpands the scope of MLMs beyond language-based reasoning, paving the way for\nmore effective visual reasoning capabilities.\n","authors":["Mahtab Bigverdi","Zelun Luo","Cheng-Yu Hsieh","Ethan Shen","Dongping Chen","Linda G. Shapiro","Ranjay Krishna"],"pdf_url":"https://arxiv.org/pdf/2412.03548v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.05289v2","updated":"2024-12-04T18:32:57Z","published":"2023-11-09T11:32:49Z","title":"VoxNeRF: Bridging Voxel Representation and Neural Radiance Fields for\n Enhanced Indoor View Synthesis","summary":" The generation of high-fidelity view synthesis is essential for robotic\nnavigation and interaction but remains challenging, particularly in indoor\nenvironments and real-time scenarios. Existing techniques often require\nsignificant computational resources for both training and rendering, and they\nfrequently result in suboptimal 3D representations due to insufficient\ngeometric structuring. To address these limitations, we introduce VoxNeRF, a\nnovel approach that utilizes easy-to-obtain geometry priors to enhance both the\nquality and efficiency of neural indoor reconstruction and novel view\nsynthesis. 
We propose an efficient voxel-guided sampling technique that\nallocates computational resources selectively to the most relevant segments of\nrays based on a voxel-encoded geometry prior, significantly reducing training\nand rendering time. Additionally, we incorporate a robust depth loss to improve\nreconstruction and rendering quality in sparse view settings. Our approach is\nvalidated with extensive experiments on ScanNet and ScanNet++ where VoxNeRF\noutperforms existing state-of-the-art methods and establishes a new benchmark\nfor indoor immersive interpolation and extrapolation settings.\n","authors":["Sen Wang","Qing Cheng","Stefano Gasperini","Wei Zhang","Shun-Cheng Wu","Niclas Zeller","Daniel Cremers","Nassir Navab"],"pdf_url":"https://arxiv.org/pdf/2311.05289v2.pdf","comment":"8 pages, 4 figures"},{"id":"http://arxiv.org/abs/2312.11556v2","updated":"2024-12-04T18:31:44Z","published":"2023-12-17T08:07:32Z","title":"StarVector: Generating Scalable Vector Graphics Code from Images and\n Text","summary":" Scalable Vector Graphics (SVGs) are vital for modern image rendering due to\ntheir scalability and versatility. Previous SVG generation methods have focused\non curve-based vectorization, lacking semantic understanding, often producing\nartifacts, and struggling with SVG primitives beyond path curves. To address\nthese issues, we introduce StarVector, a multimodal large language model for\nSVG generation. It performs image vectorization by understanding image\nsemantics and using SVG primitives for compact, precise outputs. Unlike\ntraditional methods, StarVector works directly in the SVG code space,\nleveraging visual understanding to apply accurate SVG primitives. To train\nStarVector, we create SVG-Stack, a diverse dataset of 2M samples that enables\ngeneralization across vectorization tasks and precise use of primitives like\nellipses, polygons, and text. 
We address challenges in SVG evaluation, showing\nthat pixel-based metrics like MSE fail to capture the unique qualities of\nvector graphics. We introduce SVG-Bench, a benchmark across 10 datasets, and 3\ntasks: Image-to-SVG, Text-to-SVG generation, and diagram generation. Using this\nsetup, StarVector achieves state-of-the-art performance, producing more compact\nand semantically rich SVGs.\n","authors":["Juan A. Rodriguez","Abhay Puri","Shubham Agarwal","Issam H. Laradji","Pau Rodriguez","Sai Rajeswar","David Vazquez","Christopher Pal","Marco Pedersoli"],"pdf_url":"https://arxiv.org/pdf/2312.11556v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12712v3","updated":"2024-12-04T18:18:47Z","published":"2024-03-19T13:19:41Z","title":"Instance-Warp: Saliency Guided Image Warping for Unsupervised Domain\n Adaptation","summary":" Driving is challenging in conditions like night, rain, and snow. Lack of good\nlabeled datasets has hampered progress in scene understanding under such\nconditions. Unsupervised Domain Adaptation (UDA) using large labeled clear-day\ndatasets is a promising research direction in such cases. However, many UDA\nmethods are trained with dominant scene backgrounds (e.g., roads, sky,\nsidewalks) that appear dramatically different across domains. As a result, they\nstruggle to learn effective features of smaller and often sparse foreground\nobjects (e.g., people, vehicles, signs).\n In this work, we improve UDA training by applying in-place image warping to\nfocus on salient objects. We design instance-level saliency guidance to\nadaptively oversample object regions and undersample background areas, which\nreduces adverse effects from background context and enhances backbone feature\nlearning. Our approach improves adaptation across geographies, lighting, and\nweather conditions, and is agnostic to the task (segmentation, detection),\ndomain adaptation algorithm, saliency guidance, and underlying model\narchitecture. 
Result highlights include +6.1 mAP50 for BDD100K Clear\n$\\rightarrow$ DENSE Foggy, +3.7 mAP50 for BDD100K Day $\\rightarrow$ Night, +3.0\nmAP50 for BDD100K Clear $\\rightarrow$ Rainy, and +6.3 mIoU for Cityscapes\n$\\rightarrow$ ACDC. Besides, Our method adds minimal training memory and no\nadditional inference latency. Code is available at\nhttps://github.com/ShenZheng2000/Instance-Warp\n","authors":["Shen Zheng","Anurag Ghosh","Srinivasa G. Narasimhan"],"pdf_url":"https://arxiv.org/pdf/2403.12712v3.pdf","comment":"WACV 2025 Accepted Paper"},{"id":"http://arxiv.org/abs/2412.03526v1","updated":"2024-12-04T18:15:06Z","published":"2024-12-04T18:15:06Z","title":"Feed-Forward Bullet-Time Reconstruction of Dynamic Scenes from Monocular\n Videos","summary":" Recent advancements in static feed-forward scene reconstruction have\ndemonstrated significant progress in high-quality novel view synthesis.\nHowever, these models often struggle with generalizability across diverse\nenvironments and fail to effectively handle dynamic content. We present BTimer\n(short for BulletTimer), the first motion-aware feed-forward model for\nreal-time reconstruction and novel view synthesis of dynamic scenes. Our\napproach reconstructs the full scene in a 3D Gaussian Splatting representation\nat a given target ('bullet') timestamp by aggregating information from all the\ncontext frames. Such a formulation allows BTimer to gain scalability and\ngeneralization by leveraging both static and dynamic scene datasets. 
Given a\ncasual monocular dynamic video, BTimer reconstructs a bullet-time scene within\n150ms while reaching state-of-the-art performance on both static and dynamic\nscene datasets, even compared with optimization-based approaches.\n","authors":["Hanxue Liang","Jiawei Ren","Ashkan Mirzaei","Antonio Torralba","Ziwei Liu","Igor Gilitschenski","Sanja Fidler","Cengiz Oztireli","Huan Ling","Zan Gojcic","Jiahui Huang"],"pdf_url":"https://arxiv.org/pdf/2412.03526v1.pdf","comment":"Project website:\n https://research.nvidia.com/labs/toronto-ai/bullet-timer/"},{"id":"http://arxiv.org/abs/2412.03520v1","updated":"2024-12-04T18:02:49Z","published":"2024-12-04T18:02:49Z","title":"Seeing Beyond Views: Multi-View Driving Scene Video Generation with\n Holistic Attention","summary":" Generating multi-view videos for autonomous driving training has recently\ngained much attention, with the challenge of addressing both cross-view and\ncross-frame consistency. Existing methods typically apply decoupled attention\nmechanisms for spatial, temporal, and view dimensions. However, these\napproaches often struggle to maintain consistency across dimensions,\nparticularly when handling fast-moving objects that appear at different times\nand viewpoints. In this paper, we present CogDriving, a novel network designed\nfor synthesizing high-quality multi-view driving videos. CogDriving leverages a\nDiffusion Transformer architecture with holistic-4D attention modules, enabling\nsimultaneous associations across the spatial, temporal, and viewpoint\ndimensions. We also propose a lightweight controller tailored for CogDriving,\ni.e., Micro-Controller, which uses only 1.1% of the parameters of the standard\nControlNet, enabling precise control over Bird's-Eye-View layouts. To enhance\nthe generation of object instances crucial for autonomous driving, we propose a\nre-weighted learning objective, dynamically adjusting the learning weights for\nobject instances during training. 
CogDriving demonstrates strong performance on\nthe nuScenes validation set, achieving an FVD score of 37.8, highlighting its\nability to generate realistic driving videos. The project can be found at\nhttps://luhannan.github.io/CogDrivingPage/.\n","authors":["Hannan Lu","Xiaohe Wu","Shudong Wang","Xiameng Qin","Xinyu Zhang","Junyu Han","Wangmeng Zuo","Ji Tao"],"pdf_url":"https://arxiv.org/pdf/2412.03520v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03518v1","updated":"2024-12-04T17:59:04Z","published":"2024-12-04T17:59:04Z","title":"Dense Scene Reconstruction from Light-Field Images Affected by Rolling\n Shutter","summary":" This paper presents a dense depth estimation approach from light-field (LF)\nimages that is able to compensate for strong rolling shutter (RS) effects. Our\nmethod estimates RS compensated views and dense RS compensated disparity maps.\nWe present a two-stage method based on a 2D Gaussians Splatting that allows for\na ``render and compare\" strategy with a point cloud formulation. In the first\nstage, a subset of sub-aperture images is used to estimate an RS agnostic 3D\nshape that is related to the scene target shape ``up to a motion\". In the\nsecond stage, the deformation of the 3D shape is computed by estimating an\nadmissible camera motion. We demonstrate the effectiveness and advantages of\nthis approach through several experiments conducted for different scenes and\ntypes of motions. Due to lack of suitable datasets for evaluation, we also\npresent a new carefully designed synthetic dataset of RS LF images. 
The source\ncode, trained models and dataset will be made publicly available at:\nhttps://github.com/ICB-Vision-AI/DenseRSLF\n","authors":["Hermes McGriff","Renato Martins","Nicolas Andreff","Cedric Demonceaux"],"pdf_url":"https://arxiv.org/pdf/2412.03518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07360v3","updated":"2024-12-04T17:58:35Z","published":"2023-12-12T15:30:24Z","title":"Boosting Latent Diffusion with Flow Matching","summary":" Visual synthesis has recently seen significant leaps in performance, largely\ndue to breakthroughs in generative models. Diffusion models have been a key\nenabler, as they excel in image diversity. However, this comes at the cost of\nslow training and synthesis, which is only partially alleviated by latent\ndiffusion. To this end, flow matching is an appealing approach due to its\ncomplementary characteristics of faster training and inference but less diverse\nsynthesis. We demonstrate that introducing flow matching between a frozen\ndiffusion model and a convolutional decoder enables high-resolution image\nsynthesis at reduced computational cost and model size. A small diffusion model\ncan then effectively provide the necessary visual diversity, while flow\nmatching efficiently enhances resolution and detail by mapping the small to a\nhigh-dimensional latent space. These latents are then projected to\nhigh-resolution images by the subsequent convolutional decoder of the latent\ndiffusion approach. Combining the diversity of diffusion models, the efficiency\nof flow matching, and the effectiveness of convolutional decoders,\nstate-of-the-art high-resolution image synthesis is achieved at $1024^2$ pixels\nwith minimal computational cost. Further scaling up our method we can reach\nresolutions up to $2048^2$ pixels. 
Importantly, our approach is orthogonal to\nrecent approximation and speed-up strategies for the underlying model, making\nit easily integrable into the various diffusion model frameworks.\n","authors":["Johannes Schusterbauer","Ming Gui","Pingchuan Ma","Nick Stracke","Stefan A. Baumann","Vincent Tao Hu","Björn Ommer"],"pdf_url":"https://arxiv.org/pdf/2312.07360v3.pdf","comment":"ECCV 2024 (Oral), Project Page:\n https://compvis.github.io/fm-boosting/"},{"id":"http://arxiv.org/abs/2412.03517v1","updated":"2024-12-04T17:58:03Z","published":"2024-12-04T17:58:03Z","title":"NVComposer: Boosting Generative Novel View Synthesis with Multiple\n Sparse and Unposed Images","summary":" Recent advancements in generative models have significantly improved novel\nview synthesis (NVS) from multi-view data. However, existing methods depend on\nexternal multi-view alignment processes, such as explicit pose estimation or\npre-reconstruction, which limits their flexibility and accessibility,\nespecially when alignment is unstable due to insufficient overlap or occlusions\nbetween views. In this paper, we propose NVComposer, a novel approach that\neliminates the need for explicit external alignment. NVComposer enables the\ngenerative model to implicitly infer spatial and geometric relationships\nbetween multiple conditional views by introducing two key components: 1) an\nimage-pose dual-stream diffusion model that simultaneously generates target\nnovel views and condition camera poses, and 2) a geometry-aware feature\nalignment module that distills geometric priors from dense stereo models during\ntraining. Extensive experiments demonstrate that NVComposer achieves\nstate-of-the-art performance in generative multi-view NVS tasks, removing the\nreliance on external alignment and thus improving model accessibility. 
Our\napproach shows substantial improvements in synthesis quality as the number of\nunposed input views increases, highlighting its potential for more flexible and\naccessible generative NVS systems.\n","authors":["Lingen Li","Zhaoyang Zhang","Yaowei Li","Jiale Xu","Xiaoyu Li","Wenbo Hu","Weihao Cheng","Jinwei Gu","Tianfan Xue","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2412.03517v1.pdf","comment":"Project webpage: https://lg-li.github.io/project/nvcomposer"},{"id":"http://arxiv.org/abs/2406.16860v2","updated":"2024-12-04T17:57:32Z","published":"2024-06-24T17:59:42Z","title":"Cambrian-1: A Fully Open, Vision-Centric Exploration of Multimodal LLMs","summary":" We introduce Cambrian-1, a family of multimodal LLMs (MLLMs) designed with a\nvision-centric approach. While stronger language models can enhance multimodal\ncapabilities, the design choices for vision components are often insufficiently\nexplored and disconnected from visual representation learning research. This\ngap hinders accurate sensory grounding in real-world scenarios. Our study uses\nLLMs and visual instruction tuning as an interface to evaluate various visual\nrepresentations, offering new insights into different models and architectures\n-- self-supervised, strongly supervised, or combinations thereof -- based on\nexperiments with over 20 vision encoders. We critically examine existing MLLM\nbenchmarks, address the difficulties involved in consolidating and interpreting\nresults from various tasks, and introduce a new vision-centric benchmark,\nCV-Bench. To further improve visual grounding, we propose the Spatial Vision\nAggregator (SVA), a dynamic and spatially-aware connector that integrates\nhigh-resolution vision features with LLMs while reducing the number of tokens.\nAdditionally, we discuss the curation of high-quality visual instruction-tuning\ndata from publicly available sources, emphasizing the importance of data source\nbalancing and distribution ratio. 
Collectively, Cambrian-1 not only achieves\nstate-of-the-art performance but also serves as a comprehensive, open cookbook\nfor instruction-tuned MLLMs. We provide model weights, code, supporting tools,\ndatasets, and detailed instruction-tuning and evaluation recipes. We hope our\nrelease will inspire and accelerate advancements in multimodal systems and\nvisual representation learning.\n","authors":["Shengbang Tong","Ellis Brown","Penghao Wu","Sanghyun Woo","Manoj Middepogu","Sai Charitha Akula","Jihan Yang","Shusheng Yang","Adithya Iyer","Xichen Pan","Ziteng Wang","Rob Fergus","Yann LeCun","Saining Xie"],"pdf_url":"https://arxiv.org/pdf/2406.16860v2.pdf","comment":"NeurIPS 2024 (Oral). Website at https://cambrian-mllm.github.io"},{"id":"http://arxiv.org/abs/2412.03515v1","updated":"2024-12-04T17:57:25Z","published":"2024-12-04T17:57:25Z","title":"Distilling Diffusion Models to Efficient 3D LiDAR Scene Completion","summary":" Diffusion models have been applied to 3D LiDAR scene completion due to their\nstrong training stability and high completion quality. However, the slow\nsampling speed limits the practical application of diffusion-based scene\ncompletion models since autonomous vehicles require an efficient perception of\nsurrounding environments. This paper proposes a novel distillation method\ntailored for 3D LiDAR scene completion models, dubbed $\\textbf{ScoreLiDAR}$,\nwhich achieves efficient yet high-quality scene completion. ScoreLiDAR enables\nthe distilled model to sample in significantly fewer steps after distillation.\nTo improve completion quality, we also introduce a novel $\\textbf{Structural\nLoss}$, which encourages the distilled model to capture the geometric structure\nof the 3D LiDAR scene. The loss contains a scene-wise term constraining the\nholistic structure and a point-wise term constraining the key landmark points\nand their relative configuration. 
Extensive experiments demonstrate that\nScoreLiDAR significantly accelerates the completion time from 30.55 to 5.37\nseconds per frame ($>$5$\\times$) on SemanticKITTI and achieves superior\nperformance compared to state-of-the-art 3D LiDAR scene completion models. Our\ncode is publicly available at https://github.com/happyw1nd/ScoreLiDAR.\n","authors":["Shengyuan Zhang","An Zhao","Ling Yang","Zejian Li","Chenye Meng","Haoran Xu","Tianrun Chen","AnYang Wei","Perry Pengyun GU","Lingyun Sun"],"pdf_url":"https://arxiv.org/pdf/2412.03515v1.pdf","comment":"https://github.com/happyw1nd/ScoreLiDAR"},{"id":"http://arxiv.org/abs/2412.03513v1","updated":"2024-12-04T17:56:49Z","published":"2024-12-04T17:56:49Z","title":"KKLIP: Knowledge Distillation Exploiting K-means Clustering for\n Language-Image Pre-Training","summary":" Recently, CLIP has emerged as a valuable model for aligning image and text\ninformation in multi-modal scenarios. However, researchers have observed\nlimitations in the ability of CLIP's text and image encoders to extract\ndetailed knowledge from caption-image pairs. In response, this paper introduces\nKKLIP, a novel approach designed to enhance the quality of CLIP by\nincorporating a new knowledge distillation (KD) method derived from Llama 2.\nOur method comprises three objectives: Text Embedding Distillation, Concept\nLearning, and Contrastive Learning. Firstly, Text Embedding Distillation\ninvolves training the KKLIP text encoder to emulate the teacher model, Llama 2.\nSecondly, Concept Learning assigns a soft concept label to each caption-image\npair through offline k-means clustering of text information from Llama 2,\nallowing KKLIP to learn from these soft concept labels. Finally, Contrastive\nLearning harmonizes text and image embeddings. 
Our experimental results\ndemonstrate that KKLIP enhances the quality of both text and image encoders.\n","authors":["Kuei-Chun Kao"],"pdf_url":"https://arxiv.org/pdf/2412.03513v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03512v1","updated":"2024-12-04T17:55:33Z","published":"2024-12-04T17:55:33Z","title":"Distillation of Diffusion Features for Semantic Correspondence","summary":" Semantic correspondence, the task of determining relationships between\ndifferent parts of images, underpins various applications including 3D\nreconstruction, image-to-image translation, object tracking, and visual place\nrecognition. Recent studies have begun to explore representations learned in\nlarge generative image models for semantic correspondence, demonstrating\npromising results. Building on this progress, current state-of-the-art methods\nrely on combining multiple large models, resulting in high computational\ndemands and reduced efficiency. In this work, we address this challenge by\nproposing a more computationally efficient approach. We propose a novel\nknowledge distillation technique to overcome the problem of reduced efficiency.\nWe show how to use two large vision foundation models and distill the\ncapabilities of these complementary models into one smaller model that\nmaintains high accuracy at reduced computational cost. Furthermore, we\ndemonstrate that by incorporating 3D data, we are able to further improve\nperformance, without the need for human-annotated correspondences. Overall, our\nempirical results demonstrate that our distilled model with 3D data\naugmentation achieves performance superior to current state-of-the-art methods\nwhile significantly reducing computational load and enhancing practicality for\nreal-world applications, such as semantic video correspondence. 
Our code and\nweights are publicly available on our project page.\n","authors":["Frank Fundel","Johannes Schusterbauer","Vincent Tao Hu","Björn Ommer"],"pdf_url":"https://arxiv.org/pdf/2412.03512v1.pdf","comment":"WACV 2025, Page: https://compvis.github.io/distilldift"},{"id":"http://arxiv.org/abs/2412.03498v1","updated":"2024-12-04T17:39:55Z","published":"2024-12-04T17:39:55Z","title":"A Bidirectional Siamese Recurrent Neural Network for Accurate Gait\n Recognition Using Body Landmarks","summary":" Gait recognition is a significant biometric technique for person\nidentification, particularly in scenarios where other physiological biometrics\nare impractical or ineffective. In this paper, we address the challenges\nassociated with gait recognition and present a novel approach to improve its\naccuracy and reliability. The proposed method leverages advanced techniques,\nincluding sequential gait landmarks obtained through the Mediapipe pose\nestimation model, Procrustes analysis for alignment, and a Siamese\nbiGRU-dualStack Neural Network architecture for capturing temporal\ndependencies. Extensive experiments were conducted on large-scale cross-view\ndatasets to demonstrate the effectiveness of the approach, achieving high\nrecognition accuracy compared to other models. The model demonstrated\naccuracies of 95.7%, 94.44%, 87.71%, and 86.6% on CASIA-B, SZU RGB-D, OU-MVLP,\nand Gait3D datasets respectively. The results highlight the potential\napplications of the proposed method in various practical domains, indicating\nits significant contribution to the field of gait recognition.\n","authors":["Proma Hossain Progga","Md. Jobayer Rahman","Swapnil Biswas","Md. 
Shakil Ahmed","Arif Reza Anwary","Swakkhar Shatabda"],"pdf_url":"https://arxiv.org/pdf/2412.03498v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.04844v3","updated":"2024-12-04T17:34:13Z","published":"2023-05-08T16:42:55Z","title":"SR+Codec: a Benchmark of Super-Resolution for Video Compression Bitrate\n Reduction","summary":" In recent years, there has been significant interest in Super-Resolution\n(SR), which focuses on generating a high-resolution image from a low-resolution\ninput. Deep learning-based methods for super-resolution have been particularly\npopular and have shown impressive results on various benchmarks. However,\nresearch indicates that these methods may not perform as well on strongly\ncompressed videos. We developed a super-resolution benchmark to analyze SR's\ncapacity to upscale compressed videos. Our dataset employed video codecs based\non five widely-used compression standards: H.264, H.265, H.266, AV1, and AVS3.\nWe assessed 19 popular SR models using our benchmark and evaluated their\nability to restore details and their susceptibility to compression artifacts.\nTo get an accurate perceptual ranking of SR models, we conducted a\ncrowd-sourced side-by-side comparison of their outputs. We found that some SR\nmodels, combined with compression, allow us to reduce the video bitrate without\nsignificant loss of quality. We also compared a range of image and video\nquality metrics with subjective scores to evaluate their accuracy on\nsuper-resolved compressed videos. 
The benchmark is publicly available at\nhttps://videoprocessing.ai/benchmarks/super-resolution-for-video-compression.html\n","authors":["Evgeney Bogatyrev","Ivan Molodetskikh","Dmitriy Vatolin"],"pdf_url":"https://arxiv.org/pdf/2305.04844v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03490v1","updated":"2024-12-04T17:26:30Z","published":"2024-12-04T17:26:30Z","title":"Data Fusion of Semantic and Depth Information in the Context of Object\n Detection","summary":" Considerable study has already been conducted regarding autonomous driving in\nmodern era. An autonomous driving system must be extremely good at detecting\nobjects surrounding the car to ensure safety. In this paper, classification,\nand estimation of an object's (pedestrian) position (concerning an ego 3D\ncoordinate system) are studied and the distance between the ego vehicle and the\nobject in the context of autonomous driving is measured. To classify the\nobject, faster Region-based Convolution Neural Network (R-CNN) with inception\nv2 is utilized. First, a network is trained with customized dataset to estimate\nthe reference position of objects as well as the distance from the vehicle.\nFrom camera calibration to computing the distance, cutting-edge technologies of\ncomputer vision algorithms in a series of processes are applied to generate a\n3D reference point of the region of interest. 
The foremost step in this process\nis generating a disparity map using the concept of stereo vision.\n","authors":["Md Abu Yusuf","Md Rezaul Karim Khan","Partha Pratim Saha","Mohammed Mahbubur Rahaman"],"pdf_url":"https://arxiv.org/pdf/2412.03490v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11976v2","updated":"2024-12-04T17:13:22Z","published":"2024-11-18T19:06:01Z","title":"Coverage-Constrained Human-AI Cooperation with Multiple Experts","summary":" Human-AI cooperative classification (HAI-CC) approaches aim to develop hybrid\nintelligent systems that enhance decision-making in various high-stakes\nreal-world scenarios by leveraging both human expertise and AI capabilities.\nCurrent HAI-CC methods primarily focus on learning-to-defer (L2D), where\ndecisions are deferred to human experts, and learning-to-complement (L2C),\nwhere AI and human experts make predictions cooperatively. However, a notable\nresearch gap remains in effectively exploring both L2D and L2C under diverse\nexpert knowledge to improve decision-making, particularly when constrained by\nthe cooperation cost required to achieve a target probability for AI-only\nselection (i.e., coverage). In this paper, we address this research gap by\nproposing the Coverage-constrained Learning to Defer and Complement with\nSpecific Experts (CL2DC) method. CL2DC makes final decisions through either AI\nprediction alone or by deferring to or complementing a specific expert,\ndepending on the input data. Furthermore, we propose a coverage-constrained\noptimisation to control the cooperation cost, ensuring it approximates a target\nprobability for AI-only selection. This approach enables an effective\nassessment of system performance within a specified budget. Also, CL2DC is\ndesigned to address scenarios where training sets contain multiple noisy-label\nannotations without any clean-label references. 
Comprehensive evaluations on\nboth synthetic and real-world datasets demonstrate that CL2DC achieves superior\nperformance compared to state-of-the-art HAI-CC methods.\n","authors":["Zheng Zhang","Cuong Nguyen","Kevin Wells","Thanh-Toan Do","David Rosewarne","Gustavo Carneiro"],"pdf_url":"https://arxiv.org/pdf/2411.11976v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.07000v2","updated":"2024-12-04T17:11:27Z","published":"2022-03-14T11:07:33Z","title":"Cross-View-Prediction: Exploring Contrastive Feature for Hyperspectral\n Image Classification","summary":" This paper presents a self-supervised feature learning method for\nhyperspectral image classification. Our method tries to construct two different\nviews of the raw hyperspectral image through a cross-representation learning\nmethod. And then to learn semantically consistent representation over the\ncreated views by contrastive learning method. Specifically, four\ncross-channel-prediction based augmentation methods are naturally designed to\nutilize the high dimension characteristic of hyperspectral data for the view\nconstruction. And the better representative features are learned by maximizing\nmutual information and minimizing conditional entropy across different views\nfrom our contrastive network. This 'Cross-View-Predicton' style is\nstraightforward and gets the state-of-the-art performance of unsupervised\nclassification with a simple SVM classifier.\n","authors":["Anyu Zhang","Haotian Wu","Zeyu Cao"],"pdf_url":"https://arxiv.org/pdf/2203.07000v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03473v1","updated":"2024-12-04T16:59:49Z","published":"2024-12-04T16:59:49Z","title":"Urban4D: Semantic-Guided 4D Gaussian Splatting for Urban Scene\n Reconstruction","summary":" Reconstructing dynamic urban scenes presents significant challenges due to\ntheir intrinsic geometric structures and spatiotemporal dynamics. 
Existing\nmethods that attempt to model dynamic urban scenes without leveraging priors on\npotentially moving regions often produce suboptimal results. Meanwhile,\napproaches based on manual 3D annotations yield improved reconstruction quality\nbut are impractical due to labor-intensive labeling. In this paper, we revisit\nthe potential of 2D semantic maps for classifying dynamic and static Gaussians\nand integrating spatial and temporal dimensions for urban scene representation.\nWe introduce Urban4D, a novel framework that employs a semantic-guided\ndecomposition strategy inspired by advances in deep 2D semantic map generation.\nOur approach distinguishes potentially dynamic objects through reliable\nsemantic Gaussians. To explicitly model dynamic objects, we propose an\nintuitive and effective 4D Gaussian splatting (4DGS) representation that\naggregates temporal information through learnable time embeddings for each\nGaussian, predicting their deformations at desired timestamps using a\nmultilayer perceptron (MLP). For more accurate static reconstruction, we also\ndesign a k-nearest neighbor (KNN)-based consistency regularization to handle\nthe ground surface due to its low-texture characteristic. 
Extensive experiments\non real-world datasets demonstrate that Urban4D not only achieves comparable or\nbetter quality than previous state-of-the-art methods but also effectively\ncaptures dynamic objects while maintaining high visual fidelity for static\nelements.\n","authors":["Ziwen Li","Jiaxin Huang","Runnan Chen","Yunlong Che","Yandong Guo","Tongliang Liu","Fakhri Karray","Mingming Gong"],"pdf_url":"https://arxiv.org/pdf/2412.03473v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03472v1","updated":"2024-12-04T16:59:44Z","published":"2024-12-04T16:59:44Z","title":"Measure Anything: Real-time, Multi-stage Vision-based Dimensional\n Measurement using Segment Anything","summary":" We present Measure Anything, a comprehensive vision-based framework for\ndimensional measurement of objects with circular cross-sections, leveraging the\nSegment Anything Model (SAM). Our approach estimates key geometric features --\nincluding diameter, length, and volume -- for rod-like geometries with varying\ncurvature and general objects with constant skeleton slope. The framework\nintegrates segmentation, mask processing, skeleton construction, and 2D-3D\ntransformation, packaged in a user-friendly interface. We validate our\nframework by estimating the diameters of Canola stems -- collected from\nagricultural fields in North Dakota -- which are thin and non-uniform, posing\nchallenges for existing methods. Measuring its diameters is critical, as it is\na phenotypic traits that correlates with the health and yield of Canola crops.\nThis application also exemplifies the potential of Measure Anything, where\nintegrating intelligent models -- such as keypoint detection -- extends its\nscalability to fully automate the measurement process for high-throughput\napplications. 
Furthermore, we showcase its versatility in robotic grasping,\nleveraging extracted geometric features to identify optimal grasp points.\n","authors":["Yongkyu Lee","Shivam Kumar Panda","Wei Wang","Mohammad Khalid Jawed"],"pdf_url":"https://arxiv.org/pdf/2412.03472v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03467v1","updated":"2024-12-04T16:56:20Z","published":"2024-12-04T16:56:20Z","title":"Training-Free Mitigation of Language Reasoning Degradation After\n Multimodal Instruction Tuning","summary":" Multimodal models typically combine a powerful large language model (LLM)\nwith a vision encoder and are then trained on multimodal data via instruction\ntuning. While this process adapts LLMs to multimodal settings, it remains\nunclear whether this adaptation compromises their original language reasoning\ncapabilities. In this work, we explore the effects of multimodal instruction\ntuning on language reasoning performance. We focus on LLaVA, a leading\nmultimodal framework that integrates LLMs such as Vicuna or Mistral with the\nCLIP vision encoder. We compare the performance of the original LLMs with their\nmultimodal-adapted counterparts across eight language reasoning tasks. Our\nexperiments yield several key insights. First, the impact of multimodal\nlearning varies between Vicuna and Mistral: we observe a degradation in\nlanguage reasoning for Mistral but improvements for Vicuna across most tasks.\nSecond, while multimodal instruction learning consistently degrades performance\non mathematical reasoning tasks (e.g., GSM8K), it enhances performance on\ncommonsense reasoning tasks (e.g., CommonsenseQA). 
Finally, we demonstrate that\na training-free model merging technique can effectively mitigate the language\nreasoning degradation observed in multimodal-adapted Mistral and even improve\nperformance on visual tasks.\n","authors":["Neale Ratzlaff","Man Luo","Xin Su","Vasudev Lal","Phillip Howard"],"pdf_url":"https://arxiv.org/pdf/2412.03467v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03456v1","updated":"2024-12-04T16:45:02Z","published":"2024-12-04T16:45:02Z","title":"Gesture Classification in Artworks Using Contextual Image Features","summary":" Recognizing gestures in artworks can add a valuable dimension to art\nunderstanding and help to acknowledge the role of the sense of smell in\ncultural heritage. We propose a method to recognize smell gestures in\nhistorical artworks. We show that combining local features with global image\ncontext improves classification performance notably on different backbones.\n","authors":["Azhar Hussian","Mathias Zinnen","Thi My Hang Tran","Andreas Maier","Vincent Christlein"],"pdf_url":"https://arxiv.org/pdf/2412.03456v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.14108v2","updated":"2024-12-04T16:43:00Z","published":"2024-07-19T08:24:36Z","title":"GaussianBeV: 3D Gaussian Representation meets Perception Models for BeV\n Segmentation","summary":" The Bird's-eye View (BeV) representation is widely used for 3D perception\nfrom multi-view camera images. It allows to merge features from different\ncameras into a common space, providing a unified representation of the 3D\nscene. The key component is the view transformer, which transforms image views\ninto the BeV. However, actual view transformer methods based on geometry or\ncross-attention do not provide a sufficiently detailed representation of the\nscene, as they use a sub-sampling of the 3D space that is non-optimal for\nmodeling the fine structures of the environment. 
In this paper, we propose\nGaussianBeV, a novel method for transforming image features to BeV by finely\nrepresenting the scene using a set of 3D gaussians located and oriented in 3D\nspace. This representation is then splattered to produce the BeV feature map by\nadapting recent advances in 3D representation rendering based on gaussian\nsplatting. GaussianBeV is the first approach to use this 3D gaussian modeling\nand 3D scene rendering process online, i.e. without optimizing it on a specific\nscene and directly integrated into a single stage model for BeV scene\nunderstanding. Experiments show that the proposed representation is highly\neffective and place GaussianBeV as the new state-of-the-art on the BeV semantic\nsegmentation task on the nuScenes dataset.\n","authors":["Florian Chabot","Nicolas Granger","Guillaume Lapouge"],"pdf_url":"https://arxiv.org/pdf/2407.14108v2.pdf","comment":"Accepted to WACV 2025"},{"id":"http://arxiv.org/abs/2412.03453v1","updated":"2024-12-04T16:40:56Z","published":"2024-12-04T16:40:56Z","title":"Pre-trained Multiple Latent Variable Generative Models are good\n defenders against Adversarial Attacks","summary":" Attackers can deliberately perturb classifiers' input with subtle noise,\naltering final predictions. Among proposed countermeasures, adversarial\npurification employs generative networks to preprocess input images, filtering\nout adversarial noise. In this study, we propose specific generators, defined\nMultiple Latent Variable Generative Models (MLVGMs), for adversarial\npurification. These models possess multiple latent variables that naturally\ndisentangle coarse from fine features. Taking advantage of these properties, we\nautoencode images to maintain class-relevant information, while discarding and\nre-sampling any detail, including adversarial noise. The procedure is\ncompletely training-free, exploring the generalization abilities of pre-trained\nMLVGMs on the adversarial purification downstream task. 
Despite the lack of\nlarge models, trained on billions of samples, we show that smaller MLVGMs are\nalready competitive with traditional methods, and can be used as foundation\nmodels. Official code released at https://github.com/SerezD/gen_adversarial.\n","authors":["Dario Serez","Marco Cristani","Alessio Del Bue","Vittorio Murino","Pietro Morerio"],"pdf_url":"https://arxiv.org/pdf/2412.03453v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03451v1","updated":"2024-12-04T16:38:07Z","published":"2024-12-04T16:38:07Z","title":"PlanarSplatting: Accurate Planar Surface Reconstruction in 3 Minutes","summary":" This paper presents PlanarSplatting, an ultra-fast and accurate surface\nreconstruction approach for multiview indoor images. We take the 3D planes as\nthe main objective due to their compactness and structural expressiveness in\nindoor scenes, and develop an explicit optimization framework that learns to\nfit the expected surface of indoor scenes by splatting the 3D planes into 2.5D\ndepth and normal maps. As our PlanarSplatting operates directly on the 3D plane\nprimitives, it eliminates the dependencies on 2D/3D plane detection and plane\nmatching and tracking for planar surface reconstruction. Furthermore, the\nessential merits of plane-based representation plus CUDA-based implementation\nof planar splatting functions, PlanarSplatting reconstructs an indoor scene in\n3 minutes while having significantly better geometric accuracy. Thanks to our\nultra-fast reconstruction speed, the largest quantitative evaluation on the\nScanNet and ScanNet++ datasets over hundreds of scenes clearly demonstrated the\nadvantages of our method. We believe that our accurate and ultrafast planar\nsurface reconstruction method will be applied in the structured data curation\nfor surface reconstruction in the future. The code of our CUDA implementation\nwill be publicly available. 
Project page:\nhttps://icetttb.github.io/PlanarSplatting/\n","authors":["Bin Tan","Rui Yu","Yujun Shen","Nan Xue"],"pdf_url":"https://arxiv.org/pdf/2412.03451v1.pdf","comment":"Project page: https://icetttb.github.io/PlanarSplatting/"},{"id":"http://arxiv.org/abs/2412.02366v2","updated":"2024-12-04T16:38:01Z","published":"2024-12-03T10:45:34Z","title":"GenMix: Effective Data Augmentation with Generative Diffusion Model\n Image Editing","summary":" Data augmentation is widely used to enhance generalization in visual\nclassification tasks. However, traditional methods struggle when source and\ntarget domains differ, as in domain adaptation, due to their inability to\naddress domain gaps. This paper introduces GenMix, a generalizable\nprompt-guided generative data augmentation approach that enhances both\nin-domain and cross-domain image classification. Our technique leverages image\nediting to generate augmented images based on custom conditional prompts,\ndesigned specifically for each problem type. By blending portions of the input\nimage with its edited generative counterpart and incorporating fractal\npatterns, our approach mitigates unrealistic images and label ambiguity,\nimproving the performance and adversarial robustness of the resulting models.\nEfficacy of our method is established with extensive experiments on eight\npublic datasets for general and fine-grained classification, in both in-domain\nand cross-domain settings. Additionally, we demonstrate performance\nimprovements for self-supervised learning, learning with data scarcity, and\nadversarial robustness. 
As compared to the existing state-of-the-art methods,\nour technique achieves stronger performance across the board.\n","authors":["Khawar Islam","Muhammad Zaigham Zaheer","Arif Mahmood","Karthik Nandakumar","Naveed Akhtar"],"pdf_url":"https://arxiv.org/pdf/2412.02366v2.pdf","comment":"https://diffusemix.github.io/"},{"id":"http://arxiv.org/abs/2412.03439v1","updated":"2024-12-04T16:29:04Z","published":"2024-12-04T16:29:04Z","title":"CleanDIFT: Diffusion Features without Noise","summary":" Internal features from large-scale pre-trained diffusion models have recently\nbeen established as powerful semantic descriptors for a wide range of\ndownstream tasks. Works that use these features generally need to add noise to\nimages before passing them through the model to obtain the semantic features,\nas the models do not offer the most useful features when given images with\nlittle to no noise. We show that this noise has a critical impact on the\nusefulness of these features that cannot be remedied by ensembling with\ndifferent random noises. We address this issue by introducing a lightweight,\nunsupervised fine-tuning method that enables diffusion backbones to provide\nhigh-quality, noise-free semantic features. 
We show that these features readily\noutperform previous diffusion features by a wide margin in a wide variety of\nextraction setups and downstream tasks, offering better performance than even\nensemble-based methods at a fraction of the cost.\n","authors":["Nick Stracke","Stefan Andreas Baumann","Kolja Bauer","Frank Fundel","Björn Ommer"],"pdf_url":"https://arxiv.org/pdf/2412.03439v1.pdf","comment":"for the project page and code, view\n https://compvis.github.io/CleanDIFT/"},{"id":"http://arxiv.org/abs/2412.03430v1","updated":"2024-12-04T16:19:47Z","published":"2024-12-04T16:19:47Z","title":"SINGER: Vivid Audio-driven Singing Video Generation with Multi-scale\n Spectral Diffusion Model","summary":" Recent advancements in generative models have significantly enhanced talking\nface video generation, yet singing video generation remains underexplored. The\ndifferences between human talking and singing limit the performance of existing\ntalking face video generation models when applied to singing. The fundamental\ndifferences between talking and singing-specifically in audio characteristics\nand behavioral expressions-limit the effectiveness of existing models. We\nobserve that the differences between singing and talking audios manifest in\nterms of frequency and amplitude. To address this, we have designed a\nmulti-scale spectral module to help the model learn singing patterns in the\nspectral domain. Additionally, we develop a spectral-filtering module that aids\nthe model in learning the human behaviors associated with singing audio. These\ntwo modules are integrated into the diffusion model to enhance singing video\ngeneration performance, resulting in our proposed model, SINGER. Furthermore,\nthe lack of high-quality real-world singing face videos has hindered the\ndevelopment of the singing video generation community. To address this gap, we\nhave collected an in-the-wild audio-visual singing dataset to facilitate\nresearch in this area. 
Our experiments demonstrate that SINGER is capable of\ngenerating vivid singing videos and outperforms state-of-the-art methods in\nboth objective and subjective evaluations.\n","authors":["Yan Li","Ziya Zhou","Zhiqiang Wang","Wei Xue","Wenhan Luo","Yike Guo"],"pdf_url":"https://arxiv.org/pdf/2412.03430v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03428v1","updated":"2024-12-04T16:17:47Z","published":"2024-12-04T16:17:47Z","title":"2DGS-Room: Seed-Guided 2D Gaussian Splatting with Geometric Constrains\n for High-Fidelity Indoor Scene Reconstruction","summary":" The reconstruction of indoor scenes remains challenging due to the inherent\ncomplexity of spatial structures and the prevalence of textureless regions.\nRecent advancements in 3D Gaussian Splatting have improved novel view synthesis\nwith accelerated processing but have yet to deliver comparable performance in\nsurface reconstruction. In this paper, we introduce 2DGS-Room, a novel method\nleveraging 2D Gaussian Splatting for high-fidelity indoor scene reconstruction.\nSpecifically, we employ a seed-guided mechanism to control the distribution of\n2D Gaussians, with the density of seed points dynamically optimized through\nadaptive growth and pruning mechanisms. To further improve geometric accuracy,\nwe incorporate monocular depth and normal priors to provide constraints for\ndetails and textureless regions respectively. Additionally, multi-view\nconsistency constraints are employed to mitigate artifacts and further enhance\nreconstruction quality. 
Extensive experiments on ScanNet and ScanNet++ datasets\ndemonstrate that our method achieves state-of-the-art performance in indoor\nscene reconstruction.\n","authors":["Wanting Zhang","Haodong Xiang","Zhichao Liao","Xiansong Lai","Xinghui Li","Long Zeng"],"pdf_url":"https://arxiv.org/pdf/2412.03428v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03752v2","updated":"2024-12-04T15:53:19Z","published":"2024-11-06T08:27:49Z","title":"Deferred Poisoning: Making the Model More Vulnerable via Hessian\n Singularization","summary":" Recent studies have shown that deep learning models are very vulnerable to\npoisoning attacks. Many defense methods have been proposed to address this\nissue. However, traditional poisoning attacks are not as threatening as\ncommonly believed. This is because they often cause differences in how the\nmodel performs on the training set compared to the validation set. Such\ninconsistency can alert defenders that their data has been poisoned, allowing\nthem to take the necessary defensive actions. In this paper, we introduce a\nmore threatening type of poisoning attack called the Deferred Poisoning Attack.\nThis new attack allows the model to function normally during the training and\nvalidation phases but makes it very sensitive to evasion attacks or even\nnatural noise. We achieve this by ensuring the poisoned model's loss function\nhas a similar value as a normally trained model at each input sample but with a\nlarge local curvature. A similar model loss ensures that there is no obvious\ninconsistency between the training and validation accuracy, demonstrating high\nstealthiness. On the other hand, the large curvature implies that a small\nperturbation may cause a significant increase in model loss, leading to\nsubstantial performance degradation, which reflects a worse robustness. We\nfulfill this purpose by making the model have singular Hessian information at\nthe optimal point via our proposed Singularization Regularization term. 
We have\nconducted both theoretical and empirical analyses of the proposed method and\nvalidated its effectiveness through experiments on image classification tasks.\nFurthermore, we have confirmed the hazards of this form of poisoning attack\nunder more general scenarios using natural noise, offering a new perspective\nfor research in the field of security.\n","authors":["Yuhao He","Jinyu Tian","Xianwei Zheng","Li Dong","Yuanman Li","Jiantao Zhou"],"pdf_url":"https://arxiv.org/pdf/2411.03752v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03413v1","updated":"2024-12-04T15:49:49Z","published":"2024-12-04T15:49:49Z","title":"Deep Learning for Sea Surface Temperature Reconstruction under Cloud\n Occlusion","summary":" Sea Surface Temperature (SST) is crucial for understanding Earth's oceans and\nclimate, significantly influencing weather patterns, ocean currents, marine\necosystem health, and the global energy balance. Large-scale SST monitoring\nrelies on satellite infrared radiation detection, but cloud cover presents a\nmajor challenge, creating extensive observational gaps and hampering our\nability to fully capture large-scale ocean temperature patterns. Efforts to\naddress these gaps in existing L4 datasets have been made, but they often\nexhibit notable local and seasonal biases, compromising data reliability and\naccuracy. To tackle this challenge, we employed deep neural networks to\nreconstruct cloud-covered portions of satellite imagery while preserving the\nintegrity of observed values in cloud-free areas, using MODIS satellite derived\nobservations of SST. 
Our best-performing architecture showed significant skill\nimprovements over established methodologies, achieving substantial reductions\nin error metrics when benchmarked against widely used approaches and datasets.\nThese results underscore the potential of advanced AI techniques to enhance the\ncompleteness of satellite observations in Earth-science remote sensing,\nproviding more accurate and reliable datasets for environmental assessments,\ndata-driven model training, climate research, and seamless integration into\nmodel data assimilation workflows.\n","authors":["Andrea Asperti","Ali Aydogdu","Emanuela Clementi","Angelo Greco","Lorenzo Mentaschi","Fabio Merizzi","Pietro Miraglio","Paolo Oddo","Nadia Pinardi","Alessandro Testa"],"pdf_url":"https://arxiv.org/pdf/2412.03413v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03409v1","updated":"2024-12-04T15:48:59Z","published":"2024-12-04T15:48:59Z","title":"PrefixKV: Adaptive Prefix KV Cache is What Vision Instruction-Following\n Models Need for Efficient Generation","summary":" Recently, large vision-language models (LVLMs) have rapidly gained popularity\nfor their strong generation and reasoning capabilities given diverse multimodal\ninputs. However, these models incur significant computational and memory\noverhead during inference, which greatly hinders the efficient deployment in\npractical scenarios. The extensive key-value (KV) cache, necessitated by the\nlengthy input and output sequences, notably contributes to the high inference\ncost. Based on this, recent works have investigated ways to reduce the KV cache\nsize for higher efficiency. Although effective, they generally overlook the\ndistinct importance distributions of KV vectors across layers and maintain the\nsame cache size for each layer during the next token prediction. This results\nin the significant contextual information loss for certain layers, leading to\nnotable performance decline. To address this, we present PrefixKV. 
It reframes\nthe challenge of determining KV cache sizes for all layers into the task of\nsearching for the optimal global prefix configuration. With an adaptive\nlayer-wise KV retention recipe based on binary search, the maximum contextual\ninformation can thus be preserved in each layer, facilitating the generation.\nExtensive experiments demonstrate that our method achieves the state-of-the-art\nperformance compared with others. It exhibits superior inference efficiency and\ngeneration quality trade-offs, showing promising potential for practical\napplications. Code is available at \\url{https://github.com/THU-MIG/PrefixKV}.\n","authors":["Ao Wang","Hui Chen","Jianchao Tan","Kefeng Zhang","Xunliang Cai","Zijia Lin","Jungong Han","Guiguang Ding"],"pdf_url":"https://arxiv.org/pdf/2412.03409v1.pdf","comment":"12 pages, 5 figures;"},{"id":"http://arxiv.org/abs/2412.03407v1","updated":"2024-12-04T15:45:20Z","published":"2024-12-04T15:45:20Z","title":"Skel3D: Skeleton Guided Novel View Synthesis","summary":" In this paper, we present an approach for monocular open-set novel view\nsynthesis (NVS) that leverages object skeletons to guide the underlying\ndiffusion model. Building upon a baseline that utilizes a pre-trained 2D image\ngenerator, our method takes advantage of the Objaverse dataset, which includes\nanimated objects with bone structures. By introducing a skeleton guide layer\nfollowing the existing ray conditioning normalization (RCN) layer, our approach\nenhances pose accuracy and multi-view consistency. The skeleton guide layer\nprovides detailed structural information for the generative model, improving\nthe quality of synthesized views. Experimental results demonstrate that our\nskeleton-guided method significantly enhances consistency and accuracy across\ndiverse object categories within the Objaverse dataset. 
Our method outperforms\nexisting state-of-the-art NVS techniques both quantitatively and qualitatively,\nwithout relying on explicit 3D representations.\n","authors":["Aron Fóthi","Bence Fazekas","Natabara Máté Gyöngyössy","Kristian Fenech"],"pdf_url":"https://arxiv.org/pdf/2412.03407v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03401v1","updated":"2024-12-04T15:32:37Z","published":"2024-12-04T15:32:37Z","title":"Benchmarking Pretrained Attention-based Models for Real-Time Recognition\n in Robot-Assisted Esophagectomy","summary":" Esophageal cancer is among the most common types of cancer worldwide. It is\ntraditionally treated using open esophagectomy, but in recent years,\nrobot-assisted minimally invasive esophagectomy (RAMIE) has emerged as a\npromising alternative. However, robot-assisted surgery can be challenging for\nnovice surgeons, as they often suffer from a loss of spatial orientation.\nComputer-aided anatomy recognition holds promise for improving surgical\nnavigation, but research in this area remains limited. In this study, we\ndeveloped a comprehensive dataset for semantic segmentation in RAMIE, featuring\nthe largest collection of vital anatomical structures and surgical instruments\nto date. Handling this diverse set of classes presents challenges, including\nclass imbalance and the recognition of complex structures such as nerves. This\nstudy aims to understand the challenges and limitations of current\nstate-of-the-art algorithms on this novel dataset and problem. Therefore, we\nbenchmarked eight real-time deep learning models using two pretraining\ndatasets. We assessed both traditional and attention-based networks,\nhypothesizing that attention-based networks better capture global patterns and\naddress challenges such as occlusion caused by blood or other tissues. The\nbenchmark includes our RAMIE dataset and the publicly available CholecSeg8k\ndataset, enabling a thorough assessment of surgical segmentation tasks. 
Our\nfindings indicate that pretraining on ADE20k, a dataset for semantic\nsegmentation, is more effective than pretraining on ImageNet. Furthermore,\nattention-based models outperform traditional convolutional neural networks,\nwith SegNeXt and Mask2Former achieving higher Dice scores, and Mask2Former\nadditionally excelling in average symmetric surface distance.\n","authors":["Ronald L. P. D. de Jong","Yasmina al Khalil","Tim J. M. Jaspers","Romy C. van Jaarsveld","Gino M. Kuiper","Yiping Li","Richard van Hillegersberg","Jelle P. Ruurda","Marcel Breeuwer","Fons van der Sommen"],"pdf_url":"https://arxiv.org/pdf/2412.03401v1.pdf","comment":"Accepted for presentation at the SPIE Medical Imaging Conference,\n 2025"},{"id":"http://arxiv.org/abs/2412.03400v1","updated":"2024-12-04T15:31:30Z","published":"2024-12-04T15:31:30Z","title":"Implicit Priors Editing in Stable Diffusion via Targeted Token\n Adjustment","summary":" Implicit assumptions and priors are often necessary in text-to-image\ngeneration tasks, especially when textual prompts lack sufficient context.\nHowever, these assumptions can sometimes reflect outdated concepts,\ninaccuracies, or societal bias embedded in the training data. We present\nEmbedding-only Editing (Embedit), a method designed to efficiently adjust\nimplict assumptions and priors in the model without affecting its\ninterpretation of unrelated objects or overall performance. Given a \"source\"\nprompt (e.g., \"rose\") that elicits an implicit assumption (e.g., rose is red)\nand a \"destination\" prompt that specifies the desired attribute (e.g., \"blue\nrose\"), Embedit fine-tunes only the word token embedding (WTE) of the target\nobject (\"rose\") to optimize the last hidden state of text encoder in Stable\nDiffusion, a SOTA text-to-image model. This targeted adjustment prevents\nunintended effects on other objects in the model's knowledge base, as the WTEs\nfor unrelated objects and the model weights remain unchanged. 
Consequently,\nwhen a prompt does not contain the edited object, all representations, and the\nmodel outputs are identical to those of the original, unedited model. Our\nmethod is highly efficient, modifying only 768 parameters for Stable Diffusion\n1.4 and 2048 for XL in a single edit, matching the WTE dimension of each\nrespective model. This minimal scope, combined with rapid execution, makes\nEmbedit highly practical for real-world applications. Additionally, changes are\neasily reversible by restoring the original WTE layers. Our experimental\nresults demonstrate that Embedit consistently outperforms previous methods\nacross various models, tasks, and editing scenarios (both single and sequential\nmultiple edits), achieving at least a 6.01% improvement (from 87.17% to\n93.18%).\n","authors":["Feng He","Chao Zhang","Zhixue Zhao"],"pdf_url":"https://arxiv.org/pdf/2412.03400v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.10180v2","updated":"2024-12-04T15:28:18Z","published":"2024-07-14T12:42:11Z","title":"Defending Against Repetitive Backdoor Attacks on Semi-supervised\n Learning through Lens of Rate-Distortion-Perception Trade-off","summary":" Semi-supervised learning (SSL) has achieved remarkable performance with a\nsmall fraction of labeled data by leveraging vast amounts of unlabeled data\nfrom the Internet. However, this large pool of untrusted data is extremely\nvulnerable to data poisoning, leading to potential backdoor attacks. Current\nbackdoor defenses are not yet effective against such a vulnerability in SSL. In\nthis study, we propose a novel method, Unlabeled Data Purification (UPure), to\ndisrupt the association between trigger patterns and target classes by\nintroducing perturbations in the frequency domain. By leveraging the\nRate-Distortion-Perception (RDP) trade-off, we further identify the frequency\nband, where the perturbations are added, and justify this selection. 
Notably,\nUPure purifies poisoned unlabeled data without the need of extra clean labeled\ndata. Extensive experiments on four benchmark datasets and five SSL algorithms\ndemonstrate that UPure effectively reduces the attack success rate from 99.78%\nto 0% while maintaining model accuracy. Code is available here:\n\\url{https://github.com/chengyi-chris/UPure}.\n","authors":["Cheng-Yi Lee","Ching-Chia Kao","Cheng-Han Yeh","Chun-Shien Lu","Chia-Mu Yu","Chu-Song Chen"],"pdf_url":"https://arxiv.org/pdf/2407.10180v2.pdf","comment":"Accepted by WACV 2025"},{"id":"http://arxiv.org/abs/2405.19732v4","updated":"2024-12-04T15:20:35Z","published":"2024-05-30T06:24:14Z","title":"LLM as a Complementary Optimizer to Gradient Descent: A Case Study in\n Prompt Tuning","summary":" Mastering a skill generally relies on both hands-on experience from doers and\ninsightful, high-level guidance by mentors. Will this strategy also work well\nfor solving complex non-convex optimization problems? Here, a common\ngradient-based optimizer acts like a disciplined doer, making locally optimal\nupdates at each step. Large Language Models (LLMs) can also search for better\nsolutions by inferring from natural language instructions, akin to a high-level\nmentor. In this paper, we show that these two participators are complementary\nto each other and can effectively collaborate as a combined optimization\nframework. The collaborative optimization is achieved by alternating between\nthe gradient-based and LLM-based optimizers. We instruct LLMs to generate\npossibly improved solutions by taking parameter trajectories recorded during\nthe previous stage of gradient-based optimization into account. Inferred\nresults of LLMs are used as restarting points for the next stage of gradient\noptimization. We verify the effectiveness of this optimization framework on\nprompt tuning. 
By leveraging both the locally rigorous gradient-based optimizer\nand the high-level deductive LLM-based optimizer, the combined optimization\nmethod consistently yields improvements over competitive baselines on a variety\nof tasks. Our results demonstrate the synergistic effect of conventional\ngradient-based optimization and the inference ability of LLMs. The code is\nreleased at https://github.com/guozix/LLM-catalyst.\n","authors":["Zixian Guo","Ming Liu","Zhilong Ji","Jinfeng Bai","Yiwen Guo","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2405.19732v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.16310v3","updated":"2024-12-04T15:12:06Z","published":"2024-11-25T11:57:48Z","title":"Functionality understanding and segmentation in 3D scenes","summary":" Understanding functionalities in 3D scenes involves interpreting natural\nlanguage descriptions to locate functional interactive objects, such as handles\nand buttons, in a 3D environment. Functionality understanding is highly\nchallenging, as it requires both world knowledge to interpret language and\nspatial perception to identify fine-grained objects. For example, given a task\nlike 'turn on the ceiling light', an embodied AI agent must infer that it needs\nto locate the light switch, even though the switch is not explicitly mentioned\nin the task description. To date, no dedicated methods have been developed for\nthis problem. In this paper, we introduce Fun3DU, the first approach designed\nfor functionality understanding in 3D scenes. Fun3DU uses a language model to\nparse the task description through Chain-of-Thought reasoning in order to\nidentify the object of interest. The identified object is segmented across\nmultiple views of the captured scene by using a vision and language model. The\nsegmentation results from each view are lifted in 3D and aggregated into the\npoint cloud using geometric information. Fun3DU is training-free, relying\nentirely on pre-trained models. 
We evaluate Fun3DU on SceneFun3D, the most\nrecent and only dataset to benchmark this task, which comprises over 3000 task\ndescriptions on 230 scenes. Our method significantly outperforms\nstate-of-the-art open-vocabulary 3D segmentation approaches. Project page:\nhttps://jcorsetti.github.io/fun3du\n","authors":["Jaime Corsetti","Francesco Giuliari","Alice Fasoli","Davide Boscaini","Fabio Poiesi"],"pdf_url":"https://arxiv.org/pdf/2411.16310v3.pdf","comment":"Technical report. 20 pages, 12 figures, 7 tables. Fixed main diagram"},{"id":"http://arxiv.org/abs/2412.03379v1","updated":"2024-12-04T15:06:39Z","published":"2024-12-04T15:06:39Z","title":"Mapping using Transformers for Volumes -- Network for Super-Resolution\n with Long-Range Interactions","summary":" Until now, it has been difficult for volumetric super-resolution to utilize\nthe recent advances in transformer-based models seen in 2D super-resolution.\nThe memory required for self-attention in 3D volumes limits the receptive\nfield. Therefore, long-range interactions are not used in 3D to the extent done\nin 2D and the strength of transformers is not realized. We propose a\nmulti-scale transformer-based model based on hierarchical attention blocks\ncombined with carrier tokens at multiple scales to overcome this. Here\ninformation from larger regions at coarse resolution is sequentially carried on\nto finer-resolution regions to predict the super-resolved image. Using\ntransformer layers at each resolution, our coarse-to-fine modeling limits the\nnumber of tokens at each scale and enables attention over larger regions than\nwhat has previously been possible. We experimentally compare our method,\nMTVNet, against state-of-the-art volumetric super-resolution models on five 3D\ndatasets demonstrating the advantage of an increased receptive field. This\nadvantage is especially pronounced for images that are larger than what is seen\nin popularly used 3D datasets. 
Our code is available at\nhttps://github.com/AugustHoeg/MTVNet\n","authors":["August Leander Høeg","Sophia W. Bardenfleth","Hans Martin Kjer","Tim B. Dyrby","Vedrana Andersen Dahl","Anders Dahl"],"pdf_url":"https://arxiv.org/pdf/2412.03379v1.pdf","comment":"14 pages, 8 Figures with supplementary material"},{"id":"http://arxiv.org/abs/2412.03378v1","updated":"2024-12-04T15:05:43Z","published":"2024-12-04T15:05:43Z","title":"Volumetrically Consistent 3D Gaussian Rasterization","summary":" Recently, 3D Gaussian Splatting (3DGS) has enabled photorealistic view\nsynthesis at high inference speeds. However, its splatting-based rendering\nmodel makes several approximations to the rendering equation, reducing physical\naccuracy. We show that splatting and its approximations are unnecessary, even\nwithin a rasterizer; we instead volumetrically integrate 3D Gaussians directly\nto compute the transmittance across them analytically. We use this analytic\ntransmittance to derive more physically-accurate alpha values than 3DGS, which\ncan directly be used within their framework. The result is a method that more\nclosely follows the volume rendering equation (similar to ray-tracing) while\nenjoying the speed benefits of rasterization. Our method represents opaque\nsurfaces with higher accuracy and fewer points than 3DGS. This enables it to\noutperform 3DGS for view synthesis (measured in SSIM and LPIPS). Being\nvolumetrically consistent also enables our method to work out of the box for\ntomography. We match the state-of-the-art 3DGS-based tomography method with\nfewer points. Being volumetrically consistent also enables our method to work\nout of the box for tomography. 
We match the state-of-the-art 3DGS-based\ntomography method with fewer points.\n","authors":["Chinmay Talegaonkar","Yash Belhe","Ravi Ramamoorthi","Nicholas Antipa"],"pdf_url":"https://arxiv.org/pdf/2412.03378v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03371v1","updated":"2024-12-04T14:59:05Z","published":"2024-12-04T14:59:05Z","title":"SGSST: Scaling Gaussian Splatting StyleTransfer","summary":" Applying style transfer to a full 3D environment is a challenging task that\nhas seen many developments since the advent of neural rendering. 3D Gaussian\nsplatting (3DGS) has recently pushed further many limits of neural rendering in\nterms of training speed and reconstruction quality. This work introduces SGSST:\nScaling Gaussian Splatting Style Transfer, an optimization-based method to\napply style transfer to pretrained 3DGS scenes. We demonstrate that a new\nmultiscale loss based on global neural statistics, that we name SOS for\nSimultaneously Optimized Scales, enables style transfer to ultra-high\nresolution 3D scenes. Not only SGSST pioneers 3D scene style transfer at such\nhigh image resolutions, it also produces superior visual quality as assessed by\nthorough qualitative, quantitative and perceptual comparisons.\n","authors":["Bruno Galerne","Jianling Wang","Lara Raad","Jean-Michel Morel"],"pdf_url":"https://arxiv.org/pdf/2412.03371v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02168v2","updated":"2024-12-04T14:58:01Z","published":"2024-12-03T04:55:02Z","title":"Generative Photography: Scene-Consistent Camera Control for Realistic\n Text-to-Image Synthesis","summary":" Image generation today can produce somewhat realistic images from text\nprompts. However, if one asks the generator to synthesize a particular camera\nsetting such as creating different fields of view using a 24mm lens versus a\n70mm lens, the generator will not be able to interpret and generate\nscene-consistent images. 
This limitation not only hinders the adoption of\ngenerative tools in photography applications but also exemplifies a broader\nissue of bridging the gap between the data-driven models and the physical\nworld. In this paper, we introduce the concept of Generative Photography, a\nframework designed to control camera intrinsic settings during content\ngeneration. The core innovation of this work are the concepts of Dimensionality\nLifting and Contrastive Camera Learning, which achieve continuous and\nconsistent transitions for different camera settings. Experimental results show\nthat our method produces significantly more scene-consistent photorealistic\nimages than state-of-the-art models such as Stable Diffusion 3 and FLUX.\n","authors":["Yu Yuan","Xijun Wang","Yichen Sheng","Prateek Chennuri","Xingguang Zhang","Stanley Chan"],"pdf_url":"https://arxiv.org/pdf/2412.02168v2.pdf","comment":"Project page: https://generative-photography.github.io/project/"},{"id":"http://arxiv.org/abs/2412.03355v1","updated":"2024-12-04T14:39:54Z","published":"2024-12-04T14:39:54Z","title":"TASR: Timestep-Aware Diffusion Model for Image Super-Resolution","summary":" Diffusion models have recently achieved outstanding results in the field of\nimage super-resolution. These methods typically inject low-resolution (LR)\nimages via ControlNet.In this paper, we first explore the temporal dynamics of\ninformation infusion through ControlNet, revealing that the input from LR\nimages predominantly influences the initial stages of the denoising process.\nLeveraging this insight, we introduce a novel timestep-aware diffusion model\nthat adaptively integrates features from both ControlNet and the pre-trained\nStable Diffusion (SD). Our method enhances the transmission of LR information\nin the early stages of diffusion to guarantee image fidelity and stimulates the\ngeneration ability of the SD model itself more in the later stages to enhance\nthe detail of generated images. 
To train this method, we propose a\ntimestep-aware training strategy that adopts distinct losses at varying\ntimesteps and acts on disparate modules. Experiments on benchmark datasets\ndemonstrate the effectiveness of our method. Code:\nhttps://github.com/SleepyLin/TASR\n","authors":["Qinwei Lin","Xiaopeng Sun","Yu Gao","Yujie Zhong","Dengjie Li","Zheng Zhao","Haoqian Wang"],"pdf_url":"https://arxiv.org/pdf/2412.03355v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13040v2","updated":"2024-12-04T14:38:11Z","published":"2024-04-19T17:53:43Z","title":"Analysis of Classifier-Free Guidance Weight Schedulers","summary":" Classifier-Free Guidance (CFG) enhances the quality and condition adherence\nof text-to-image diffusion models. It operates by combining the conditional and\nunconditional predictions using a fixed weight. However, recent works vary the\nweights throughout the diffusion process, reporting superior results but\nwithout providing any rationale or analysis. By conducting comprehensive\nexperiments, this paper provides insights into CFG weight schedulers. Our\nfindings suggest that simple, monotonically increasing weight schedulers\nconsistently lead to improved performances, requiring merely a single line of\ncode. In addition, more complex parametrized schedulers can be optimized for\nfurther improvement, but do not generalize across different models and tasks.\n","authors":["Xi Wang","Nicolas Dufour","Nefeli Andreou","Marie-Paule Cani","Victoria Fernandez Abrevaya","David Picard","Vicky Kalogeiton"],"pdf_url":"https://arxiv.org/pdf/2404.13040v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03352v1","updated":"2024-12-04T14:35:06Z","published":"2024-12-04T14:35:06Z","title":"Intuitive Axial Augmentation Using Polar-Sine-Based Piecewise Distortion\n for Medical Slice-Wise Segmentation","summary":" Most data-driven models for medical image analysis rely on universal\naugmentations to improve performance. 
Experimental evidence has confirmed their\neffectiveness, but the unclear mechanism underlying them poses a barrier to the\nwidespread acceptance and trust in such methods within the medical community.\nWe revisit and acknowledge the unique characteristics of medical images apart\nfrom traditional digital images, and consequently, proposed a medical-specific\naugmentation algorithm that is more elastic and aligns well with radiology scan\nprocedure. The method performs piecewise affine with sinusoidal distorted ray\naccording to radius on polar coordinates, thus simulating uncertain postures of\nhuman lying flat on the scanning table. Our method could generate human\nvisceral distribution without affecting the fundamental relative position on\naxial plane. Two non-adaptive algorithms, namely Meta-based Scan Table Removal\nand Similarity-Guided Parameter Search, are introduced to bolster robustness of\nour augmentation method. Experiments show our method improves accuracy across\nmultiple famous segmentation frameworks without requiring more data samples.\nOur preview code is available in: https://github.com/MGAMZ/PSBPD.\n","authors":["Yiqin Zhang","Qingkui Chen","Chen Huang","Zhengjie Zhang","Meiling Chen","Zhibing Fu"],"pdf_url":"https://arxiv.org/pdf/2412.03352v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03349v1","updated":"2024-12-04T14:30:19Z","published":"2024-12-04T14:30:19Z","title":"Fairer Analysis and Demographically Balanced Face Generation for Fairer\n Face Verification","summary":" Face recognition and verification are two computer vision tasks whose\nperformances have advanced with the introduction of deep representations.\nHowever, ethical, legal, and technical challenges due to the sensitive nature\nof face data and biases in real-world training datasets hinder their\ndevelopment. Generative AI addresses privacy by creating fictitious identities,\nbut fairness problems remain. 
Using the existing DCFace SOTA framework, we\nintroduce a new controlled generation pipeline that improves fairness. Through\nclassical fairness metrics and a proposed in-depth statistical analysis based\non logit models and ANOVA, we show that our generation pipeline improves\nfairness more than other bias mitigation approaches while slightly improving\nraw performance.\n","authors":["Alexandre Fournier-Montgieux","Michael Soumm","Adrian Popescu","Bertrand Luvison","Hervé Le Borgne"],"pdf_url":"https://arxiv.org/pdf/2412.03349v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03347v1","updated":"2024-12-04T14:28:43Z","published":"2024-12-04T14:28:43Z","title":"DIVE: Taming DINO for Subject-Driven Video Editing","summary":" Building on the success of diffusion models in image generation and editing,\nvideo editing has recently gained substantial attention. However, maintaining\ntemporal consistency and motion alignment still remains challenging. To address\nthese issues, this paper proposes DINO-guided Video Editing (DIVE), a framework\ndesigned to facilitate subject-driven editing in source videos conditioned on\neither target text prompts or reference images with specific identities. The\ncore of DIVE lies in leveraging the powerful semantic features extracted from a\npretrained DINOv2 model as implicit correspondences to guide the editing\nprocess. Specifically, to ensure temporal motion consistency, DIVE employs DINO\nfeatures to align with the motion trajectory of the source video. Extensive\nexperiments on diverse real-world videos demonstrate that our framework can\nachieve high-quality editing results with robust motion consistency,\nhighlighting the potential of DINO to contribute to video editing. For precise\nsubject editing, DIVE incorporates the DINO features of reference images into a\npretrained text-to-image model to learn Low-Rank Adaptations (LoRAs),\neffectively registering the target subject's identity. 
Project page:\nhttps://dino-video-editing.github.io\n","authors":["Yi Huang","Wei Xiong","He Zhang","Chaoqi Chen","Jianzhuang Liu","Mingfu Yan","Shifeng Chen"],"pdf_url":"https://arxiv.org/pdf/2412.03347v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03342v1","updated":"2024-12-04T14:20:27Z","published":"2024-12-04T14:20:27Z","title":"UniVAD: A Training-free Unified Model for Few-shot Visual Anomaly\n Detection","summary":" Visual Anomaly Detection (VAD) aims to identify abnormal samples in images\nthat deviate from normal patterns, covering multiple domains, including\nindustrial, logical, and medical fields. Due to the domain gaps between these\nfields, existing VAD methods are typically tailored to each domain, with\nspecialized detection techniques and model architectures that are difficult to\ngeneralize across different domains. Moreover, even within the same domain,\ncurrent VAD approaches often follow a \"one-category-one-model\" paradigm,\nrequiring large amounts of normal samples to train class-specific models,\nresulting in poor generalizability and hindering unified evaluation across\ndomains. To address this issue, we propose a generalized few-shot VAD method,\nUniVAD, capable of detecting anomalies across various domains, such as\nindustrial, logical, and medical anomalies, with a training-free unified model.\nUniVAD only needs few normal samples as references during testing to detect\nanomalies in previously unseen objects, without training on the specific\ndomain. Specifically, UniVAD employs a Contextual Component Clustering ($C^3$)\nmodule based on clustering and vision foundation models to segment components\nwithin the image accurately, and leverages Component-Aware Patch Matching\n(CAPM) and Graph-Enhanced Component Modeling (GECM) modules to detect anomalies\nat different semantic levels, which are aggregated to produce the final\ndetection result. 
We conduct experiments on nine datasets spanning industrial,\nlogical, and medical fields, and the results demonstrate that UniVAD achieves\nstate-of-the-art performance in few-shot anomaly detection tasks across\nmultiple domains, outperforming domain-specific anomaly detection models. The\ncode will be made publicly available.\n","authors":["Zhaopeng Gu","Bingke Zhu","Guibo Zhu","Yingying Chen","Ming Tang","Jinqiao Wang"],"pdf_url":"https://arxiv.org/pdf/2412.03342v1.pdf","comment":"project page: https://uni-vad.github.io/"},{"id":"http://arxiv.org/abs/2412.03324v1","updated":"2024-12-04T13:56:44Z","published":"2024-12-04T13:56:44Z","title":"A Stitch in Time Saves Nine: Small VLM is a Precise Guidance for\n accelerating Large VLMs","summary":" Vision-language models (VLMs) have shown remarkable success across various\nmulti-modal tasks, yet large VLMs encounter significant efficiency challenges\ndue to processing numerous visual tokens. A promising approach to accelerating\nlarge VLM inference is using partial information, such as attention maps from\nspecific layers, to assess token importance and prune less essential tokens.\nHowever, our study reveals three key insights: (i) Partial attention\ninformation is insufficient for accurately identifying critical visual tokens,\nresulting in suboptimal performance, especially at low token retention ratios;\n(ii) Global attention information, such as the attention map aggregated across\nall layers, more effectively preserves essential tokens and maintains\ncomparable performance under aggressive pruning. However, the attention maps\nfrom all layers requires a full inference pass, which increases computational\nload and is therefore impractical in existing methods; and (iii) The global\nattention map aggregated from a small VLM closely resembles that of a large\nVLM, suggesting an efficient alternative. 
Based on these findings, we introduce\na \\textbf{training-free} method, \\underline{\\textbf{S}}mall VLM\n\\underline{\\textbf{G}}uidance for accelerating \\underline{\\textbf{L}}arge VLMs\n(\\textbf{SGL}). Specifically, we employ the attention map aggregated from a\nsmall VLM to guide visual token pruning in a large VLM. Additionally, an early\nexiting mechanism is developed to fully use the small VLM's predictions,\ndynamically invoking the larger VLM only when necessary, yielding a superior\ntrade-off between accuracy and computation. Extensive evaluations across 11\nbenchmarks demonstrate the effectiveness and generalizability of SGL, achieving\nup to 91\\% pruning ratio for visual tokens while retaining competitive\nperformance.\n","authors":["Wangbo Zhao","Yizeng Han","Jiasheng Tang","Zhikai Li","Yibing Song","Kai Wang","Zhangyang Wang","Yang You"],"pdf_url":"https://arxiv.org/pdf/2412.03324v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03318v1","updated":"2024-12-04T13:52:05Z","published":"2024-12-04T13:52:05Z","title":"Domain-Agnostic Stroke Lesion Segmentation Using Physics-Constrained\n Synthetic Data","summary":" Segmenting stroke lesions in Magnetic Resonance Imaging (MRI) is challenging\ndue to diverse clinical imaging domains, with existing models struggling to\ngeneralise across different MRI acquisition parameters and sequences. In this\nwork, we propose two novel physics-constrained approaches using synthetic\nquantitative MRI (qMRI) images to enhance the robustness and generalisability\nof segmentation models. We trained a qMRI estimation model to predict qMRI maps\nfrom MPRAGE images, which were used to simulate diverse MRI sequences for\nsegmentation training. A second approach built upon prior work in synthetic\ndata for stroke lesion segmentation, generating qMRI maps from a dataset of\ntissue labels. 
The proposed approaches improved over the baseline nnUNet on a\nvariety of out-of-distribution datasets, with the second approach outperforming\nthe prior synthetic data method.\n","authors":["Liam Chalcroft","Jenny Crinion","Cathy J. Price","John Ashburner"],"pdf_url":"https://arxiv.org/pdf/2412.03318v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03315v1","updated":"2024-12-04T13:47:51Z","published":"2024-12-04T13:47:51Z","title":"Geometry-guided Cross-view Diffusion for One-to-many Cross-view Image\n Synthesis","summary":" This paper presents a novel approach for cross-view synthesis aimed at\ngenerating plausible ground-level images from corresponding satellite imagery\nor vice versa. We refer to these tasks as satellite-to-ground (Sat2Grd) and\nground-to-satellite (Grd2Sat) synthesis, respectively. Unlike previous works\nthat typically focus on one-to-one generation, producing a single output image\nfrom a single input image, our approach acknowledges the inherent one-to-many\nnature of the problem. This recognition stems from the challenges posed by\ndifferences in illumination, weather conditions, and occlusions between the two\nviews. To effectively model this uncertainty, we leverage recent advancements\nin diffusion models. Specifically, we exploit random Gaussian noise to\nrepresent the diverse possibilities learnt from the target view data. We\nintroduce a Geometry-guided Cross-view Condition (GCC) strategy to establish\nexplicit geometric correspondences between satellite and street-view features.\nThis enables us to resolve the geometry ambiguity introduced by camera pose\nbetween image pairs, boosting the performance of cross-view image synthesis.\nThrough extensive quantitative and qualitative analyses on three benchmark\ncross-view datasets, we demonstrate the superiority of our proposed\ngeometry-guided cross-view condition over baseline methods, including recent\nstate-of-the-art approaches in cross-view image synthesis. 
Our method generates\nimages of higher quality, fidelity, and diversity than other state-of-the-art\napproaches.\n","authors":["Tao Jun Lin","Wenqing Wang","Yujiao Shi","Akhil Perincherry","Ankit Vora","Hongdong Li"],"pdf_url":"https://arxiv.org/pdf/2412.03315v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03314v1","updated":"2024-12-04T13:47:37Z","published":"2024-12-04T13:47:37Z","title":"Equivariant Representation Learning for Augmentation-based\n Self-Supervised Learning via Image Reconstruction","summary":" Augmentation-based self-supervised learning methods have shown remarkable\nsuccess in self-supervised visual representation learning, excelling in\nlearning invariant features but often neglecting equivariant ones. This\nlimitation reduces the generalizability of foundation models, particularly for\ndownstream tasks requiring equivariance. We propose integrating an image\nreconstruction task as an auxiliary component in augmentation-based\nself-supervised learning algorithms to facilitate equivariant feature learning\nwithout additional parameters. Our method implements a cross-attention\nmechanism to blend features learned from two augmented views, subsequently\nreconstructing one of them. This approach is adaptable to various datasets and\naugmented-pair based learning methods. We evaluate its effectiveness on\nlearning equivariant features through multiple linear regression tasks and\ndownstream applications on both artificial (3DIEBench) and natural (ImageNet)\ndatasets. Results consistently demonstrate significant improvements over\nstandard augmentation-based self-supervised learning methods and\nstate-of-the-art approaches, particularly excelling in scenarios involving\ncombined augmentations. 
Our method enhances the learning of both invariant and\nequivariant features, leading to more robust and generalizable visual\nrepresentations for computer vision tasks.\n","authors":["Qin Wang","Kai Krajsek","Hanno Scharr"],"pdf_url":"https://arxiv.org/pdf/2412.03314v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.04203v3","updated":"2024-12-04T13:43:10Z","published":"2023-04-09T10:08:38Z","title":"OpenDriver: An Open-Road Driver State Detection Dataset","summary":" Among numerous studies for driver state detection, wearable physiological\nmeasurements offer a practical method for real-time monitoring. However, there\nare few driver physiological datasets in open-road scenarios, and the existing\ndatasets suffer from issues such as poor signal quality, small sample sizes,\nand short data collection periods. Therefore, in this paper, a large-scale\nmultimodal driving dataset, OpenDriver, for driver state detection is\ndeveloped. The OpenDriver encompasses a total of 3,278 driving trips, with a\nsignal collection duration spanning approximately 4,600 hours. Two modalities\nof driving signals are enrolled in OpenDriver: electrocardiogram (ECG) signals\nand six-axis motion data of the steering wheel from a motion measurement unit\n(IMU), which were recorded from 81 drivers and their vehicles. Furthermore,\nthree challenging tasks are involved in our work, namely ECG signal quality\nassessment, individual biometric identification based on ECG signals, and\nphysiological signal analysis in complex driving environments. To facilitate\nresearch in these tasks, corresponding benchmarks have also been introduced.\nFirst, a noisy augmentation strategy is applied to generate a larger-scale ECG\nsignal dataset with realistic noise simulation for quality assessment. Second,\nan end-to-end contrastive learning framework is employed for individual\nbiometric identification. 
Finally, a comprehensive analysis of drivers' HRV\nfeatures under different driving conditions is conducted. Each benchmark\nprovides evaluation metrics and reference results. The OpenDriver dataset will\nbe publicly available at https://github.com/bdne/OpenDriver.\n","authors":["Delong Liu","Shichao Li","Tianyi Shi","Zhu Meng","Guanyu Chen","Yadong Huang","Jin Dong","Zhicheng Zhao"],"pdf_url":"https://arxiv.org/pdf/2304.04203v3.pdf","comment":"Considering that there are flaws in the statistical data of the\n dataset, all the authors agreed to withdraw the manuscript"},{"id":"http://arxiv.org/abs/2407.02854v2","updated":"2024-12-04T13:41:11Z","published":"2024-07-03T07:12:36Z","title":"A Spatio-Temporal Representation Learning as an Alternative to\n Traditional Glosses in Sign Language Translation and Production","summary":" This work addresses the challenges associated with the use of glosses in both\nSign Language Translation (SLT) and Sign Language Production (SLP). While\nglosses have long been used as a bridge between sign language and spoken\nlanguage, they come with two major limitations that impede the advancement of\nsign language systems. First, annotating the glosses is a labor-intensive and\ntime-consuming process, which limits the scalability of datasets. Second, the\nglosses oversimplify sign language by stripping away its spatio-temporal\ndynamics, reducing complex signs to basic labels and missing the subtle\nmovements essential for precise interpretation. 
To address these limitations,\nwe introduce Universal Gloss-level Representation (UniGloR), a framework\ndesigned to capture the spatio-temporal features inherent in sign language,\nproviding a more dynamic and detailed alternative to the use of the glosses.\nThe core idea of UniGloR is simple yet effective: We derive dense\nspatio-temporal representations from sign keypoint sequences using\nself-supervised learning and seamlessly integrate them into SLT and SLP tasks.\nOur experiments in a keypoint-based setting demonstrate that UniGloR either\noutperforms or matches the performance of previous SLT and SLP methods on two\nwidely-used datasets: PHOENIX14T and How2Sign.\n","authors":["Eui Jun Hwang","Sukmin Cho","Huije Lee","Youngwoo Yoon","Jong C. Park"],"pdf_url":"https://arxiv.org/pdf/2407.02854v2.pdf","comment":"Accepted at WACV 2025"},{"id":"http://arxiv.org/abs/2411.17686v2","updated":"2024-12-04T13:39:01Z","published":"2024-11-26T18:53:51Z","title":"Rethinking Token Reduction in MLLMs: Towards a Unified Paradigm for\n Training-Free Acceleration","summary":" To accelerate the inference of heavy Multimodal Large Language Models\n(MLLMs), this study rethinks the current landscape of training-free token\nreduction research. We regret to find that the critical components of existing\nmethods are tightly intertwined, with their interconnections and effects\nremaining unclear for comparison, transfer, and expansion. Therefore, we\npropose a unified ''filter-correlate-compress'' paradigm that decomposes the\ntoken reduction into three distinct stages within a pipeline, maintaining\nconsistent design objectives and elements while allowing for unique\nimplementations. We additionally demystify the popular works and subsume them\ninto our paradigm to showcase its universality. Finally, we offer a suite of\nmethods grounded in the paradigm, striking a balance between speed and accuracy\nthroughout different phases of the inference. 
Experimental results across 10\nbenchmarks indicate that our methods can achieve up to an 82.4% reduction in\nFLOPs with a minimal impact on performance, simultaneously surpassing\nstate-of-the-art training-free methods. Our project page is at\nhttps://ficoco-accelerate.github.io/.\n","authors":["Yuhang Han","Xuyang Liu","Pengxiang Ding","Donglin Wang","Honggang Chen","Qingsen Yan","Siteng Huang"],"pdf_url":"https://arxiv.org/pdf/2411.17686v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03297v1","updated":"2024-12-04T13:16:17Z","published":"2024-12-04T13:16:17Z","title":"Composed Image Retrieval for Training-Free Domain Conversion","summary":" This work addresses composed image retrieval in the context of domain\nconversion, where the content of a query image is retrieved in the domain\nspecified by the query text. We show that a strong vision-language model\nprovides sufficient descriptive power without additional training. The query\nimage is mapped to the text input space using textual inversion. Unlike common\npractice that invert in the continuous space of text tokens, we use the\ndiscrete word space via a nearest-neighbor search in a text vocabulary. With\nthis inversion, the image is softly mapped across the vocabulary and is made\nmore robust using retrieval-based augmentation. Database images are retrieved\nby a weighted ensemble of text queries combining mapped words with the domain\ntext. Our method outperforms prior art by a large margin on standard and newly\nintroduced benchmarks. 
Code: https://github.com/NikosEfth/freedom\n","authors":["Nikos Efthymiadis","Bill Psomas","Zakaria Laskar","Konstantinos Karantzalos","Yannis Avrithis","Ondřej Chum","Giorgos Tolias"],"pdf_url":"https://arxiv.org/pdf/2412.03297v1.pdf","comment":"WACV 2025"},{"id":"http://arxiv.org/abs/2412.03293v1","updated":"2024-12-04T13:11:38Z","published":"2024-12-04T13:11:38Z","title":"Diffusion-VLA: Scaling Robot Foundation Models via Unified Diffusion and\n Autoregression","summary":" In this paper, we present DiffusionVLA, a novel framework that seamlessly\ncombines the autoregression model with the diffusion model for learning\nvisuomotor policy. Central to our approach is a next-token prediction\nobjective, enabling the model to reason effectively over the user's query in\nthe context of current observations. Subsequently, a diffusion model is\nattached to generate robust action outputs. To enhance policy learning through\nself-reasoning, we introduce a novel reasoning injection module that integrates\nreasoning phrases directly into the policy learning process. The whole\nframework is simple and flexible, making it easy to deploy and upgrade. We\nconduct extensive experiments using multiple real robots to validate the\neffectiveness of DiffusionVLA. Our tests include a challenging factory sorting\ntask, where DiffusionVLA successfully categorizes objects, including those not\nseen during training. We observe that the reasoning module makes the model\ninterpretable. It allows observers to understand the model thought process and\nidentify potential causes of policy failures. Additionally, we test\nDiffusionVLA on a zero-shot bin-picking task, achieving 63.7\\% accuracy on 102\npreviously unseen objects. Our method demonstrates robustness to visual\nchanges, such as distractors and new backgrounds, and easily adapts to new\nembodiments. Furthermore, DiffusionVLA can follow novel instructions and retain\nconversational ability. 
Notably, DiffusionVLA is data-efficient and fast at\ninference; our smallest DiffusionVLA-2B runs 82Hz on a single A6000 GPU and can\ntrain from scratch on less than 50 demonstrations for a complex task. Finally,\nwe scale the model from 2B to 72B parameters, showcasing improved\ngeneralization capabilities with increased model size.\n","authors":["Junjie Wen","Minjie Zhu","Yichen Zhu","Zhibin Tang","Jinming Li","Zhongyi Zhou","Chengmeng Li","Xiaoyu Liu","Yaxin Peng","Chaomin Shen","Feifei Feng"],"pdf_url":"https://arxiv.org/pdf/2412.03293v1.pdf","comment":"The project page is available at: http://diffusion-vla.github.io"},{"id":"http://arxiv.org/abs/2412.03283v1","updated":"2024-12-04T12:57:17Z","published":"2024-12-04T12:57:17Z","title":"Black-Box Forgery Attacks on Semantic Watermarks for Diffusion Models","summary":" Integrating watermarking into the generation process of latent diffusion\nmodels (LDMs) simplifies detection and attribution of generated content.\nSemantic watermarks, such as Tree-Rings and Gaussian Shading, represent a novel\nclass of watermarking techniques that are easy to implement and highly robust\nagainst various perturbations. However, our work demonstrates a fundamental\nsecurity vulnerability of semantic watermarks. We show that attackers can\nleverage unrelated models, even with different latent spaces and architectures\n(UNet vs DiT), to perform powerful and realistic forgery attacks. Specifically,\nwe design two watermark forgery attacks. The first imprints a targeted\nwatermark into real images by manipulating the latent representation of an\narbitrary image in an unrelated LDM to get closer to the latent representation\nof a watermarked image. We also show that this technique can be used for\nwatermark removal. The second attack generates new images with the target\nwatermark by inverting a watermarked image and re-generating it with an\narbitrary prompt. Both attacks just need a single reference image with the\ntarget watermark. 
Overall, our findings question the applicability of semantic\nwatermarks by revealing that attackers can easily forge or remove these\nwatermarks under realistic conditions.\n","authors":["Andreas Müller","Denis Lukovnikov","Jonas Thietke","Asja Fischer","Erwin Quiring"],"pdf_url":"https://arxiv.org/pdf/2412.03283v1.pdf","comment":"23 pages, 21 figures, 6 tables"},{"id":"http://arxiv.org/abs/2410.15957v3","updated":"2024-12-04T12:54:44Z","published":"2024-10-21T12:36:27Z","title":"CamI2V: Camera-Controlled Image-to-Video Diffusion Model","summary":" Recent advancements have integrated camera pose as a user-friendly and\nphysics-informed condition in video diffusion models, enabling precise camera\ncontrol. In this paper, we identify one of the key challenges as effectively\nmodeling noisy cross-frame interactions to enhance geometry consistency and\ncamera controllability. We innovatively associate the quality of a condition\nwith its ability to reduce uncertainty and interpret noisy cross-frame features\nas a form of noisy condition. Recognizing that noisy conditions provide\ndeterministic information while also introducing randomness and potential\nmisguidance due to added noise, we propose applying epipolar attention to only\naggregate features along corresponding epipolar lines, thereby accessing an\noptimal amount of noisy conditions. Additionally, we address scenarios where\nepipolar lines disappear, commonly caused by rapid camera movements, dynamic\nobjects, or occlusions, ensuring robust performance in diverse environments.\nFurthermore, we develop a more robust and reproducible evaluation pipeline to\naddress the inaccuracies and instabilities of existing camera control metrics.\nOur method achieves a 25.64% improvement in camera controllability on the\nRealEstate10K dataset without compromising dynamics or generation quality and\ndemonstrates strong generalization to out-of-domain images. 
Training and\ninference require only 24GB and 12GB of memory, respectively, for 16-frame\nsequences at 256x256 resolution. We will release all checkpoints, along with\ntraining and evaluation code. Dynamic videos are best viewed at\nhttps://zgctroy.github.io/CamI2V.\n","authors":["Guangcong Zheng","Teng Li","Rui Jiang","Yehao Lu","Tao Wu","Xi Li"],"pdf_url":"https://arxiv.org/pdf/2410.15957v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.11268v5","updated":"2024-12-04T12:43:30Z","published":"2023-09-20T12:51:13Z","title":"StructChart: On the Schema, Metric, and Augmentation for Visual Chart\n Understanding","summary":" Charts are common in literature across various scientific fields, conveying\nrich information easily accessible to readers. Current chart-related tasks\nfocus on either chart perception that extracts information from the visual\ncharts, or chart reasoning given the extracted data, e.g. in a tabular form. In\nthis paper, we introduce StructChart, a novel framework that leverages\nStructured Triplet Representations (STR) to achieve a unified and\nlabel-efficient approach to chart perception and reasoning tasks, which is\ngenerally applicable to different downstream tasks, beyond the\nquestion-answering task as specifically studied in peer works. Specifically,\nStructChart first reformulates the chart data from the tubular form (linearized\nCSV) to STR, which can friendlily reduce the task gap between chart perception\nand reasoning. We then propose a Structuring Chart-oriented Representation\nMetric (SCRM) to quantitatively evaluate the chart perception task performance.\nTo augment the training, we further explore the potential of Large Language\nModels (LLMs) to enhance the diversity in both chart visual style and\nstatistical information. 
Extensive experiments on various chart-related tasks\ndemonstrate the effectiveness and potential of a unified chart\nperception-reasoning paradigm to push the frontier of chart understanding.\n","authors":["Renqiu Xia","Haoyang Peng","Hancheng Ye","Mingsheng Li","Xiangchao Yan","Peng Ye","Botian Shi","Yu Qiao","Junchi Yan","Bo Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.11268v5.pdf","comment":"All codes, models and SimChart9K data are available for downloading\n at: https://github.com/UniModal4Reasoning/ChartVLM and\n https://github.com/UniModal4Reasoning/SimChart9K"},{"id":"http://arxiv.org/abs/2407.18555v3","updated":"2024-12-04T12:42:04Z","published":"2024-07-26T07:08:05Z","title":"How to Segment in 3D Using 2D Models: Automated 3D Segmentation of\n Prostate Cancer Metastatic Lesions on PET Volumes Using Multi-angle Maximum\n Intensity Projections and Diffusion Models","summary":" Prostate specific membrane antigen (PSMA) positron emission\ntomography/computed tomography (PET/CT) imaging provides a tremendously\nexciting frontier in visualization of prostate cancer (PCa) metastatic lesions.\nHowever, accurate segmentation of metastatic lesions is challenging due to low\nsignal-to-noise ratios and variable sizes, shapes, and locations of the\nlesions. This study proposes a novel approach for automated segmentation of\nmetastatic lesions in PSMA PET/CT 3D volumetric images using 2D denoising\ndiffusion probabilistic models (DDPMs). Instead of 2D trans-axial slices or 3D\nvolumes, the proposed approach segments the lesions on generated multi-angle\nmaximum intensity projections (MA-MIPs) of the PSMA PET images, then obtains\nthe final 3D segmentation masks from 3D ordered subset expectation maximization\n(OSEM) reconstruction of 2D MA-MIPs segmentations. Our proposed method achieved\nsuperior performance compared to state-of-the-art 3D segmentation approaches in\nterms of accuracy and robustness in detecting and segmenting small metastatic\nPCa lesions. 
The proposed method has significant potential as a tool for\nquantitative analysis of metastatic burden in PCa patients.\n","authors":["Amirhosein Toosi","Sara Harsini","François Bénard","Carlos Uribe","Arman Rahmim"],"pdf_url":"https://arxiv.org/pdf/2407.18555v3.pdf","comment":"11 pages, 2 figures, accepted in the DGM4MICCAI workshop, MICCAI,\n 2024"},{"id":"http://arxiv.org/abs/2412.00153v2","updated":"2024-12-04T12:40:30Z","published":"2024-11-29T07:00:18Z","title":"ROSE: Revolutionizing Open-Set Dense Segmentation with Patch-Wise\n Perceptual Large Multimodal Model","summary":" Advances in CLIP and large multimodal models (LMMs) have enabled\nopen-vocabulary and free-text segmentation, yet existing models still require\npredefined category prompts, limiting free-form category self-generation. Most\nsegmentation LMMs also remain confined to sparse predictions, restricting their\napplicability in open-set environments. In contrast, we propose ROSE, a\nRevolutionary Open-set dense SEgmentation LMM, which enables dense mask\nprediction and open-category generation through patch-wise perception. Our\nmethod treats each image patch as an independent region of interest candidate,\nenabling the model to predict both dense and sparse masks simultaneously.\nAdditionally, a newly designed instruction-response paradigm takes full\nadvantage of the generation and generalization capabilities of LMMs, achieving\ncategory prediction independent of closed-set constraints or predefined\ncategories. To further enhance mask detail and category precision, we introduce\na conversation-based refinement paradigm, integrating the prediction result\nfrom previous step with textual prompt for revision. Extensive experiments\ndemonstrate that ROSE achieves competitive performance across various\nsegmentation tasks in a unified framework. 
Code will be released.\n","authors":["Kunyang Han","Yibo Hu","Mengxue Qu","Hailin Shi","Yao Zhao","Yunchao Wei"],"pdf_url":"https://arxiv.org/pdf/2412.00153v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03268v1","updated":"2024-12-04T12:23:17Z","published":"2024-12-04T12:23:17Z","title":"RFSR: Improving ISR Diffusion Models via Reward Feedback Learning","summary":" Generative diffusion models (DM) have been extensively utilized in image\nsuper-resolution (ISR). Most of the existing methods adopt the denoising loss\nfrom DDPMs for model optimization. We posit that introducing reward feedback\nlearning to finetune the existing models can further improve the quality of the\ngenerated images. In this paper, we propose a timestep-aware training strategy\nwith reward feedback learning. Specifically, in the initial denoising stages of\nISR diffusion, we apply low-frequency constraints to super-resolution (SR)\nimages to maintain structural stability. In the later denoising stages, we use\nreward feedback learning to improve the perceptual and aesthetic quality of the\nSR images. In addition, we incorporate Gram-KL regularization to alleviate\nstylization caused by reward hacking. Our method can be integrated into any\ndiffusion-based ISR model in a plug-and-play manner. Experiments show that ISR\ndiffusion models, when fine-tuned with our method, significantly improve the\nperceptual and aesthetic quality of SR images, achieving excellent subjective\nresults. Code: https://github.com/sxpro/RFSR\n","authors":["Xiaopeng Sun","Qinwei Lin","Yu Gao","Yujie Zhong","Chengjian Feng","Dengjie Li","Zheng Zhao","Jie Hu","Lin Ma"],"pdf_url":"https://arxiv.org/pdf/2412.03268v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.16316v5","updated":"2024-12-04T12:17:31Z","published":"2024-11-25T12:09:43Z","title":"Monocular Lane Detection Based on Deep Learning: A Survey","summary":" Lane detection plays an important role in autonomous driving perception\nsystems. 
As deep learning algorithms gain popularity, monocular lane detection\nmethods based on them have demonstrated superior performance and emerged as a\nkey research direction in autonomous driving perception. The core designs of\nthese algorithmic frameworks can be summarized as follows: (1) Task paradigm,\nfocusing on lane instance-level discrimination; (2) Lane modeling, representing\nlanes as a set of learnable parameters in the neural network; (3) Global\ncontext supplementation, enhancing inference on the obscure lanes; (4)\nPerspective effect elimination, providing accurate 3D lanes for downstream\napplications. From these perspectives, this paper presents a comprehensive\noverview of existing methods, encompassing both the increasingly mature 2D lane\ndetection approaches and the developing 3D lane detection works. Besides, this\npaper compares the performance of mainstream methods on different benchmarks\nand investigates their inference speed under a unified setting for fair\ncomparison. Moreover, we present some extended works on lane detection,\nincluding multi-task perception, video lane detection, online high-definition\nmap construction, and lane topology reasoning, to offer readers a comprehensive\nroadmap for the evolution of lane detection. Finally, we point out some\npotential future research directions in this field. 
We exhaustively collect the\npapers and codes of existing works at\nhttps://github.com/Core9724/Awesome-Lane-Detection and will keep tracing the\nresearch.\n","authors":["Xin He","Haiyun Guo","Kuan Zhu","Bingke Zhu","Xu Zhao","Jianwu Fang","Jinqiao Wang"],"pdf_url":"https://arxiv.org/pdf/2411.16316v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02245v2","updated":"2024-12-04T12:16:16Z","published":"2024-12-03T08:18:56Z","title":"SparseLGS: Sparse View Language Embedded Gaussian Splatting","summary":" Recently, several studies have combined Gaussian Splatting to obtain scene\nrepresentations with language embeddings for open-vocabulary 3D scene\nunderstanding. While these methods perform well, they essentially require very\ndense multi-view inputs, limiting their applicability in real-world scenarios.\nIn this work, we propose SparseLGS to address the challenge of 3D scene\nunderstanding with pose-free and sparse view input images. Our method leverages\na learning-based dense stereo model to handle pose-free and sparse inputs, and\na three-step region matching approach to address the multi-view semantic\ninconsistency problem, which is especially important for sparse inputs.\nDifferent from directly learning high-dimensional CLIP features, we extract\nlow-dimensional information and build bijections to avoid excessive learning\nand storage costs. We introduce a reconstruction loss during semantic training\nto improve Gaussian positions and shapes. To the best of our knowledge, we are\nthe first to address the 3D semantic field problem with sparse pose-free\ninputs. Experimental results show that SparseLGS achieves comparable quality\nwhen reconstructing semantic fields with fewer inputs (3-4 views) compared to\nprevious SOTA methods with dense input. Besides, when using the same sparse\ninput, SparseLGS leads significantly in quality and heavily improves the\ncomputation speed (5$\\times$speedup). 
Project page:\nhttps://ustc3dv.github.io/SparseLGS\n","authors":["Jun Hu","Zhang Chen","Zhong Li","Yi Xu","Juyong Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.02245v2.pdf","comment":"Project Page: https://ustc3dv.github.io/SparseLGS"},{"id":"http://arxiv.org/abs/2412.03263v1","updated":"2024-12-04T12:11:19Z","published":"2024-12-04T12:11:19Z","title":"NeRF and Gaussian Splatting SLAM in the Wild","summary":" Navigating outdoor environments with visual Simultaneous Localization and\nMapping (SLAM) systems poses significant challenges due to dynamic scenes,\nlighting variations, and seasonal changes, requiring robust solutions. While\ntraditional SLAM methods struggle with adaptability, deep learning-based\napproaches and emerging neural radiance fields as well as Gaussian\nSplatting-based SLAM methods, offer promising alternatives. However, these\nmethods have primarily been evaluated in controlled indoor environments with\nstable conditions, leaving a gap in understanding their performance in\nunstructured and variable outdoor settings. This study addresses this gap by\nevaluating these methods in natural outdoor environments, focusing on camera\ntracking accuracy, robustness to environmental factors, and computational\nefficiency, highlighting distinct trade-offs. Extensive evaluations demonstrate\nthat neural SLAM methods achieve superior robustness, particularly under\nchallenging conditions such as low light, but at a high computational cost. At\nthe same time, traditional methods perform the best across seasons but are\nhighly sensitive to variations in lighting conditions. 
The code of the\nbenchmark is publicly available at\nhttps://github.com/iis-esslingen/nerf-3dgs-benchmark.\n","authors":["Fabian Schmidt","Markus Enzweiler","Abhinav Valada"],"pdf_url":"https://arxiv.org/pdf/2412.03263v1.pdf","comment":"5 pages, 2 figures, 4 tables"},{"id":"http://arxiv.org/abs/2412.03261v1","updated":"2024-12-04T12:07:20Z","published":"2024-12-04T12:07:20Z","title":"Is JPEG AI going to change image forensics?","summary":" In this paper, we investigate the counter-forensic effects of the forthcoming\nJPEG AI standard based on neural image compression, focusing on two critical\nareas: deepfake image detection and image splicing localization. Neural image\ncompression leverages advanced neural network algorithms to achieve higher\ncompression rates while maintaining image quality. However, it introduces\nartifacts that closely resemble those generated by image synthesis techniques\nand image splicing pipelines, complicating the work of researchers when\ndiscriminating pristine from manipulated content. We comprehensively analyze\nJPEG AI's counter-forensic effects through extensive experiments on several\nstate-of-the-art detectors and datasets. Our results demonstrate that an\nincrease in false alarms impairs the performance of leading forensic detectors\nwhen analyzing genuine content processed through JPEG AI. 
By exposing the\nvulnerabilities of the available forensic tools we aim to raise the urgent need\nfor multimedia forensics researchers to include JPEG AI images in their\nexperimental setups and develop robust forensic techniques to distinguish\nbetween neural compression artifacts and actual manipulations.\n","authors":["Edoardo Daniele Cannas","Sara Mandelli","Natasa Popovic","Ayman Alkhateeb","Alessandro Gnutti","Paolo Bestagini","Stefano Tubaro"],"pdf_url":"https://arxiv.org/pdf/2412.03261v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03259v1","updated":"2024-12-04T11:59:36Z","published":"2024-12-04T11:59:36Z","title":"GERD: Geometric event response data generation","summary":" Event-based vision sensors are appealing because of their time resolution,\nhigher dynamic range, and low-power consumption. They also provide data that is\nfundamentally different from conventional frame-based cameras: events are\nsparse, discrete, and require integration in time. Unlike conventional models\ngrounded in established geometric and physical principles, event-based models\nlack comparable foundations. We introduce a method to generate event-based data\nunder controlled transformations. Specifically, we subject a prototypical\nobject to transformations that change over time to produce carefully curated\nevent videos. We hope this work simplifies studies for geometric approaches in\nevent-based vision. 
GERD is available at https://github.com/ncskth/gerd\n","authors":["Jens Egholm Pedersen","Dimitris Korakovounis","Jörg Conradt"],"pdf_url":"https://arxiv.org/pdf/2412.03259v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03255v1","updated":"2024-12-04T11:54:57Z","published":"2024-12-04T11:54:57Z","title":"DynamicControl: Adaptive Condition Selection for Improved Text-to-Image\n Generation","summary":" To enhance the controllability of text-to-image diffusion models, current\nControlNet-like models have explored various control signals to dictate image\nattributes. However, existing methods either handle conditions inefficiently or\nuse a fixed number of conditions, which does not fully address the complexity\nof multiple conditions and their potential conflicts. This underscores the need\nfor innovative approaches to manage multiple conditions effectively for more\nreliable and detailed image synthesis. To address this issue, we propose a\nnovel framework, DynamicControl, which supports dynamic combinations of diverse\ncontrol signals, allowing adaptive selection of different numbers and types of\nconditions. Our approach begins with a double-cycle controller that generates\nan initial real score sorting for all input conditions by leveraging\npre-trained conditional generation models and discriminative models. This\ncontroller evaluates the similarity between extracted conditions and input\nconditions, as well as the pixel-level similarity with the source image. Then,\nwe integrate a Multimodal Large Language Model (MLLM) to build an efficient\ncondition evaluator. This evaluator optimizes the ordering of conditions based\non the double-cycle controller's score ranking. Our method jointly optimizes\nMLLMs and diffusion models, utilizing MLLMs' reasoning capabilities to\nfacilitate multi-condition text-to-image (T2I) tasks. 
The final sorted\nconditions are fed into a parallel multi-control adapter, which learns feature\nmaps from dynamic visual conditions and integrates them to modulate ControlNet,\nthereby enhancing control over generated images. Through both quantitative and\nqualitative comparisons, DynamicControl demonstrates its superiority over\nexisting methods in terms of controllability, generation quality and\ncomposability under various conditional controls.\n","authors":["Qingdong He","Jinlong Peng","Pengcheng Xu","Boyuan Jiang","Xiaobin Hu","Donghao Luo","Yong Liu","Yabiao Wang","Chengjie Wang","Xiangtai Li","Jiangning Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.03255v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03248v1","updated":"2024-12-04T11:47:57Z","published":"2024-12-04T11:47:57Z","title":"AIM: Adaptive Inference of Multi-Modal LLMs via Token Merging and\n Pruning","summary":" Large language models (LLMs) have enabled the creation of multi-modal LLMs\nthat exhibit strong comprehension of visual data such as images and videos.\nHowever, these models usually rely on extensive visual tokens from visual\nencoders, leading to high computational demands, which limits their\napplicability in resource-constrained environments and for long-context tasks.\nIn this work, we propose a training-free adaptive inference method for\nmulti-modal LLMs that can accommodate a broad range of efficiency requirements\nwith a minimum performance drop. Our method consists of a) iterative token\nmerging based on embedding similarity before LLMs, and b) progressive token\npruning within LLM layers based on multi-modal importance. With a minimalist\ndesign, our method can be applied to both video and image LLMs. Extensive\nexperiments on diverse video and image benchmarks demonstrate that, our method\nsubstantially reduces computation load (e.g., a $\\textbf{7-fold}$ reduction in\nFLOPs) while preserving the performance of video and image LLMs. 
Further, under\na similar computational cost, our method outperforms the state-of-the-art\nmethods in long video understanding (e.g., $\\textbf{+4.6}$ on MLVU).\nAdditionally, our in-depth analysis provides insights into token redundancy and\nLLM layer behaviors, offering guidance for future research in designing\nefficient multi-modal LLMs. Our code will be available at\nhttps://github.com/LaVi-Lab/AIM.\n","authors":["Yiwu Zhong","Zhuoming Liu","Yin Li","Liwei Wang"],"pdf_url":"https://arxiv.org/pdf/2412.03248v1.pdf","comment":"12 pages, 2 figures"},{"id":"http://arxiv.org/abs/2409.09318v3","updated":"2024-12-04T11:44:57Z","published":"2024-09-14T05:31:29Z","title":"ODE: Open-Set Evaluation of Hallucinations in Multimodal Large Language\n Models","summary":" Hallucination poses a persistent challenge for multimodal large language\nmodels (MLLMs). However, existing benchmarks for evaluating hallucinations are\ngenerally static, which may overlook the potential risk of data contamination.\nTo address this issue, we propose ODE, an open-set, dynamic protocol designed\nto evaluate object hallucinations in MLLMs at both the existence and attribute\nlevels. ODE employs a graph-based structure to represent real-world object\nconcepts, their attributes, and the distributional associations between them.\nThis structure facilitates the extraction of concept combinations based on\ndiverse distributional criteria, generating varied samples for structured\nqueries that evaluate hallucinations in both generative and discriminative\ntasks. Through the generation of new samples, dynamic concept combinations, and\nvaried distribution frequencies, ODE mitigates the risk of data contamination\nand broadens the scope of evaluation. 
This protocol is applicable to both\ngeneral and specialized scenarios, including those with limited data.\nExperimental results demonstrate the effectiveness of our protocol, revealing\nthat MLLMs exhibit higher hallucination rates when evaluated with ODE-generated\nsamples, which indicates potential data contamination. Furthermore, these\ngenerated samples aid in analyzing hallucination patterns and fine-tuning\nmodels, offering an effective approach to mitigating hallucinations in MLLMs.\n","authors":["Yahan Tu","Rui Hu","Jitao Sang"],"pdf_url":"https://arxiv.org/pdf/2409.09318v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03240v1","updated":"2024-12-04T11:42:17Z","published":"2024-12-04T11:42:17Z","title":"Task-driven Image Fusion with Learnable Fusion Loss","summary":" Multi-modal image fusion aggregates information from multiple sensor sources,\nachieving superior visual quality and perceptual characteristics compared to\nany single source, often enhancing downstream tasks. However, current fusion\nmethods for downstream tasks still use predefined fusion objectives that\npotentially mismatch the downstream tasks, limiting adaptive guidance and\nreducing model flexibility. To address this, we propose Task-driven Image\nFusion (TDFusion), a fusion framework incorporating a learnable fusion loss\nguided by task loss. Specifically, our fusion loss includes learnable\nparameters modeled by a neural network called the loss generation module. This\nmodule is supervised by the loss of downstream tasks in a meta-learning manner.\nThe learning objective is to minimize the task loss of the fused images, once\nthe fusion module has been optimized by the fusion loss. Iterative updates\nbetween the fusion module and the loss module ensure that the fusion network\nevolves toward minimizing task loss, guiding the fusion process toward the task\nobjectives. TDFusion's training relies solely on the loss of downstream tasks,\nmaking it adaptable to any specific task. 
It can be applied to any architecture\nof fusion and task networks. Experiments demonstrate TDFusion's performance in\nboth fusion and task-related applications, including four public fusion\ndatasets, semantic segmentation, and object detection. The code will be\nreleased.\n","authors":["Haowen Bai","Jiangshe Zhang","Zixiang Zhao","Yichen Wu","Lilun Deng","Yukun Cui","Tao Feng","Shuang Xu"],"pdf_url":"https://arxiv.org/pdf/2412.03240v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.11243v4","updated":"2024-12-04T11:23:37Z","published":"2024-04-17T10:49:00Z","title":"Multi-Sensor Diffusion-Driven Optical Image Translation for Large-Scale\n Applications","summary":" Comparing images captured by disparate sensors is a common challenge in\nremote sensing. This requires image translation -- converting imagery from one\nsensor domain to another while preserving the original content. Denoising\nDiffusion Implicit Models (DDIM) are potential state-of-the-art solutions for\nsuch domain translation due to their proven superiority in multiple\nimage-to-image translation tasks in computer vision. However, these models\nstruggle with reproducing radiometric features of large-scale multi-patch\nimagery, resulting in inconsistencies across the full image. This renders\ndownstream tasks like Heterogeneous Change Detection impractical. To overcome\nthese limitations, we propose a method that leverages denoising diffusion for\neffective multi-sensor optical image translation over large areas. Our approach\nsuper-resolves large-scale low spatial resolution images into high-resolution\nequivalents from disparate optical sensors, ensuring uniformity across hundreds\nof patches. 
Our contributions lie in new forward and reverse diffusion\nprocesses that address the challenges of large-scale image translation.\nExtensive experiments using paired Sentinel-II (10m) and Planet Dove (3m)\nimages demonstrate that our approach provides precise domain adaptation,\npreserving image content while improving radiometric accuracy and feature\nrepresentation. A thorough image quality assessment and comparisons with the\nstandard DDIM framework and five other leading methods are presented. We reach\na mean Learned Perceptual Image Patch Similarity (mLPIPS) of 0.1884 and a\nFr\\'echet Inception Distance (FID) of 45.64, expressively outperforming all\ncompared methods, including DDIM, ShuffleMixer, and SwinIR. The usefulness of\nour approach is further demonstrated in two Heterogeneous Change Detection\ntasks.\n","authors":["João Gabriel Vinholi","Marco Chini","Anis Amziane","Renato Machado","Danilo Silva","Patrick Matgen"],"pdf_url":"https://arxiv.org/pdf/2404.11243v4.pdf","comment":"This is the accepted version of the manuscript published in IEEE\n Journal of Selected Topics in Applied Earth Observations and Remote Sensing\n (JSTARS). Please access the final version at IEEEXplore (Open Access). DOI\n 10.1109/JSTARS.2024.3506032. This technology is protected by a patent filed\n on 23 december 2023 at Office Luxembourgeois de la propri\\'et\\'e\n intellectuelle (LU505861)"},{"id":"http://arxiv.org/abs/2412.03225v1","updated":"2024-12-04T11:23:15Z","published":"2024-12-04T11:23:15Z","title":"MaterialPicker: Multi-Modal Material Generation with Diffusion\n Transformers","summary":" High-quality material generation is key for virtual environment authoring and\ninverse rendering. We propose MaterialPicker, a multi-modal material generator\nleveraging a Diffusion Transformer (DiT) architecture, improving and\nsimplifying the creation of high-quality materials from text prompts and/or\nphotographs. 
Our method can generate a material based on an image crop of a\nmaterial sample, even if the captured surface is distorted, viewed at an angle\nor partially occluded, as is often the case in photographs of natural scenes.\nWe further allow the user to specify a text prompt to provide additional\nguidance for the generation. We finetune a pre-trained DiT-based video\ngenerator into a material generator, where each material map is treated as a\nframe in a video sequence. We evaluate our approach both quantitatively and\nqualitatively and show that it enables more diverse material generation and\nbetter distortion correction than previous work.\n","authors":["Xiaohe Ma","Valentin Deschaintre","Miloš Hašan","Fujun Luan","Kun Zhou","Hongzhi Wu","Yiwei Hu"],"pdf_url":"https://arxiv.org/pdf/2412.03225v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.16204v3","updated":"2024-12-04T11:14:20Z","published":"2024-06-23T20:00:20Z","title":"Breaking the Frame: Visual Place Recognition by Overlap Prediction","summary":" Visual place recognition methods struggle with occlusions and partial visual\noverlaps. We propose a novel visual place recognition approach based on overlap\nprediction, called VOP, shifting from traditional reliance on global image\nsimilarities and local features to image overlap prediction. VOP proceeds\nco-visible image sections by obtaining patch-level embeddings using a Vision\nTransformer backbone and establishing patch-to-patch correspondences without\nrequiring expensive feature detection and matching. Our approach uses a voting\nmechanism to assess overlap scores for potential database images. It provides a\nnuanced image retrieval metric in challenging scenarios. Experimental results\nshow that VOP leads to more accurate relative pose estimation and localization\nresults on the retrieved image pairs than state-of-the-art baselines on a\nnumber of large-scale, real-world indoor and outdoor benchmarks. 
The code is\navailable at https://github.com/weitong8591/vop.git.\n","authors":["Tong Wei","Philipp Lindenberger","Jiri Matas","Daniel Barath"],"pdf_url":"https://arxiv.org/pdf/2406.16204v3.pdf","comment":"WACV 2025"},{"id":"http://arxiv.org/abs/2407.05650v2","updated":"2024-12-04T11:12:42Z","published":"2024-07-08T06:22:10Z","title":"The Cooperative Network Architecture: Learning Structured Networks as\n Representation of Sensory Patterns","summary":" Nets, cooperative networks of neurons, have been proposed as format for the\nrepresentation of sensory signals, as physical implementation of the Gestalt\nphenomenon and as solution to the neural binding problem, while the direct\ninteraction between nets by structure-sensitive matching has been proposed as\nbasis for object-global operations such as object detection. The nets are\nflexibly composed of overlapping net fragments, which are learned from\nstatistical regularities of sensory input. We here present the cooperative\nnetwork architecture (CNA), a concrete model that learns such net structure to\nrepresent input patterns and deals robustly with noise, deformation, and\nout-of-distribution data, thus laying the groundwork for a novel neural\narchitecture.\n","authors":["Pascal J. Sager","Jan M. Deriu","Benjamin F. Grewe","Thilo Stadelmann","Christoph von der Malsburg"],"pdf_url":"https://arxiv.org/pdf/2407.05650v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03215v1","updated":"2024-12-04T11:08:32Z","published":"2024-12-04T11:08:32Z","title":"Beyond [cls]: Exploring the true potential of Masked Image Modeling\n representations","summary":" Masked Image Modeling (MIM) has emerged as a popular method for\nSelf-Supervised Learning (SSL) of visual representations. However, for\nhigh-level perception tasks, MIM-pretrained models offer lower out-of-the-box\nrepresentation quality than the Joint-Embedding Architectures (JEA) - another\nprominent SSL paradigm. 
To understand this performance gap, we analyze the\ninformation flow in Vision Transformers (ViT) learned by both approaches. We\nreveal that whereas JEAs construct their representation on a selected set of\nrelevant image fragments, MIM models aggregate nearly whole image content.\nMoreover, we demonstrate that MIM-trained ViTs retain valuable information\nwithin their patch tokens, which is not effectively captured by the global\n[cls] token representations. Therefore, selective aggregation of relevant patch\ntokens, without any fine-tuning, results in consistently higher-quality of MIM\nrepresentations. To our knowledge, we are the first to highlight the lack of\neffective representation aggregation as an emergent issue of MIM and propose\ndirections to address it, contributing to future advances in Self-Supervised\nLearning.\n","authors":["Marcin Przewięźlikowski","Randall Balestriero","Wojciech Jasiński","Marek Śmieja","Bartosz Zieliński"],"pdf_url":"https://arxiv.org/pdf/2412.03215v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03214v1","updated":"2024-12-04T11:05:01Z","published":"2024-12-04T11:05:01Z","title":"Continual Low-Rank Scaled Dot-product Attention","summary":" Transformers are widely used for their ability to capture data relations in\nsequence processing, with great success for a wide range of static tasks.\nHowever, the computational and memory footprint of their main component, i.e.,\nthe Scaled Dot-product Attention, is commonly overlooked. This makes their\nadoption in applications involving stream data processing with constraints in\nresponse latency, computational and memory resources infeasible. Some works\nhave proposed methods to lower the computational cost of transformers, i.e.\nlow-rank approximations, sparsity in attention, and efficient formulations for\nContinual Inference. 
In this paper, we introduce a new formulation of the\nScaled Dot-product Attention based on the Nystr\\\"om approximation that is\nsuitable for Continual Inference. In experiments on Online Audio Classification\nand Online Action Detection tasks, the proposed Continual Scaled Dot-product\nAttention can lower the number of operations by up to three orders of magnitude\ncompared to the original Transformers while retaining the predictive\nperformance of competing models.\n","authors":["Ginés Carreto Picón","Illia Oleksiienko","Lukas Hedegaard","Arian Bakhtiarnia","Alexandros Iosifidis"],"pdf_url":"https://arxiv.org/pdf/2412.03214v1.pdf","comment":"11 pages, 7 figures"},{"id":"http://arxiv.org/abs/2412.03212v1","updated":"2024-12-04T10:57:55Z","published":"2024-12-04T10:57:55Z","title":"Semi-Supervised Transfer Boosting (SS-TrBoosting)","summary":" Semi-supervised domain adaptation (SSDA) aims at training a high-performance\nmodel for a target domain using few labeled target data, many unlabeled target\ndata, and plenty of auxiliary data from a source domain. Previous works in SSDA\nmainly focused on learning transferable representations across domains.\nHowever, it is difficult to find a feature space where the source and target\ndomains share the same conditional probability distribution. Additionally,\nthere is no flexible and effective strategy extending existing unsupervised\ndomain adaptation (UDA) approaches to SSDA settings. In order to solve the\nabove two challenges, we propose a novel fine-tuning framework, semi-supervised\ntransfer boosting (SS-TrBoosting). Given a well-trained deep learning-based UDA\nor SSDA model, we use it as the initial model, generate additional base\nlearners by boosting, and then use all of them as an ensemble. More\nspecifically, half of the base learners are generated by supervised domain\nadaptation, and half by semi-supervised learning. 
Furthermore, for more\nefficient data transmission and better data privacy protection, we propose a\nsource data generation approach to extend SS-TrBoosting to semi-supervised\nsource-free domain adaptation (SS-SFDA). Extensive experiments showed that\nSS-TrBoosting can be applied to a variety of existing UDA, SSDA and SFDA\napproaches to further improve their performance.\n","authors":["Lingfei Deng","Changming Zhao","Zhenbang Du","Kun Xia","Dongrui Wu"],"pdf_url":"https://arxiv.org/pdf/2412.03212v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03210v1","updated":"2024-12-04T10:55:44Z","published":"2024-12-04T10:55:44Z","title":"Parametric Enhancement of PerceptNet: A Human-Inspired Approach for\n Image Quality Assessment","summary":" While deep learning models can learn human-like features at earlier levels,\nwhich suggests their utility in modeling human vision, few attempts exist to\nincorporate these features by design. Current approaches mostly optimize all\nparameters blindly, only constraining minor architectural aspects. This paper\ndemonstrates how parametrizing neural network layers enables more\nbiologically-plausible operations while reducing trainable parameters and\nimproving interpretability. We constrain operations to functional forms present\nin human vision, optimizing only these functions' parameters rather than all\nconvolutional tensor elements independently. We present two parametric model\nversions: one with hand-chosen biologically plausible parameters, and another\nfitted to human perception experimental data. We compare these with a\nnon-parametric version. All models achieve comparable state-of-the-art results,\nwith parametric versions showing orders of magnitude parameter reduction for\nminimal performance loss. The parametric models demonstrate improved\ninterpretability and training behavior. Notably, the model fitted to human\nperception, despite biological initialization, converges to biologically\nincorrect results. 
This raises scientific questions and highlights the need for\ndiverse evaluation methods to measure models' humanness, rather than assuming\ntask performance correlates with human-like behavior.\n","authors":["Jorge Vila-Tomás","Pablo Hernández-Cámara","Valero Laparra","Jesús Malo"],"pdf_url":"https://arxiv.org/pdf/2412.03210v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03098v2","updated":"2024-12-04T10:52:25Z","published":"2024-11-05T13:44:25Z","title":"Local Lesion Generation is Effective for Capsule Endoscopy Image Data\n Augmentation in a Limited Data Setting","summary":" Limited medical imaging datasets challenge deep learning models by increasing\nrisks of overfitting and reduced generalization, particularly in Generative\nAdversarial Networks (GANs), where discriminators may overfit, leading to\ntraining divergence. This constraint also impairs classification models trained\non small datasets. Generative Data Augmentation (GDA) addresses this by\nexpanding training datasets with synthetic data, although it requires training\na generative model. We propose and evaluate two local lesion generation\napproaches to address the challenge of augmenting small medical image datasets.\nThe first approach employs the Poisson Image Editing algorithm, a classical\nimage processing technique, to create realistic image composites that\noutperform current state-of-the-art methods. The second approach introduces a\nnovel generative method, leveraging a fine-tuned Image Inpainting GAN to\nsynthesize realistic lesions within specified regions of real training images.\nA comprehensive comparison of the two proposed methods demonstrates that\neffective local lesion generation in a data-constrained setting allows for\nreaching new state-of-the-art results in capsule endoscopy lesion\nclassification. Combination of our techniques achieves a macro F1-score of\n33.07%, surpassing the previous best result by 7.84 percentage points (p.p.) 
on\nthe highly imbalanced Kvasir Capsule Dataset, a benchmark for capsule\nendoscopy. To the best of our knowledge, this work is the first to apply a\nfine-tuned Image Inpainting GAN for GDA in medical imaging, demonstrating that\nan image-conditional GAN can be adapted effectively to limited datasets to\ngenerate high-quality examples, facilitating effective data augmentation.\nAdditionally, we show that combining this GAN-based approach with classical\nimage processing techniques further improves the results.\n","authors":["Adrian B. Chłopowiec","Adam R. Chłopowiec","Krzysztof Galus","Wojciech Cebula","Martin Tabakov"],"pdf_url":"https://arxiv.org/pdf/2411.03098v2.pdf","comment":"54 pages, 35 figures"},{"id":"http://arxiv.org/abs/2412.03200v1","updated":"2024-12-04T10:40:17Z","published":"2024-12-04T10:40:17Z","title":"Fab-ME: A Vision State-Space and Attention-Enhanced Framework for Fabric\n Defect Detection","summary":" Effective defect detection is critical for ensuring the quality,\nfunctionality, and economic value of textile products. However, existing\nmethods face challenges in achieving high accuracy, real-time performance, and\nefficient global information extraction. To address these issues, we propose\nFab-ME, an advanced framework based on YOLOv8s, specifically designed for the\naccurate detection of 20 fabric defect types. Our contributions include the\nintroduction of the cross-stage partial bottleneck with two convolutions (C2F)\nvision state-space (C2F-VMamba) module, which integrates visual state-space\n(VSS) blocks into the YOLOv8s feature fusion network neck, enhancing the\ncapture of intricate details and global context while maintaining high\nprocessing speeds. Additionally, we incorporate an enhanced multi-scale channel\nattention (EMCA) module into the final layer of the feature extraction network,\nsignificantly improving sensitivity to small targets. 
Experimental results on\nthe Tianchi fabric defect detection dataset demonstrate that Fab-ME achieves a\n3.3\\% improvement in mAP@0.5 compared to the original YOLOv8s, validating its\neffectiveness for precise and efficient fabric defect detection.\n","authors":["Shuai Wang","Huiyan Kong","Baotian Li","Fa Zheng"],"pdf_url":"https://arxiv.org/pdf/2412.03200v1.pdf","comment":"6 pages, 5 figures"},{"id":"http://arxiv.org/abs/2403.08004v2","updated":"2024-12-04T10:35:25Z","published":"2024-03-12T18:12:50Z","title":"Leveraging LLMs for On-the-Fly Instruction Guided Image Editing","summary":" The combination of language processing and image processing keeps attracting\nincreased interest given recent impressive advances that leverage the combined\nstrengths of both domains of research. Among these advances, the task of\nediting an image on the basis solely of a natural language instruction stands\nout as a most challenging endeavour. While recent approaches for this task\nresort, in one way or other, to some form of preliminary preparation, training\nor fine-tuning, this paper explores a novel approach: We propose a\npreparation-free method that permits instruction-guided image editing on the\nfly. This approach is organized along three steps properly orchestrated that\nresort to image captioning and DDIM inversion, followed by obtaining the edit\ndirection embedding, followed by image editing proper. 
While dispensing with\npreliminary preparation, our approach demonstrates to be effective and\ncompetitive, outperforming recent, state of the art models for this task when\nevaluated on the MAGICBRUSH dataset.\n","authors":["Rodrigo Santos","João Silva","António Branco"],"pdf_url":"https://arxiv.org/pdf/2403.08004v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03192v1","updated":"2024-12-04T10:25:53Z","published":"2024-12-04T10:25:53Z","title":"Biologically-inspired Semi-supervised Semantic Segmentation for\n Biomedical Imaging","summary":" We propose a novel two-stage semi-supervised learning approach for training\ndownsampling-upsampling semantic segmentation architectures. The first stage\ndoes not use backpropagation. Rather, it exploits the bio-inspired Hebbian\nprinciple \"fire together, wire together\" as a local learning rule for updating\nthe weights of both convolutional and transpose-convolutional layers, allowing\nunsupervised discovery of data features. In the second stage, the model is\nfine-tuned with standard backpropagation on a small subset of labeled data. We\nevaluate our methodology through experiments conducted on several widely used\nbiomedical datasets, deeming that this domain is paramount in computer vision\nand is notably impacted by data scarcity. Results show that our proposed method\noutperforms SOTA approaches across different levels of label availability.\nFurthermore, we show that using our unsupervised stage to initialize the SOTA\napproaches leads to performance improvements. 
The code to replicate our\nexperiments can be found at:\nhttps://github.com/ciampluca/hebbian-medical-image-segmentation\n","authors":["Luca Ciampi","Gabriele Lagani","Giuseppe Amato","Fabrizio Falchi"],"pdf_url":"https://arxiv.org/pdf/2412.03192v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16221v4","updated":"2024-12-04T10:25:18Z","published":"2023-12-24T11:05:10Z","title":"STRIDE: Single-video based Temporally Continuous Occlusion-Robust 3D\n Pose Estimation","summary":" The capability to accurately estimate 3D human poses is crucial for diverse\nfields such as action recognition, gait recognition, and virtual/augmented\nreality. However, a persistent and significant challenge within this field is\nthe accurate prediction of human poses under conditions of severe occlusion.\nTraditional image-based estimators struggle with heavy occlusions due to a lack\nof temporal context, resulting in inconsistent predictions. While video-based\nmodels benefit from processing temporal data, they encounter limitations when\nfaced with prolonged occlusions that extend over multiple frames. This\nchallenge arises because these models struggle to generalize beyond their\ntraining datasets, and the variety of occlusions is hard to capture in the\ntraining data. Addressing these challenges, we propose STRIDE (Single-video\nbased TempoRally contInuous Occlusion-Robust 3D Pose Estimation), a novel\nTest-Time Training (TTT) approach to fit a human motion prior for each video.\nThis approach specifically handles occlusions that were not encountered during\nthe model's training. By employing STRIDE, we can refine a sequence of noisy\ninitial pose estimates into accurate, temporally coherent poses during test\ntime, effectively overcoming the limitations of prior methods. Our framework\ndemonstrates flexibility by being model-agnostic, allowing us to use any\noff-the-shelf 3D pose estimation method for improving robustness and temporal\nconsistency. 
We validate STRIDE's efficacy through comprehensive experiments on\nchallenging datasets like Occluded Human3.6M, Human3.6M, and OCMotion, where it\nnot only outperforms existing single-image and video-based pose estimation\nmodels but also showcases superior handling of substantial occlusions,\nachieving fast, robust, accurate, and temporally consistent 3D pose estimates.\nCode is made publicly available at https://github.com/take2rohit/stride\n","authors":["Rohit Lal","Saketh Bachu","Yash Garg","Arindam Dutta","Calvin-Khang Ta","Dripta S. Raychaudhuri","Hannah Dela Cruz","M. Salman Asif","Amit K. Roy-Chowdhury"],"pdf_url":"https://arxiv.org/pdf/2312.16221v4.pdf","comment":"Paper accepted at IEEE/CVF Winter Conference on Applications of\n Computer Vision (WACV)-2025"},{"id":"http://arxiv.org/abs/2409.02224v2","updated":"2024-12-04T10:24:43Z","published":"2024-09-03T18:53:32Z","title":"EgoPressure: A Dataset for Hand Pressure and Pose Estimation in\n Egocentric Vision","summary":" Touch contact and pressure are essential for understanding how humans\ninteract with and manipulate objects, insights which can significantly benefit\napplications in mixed reality and robotics. However, estimating these\ninteractions from an egocentric camera perspective is challenging, largely due\nto the lack of comprehensive datasets that provide both accurate hand poses on\ncontacting surfaces and detailed annotations of pressure information. In this\npaper, we introduce EgoPressure, a novel egocentric dataset that captures\ndetailed touch contact and pressure interactions. EgoPressure provides\nhigh-resolution pressure intensity annotations for each contact point and\nincludes accurate hand pose meshes obtained through our proposed multi-view,\nsequence-based optimization method processing data from an 8-camera capture\nrig. 
Our dataset comprises 5 hours of recorded interactions from 21\nparticipants captured simultaneously by one head-mounted and seven stationary\nKinect cameras, which acquire RGB images and depth maps at 30 Hz. To support\nfuture research and benchmarking, we present several baseline models for\nestimating applied pressure on external surfaces from RGB images, with and\nwithout hand pose information. We further explore the joint estimation of the\nhand mesh and applied pressure. Our experiments demonstrate that pressure and\nhand pose are complementary for understanding hand-object interactions. ng of\nhand-object interactions in AR/VR and robotics research. Project page:\n\\url{https://yiming-zhao.github.io/EgoPressure/}.\n","authors":["Yiming Zhao","Taein Kwon","Paul Streli","Marc Pollefeys","Christian Holz"],"pdf_url":"https://arxiv.org/pdf/2409.02224v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10962v2","updated":"2024-12-04T10:09:46Z","published":"2024-01-19T11:45:31Z","title":"One Step Learning, One Step Review","summary":" Visual fine-tuning has garnered significant attention with the rise of\npre-trained vision models. The current prevailing method, full fine-tuning,\nsuffers from the issue of knowledge forgetting as it focuses solely on fitting\nthe downstream training set. In this paper, we propose a novel weight\nrollback-based fine-tuning method called OLOR (One step Learning, One step\nReview). OLOR combines fine-tuning with optimizers, incorporating a weight\nrollback term into the weight update term at each step. This ensures\nconsistency in the weight range of upstream and downstream models, effectively\nmitigating knowledge forgetting and enhancing fine-tuning performance. In\naddition, a layer-wise penalty is presented to employ penalty decay and the\ndiversified decay rate to adjust the weight rollback levels of layers for\nadapting varying downstream tasks. 
Through extensive experiments on various\ntasks such as image classification, object detection, semantic segmentation,\nand instance segmentation, we demonstrate the general applicability and\nstate-of-the-art performance of our proposed OLOR. Code is available at\nhttps://github.com/rainbow-xiao/OLOR-AAAI-2024.\n","authors":["Xiaolong Huang","Qiankun Li","Xueran Li","Xuesong Gao"],"pdf_url":"https://arxiv.org/pdf/2401.10962v2.pdf","comment":"Published at the 38th AAAI Conference on Artificial Intelligence\n (AAAI 2024)"},{"id":"http://arxiv.org/abs/2412.03179v1","updated":"2024-12-04T10:05:47Z","published":"2024-12-04T10:05:47Z","title":"Optimizing Dense Visual Predictions Through Multi-Task Coherence and\n Prioritization","summary":" Multi-Task Learning (MTL) involves the concurrent training of multiple tasks,\noffering notable advantages for dense prediction tasks in computer vision. MTL\nnot only reduces training and inference time as opposed to having multiple\nsingle-task models, but also enhances task accuracy through the interaction of\nmultiple tasks. However, existing methods face limitations. They often rely on\nsuboptimal cross-task interactions, resulting in task-specific predictions with\npoor geometric and predictive coherence. In addition, many approaches use\ninadequate loss weighting strategies, which do not address the inherent\nvariability in task evolution during training. To overcome these challenges, we\npropose an advanced MTL model specifically designed for dense vision tasks. Our\nmodel leverages state-of-the-art vision transformers with task-specific\ndecoders. To enhance cross-task coherence, we introduce a trace-back method\nthat improves both cross-task geometric and predictive features. 
Furthermore,\nwe present a novel dynamic task balancing approach that projects task losses\nonto a common scale and prioritizes more challenging tasks during training.\nExtensive experiments demonstrate the superiority of our method, establishing\nnew state-of-the-art performance across two benchmark datasets. The code is\navailable at:https://github.com/Klodivio355/MT-CP\n","authors":["Maxime Fontana","Michael Spratling","Miaojing Shi"],"pdf_url":"https://arxiv.org/pdf/2412.03179v1.pdf","comment":"Accepted by WACV 2025"},{"id":"http://arxiv.org/abs/2412.03178v1","updated":"2024-12-04T10:03:52Z","published":"2024-12-04T10:03:52Z","title":"Towards Understanding and Quantifying Uncertainty for Text-to-Image\n Generation","summary":" Uncertainty quantification in text-to-image (T2I) generative models is\ncrucial for understanding model behavior and improving output reliability. In\nthis paper, we are the first to quantify and evaluate the uncertainty of T2I\nmodels with respect to the prompt. Alongside adapting existing approaches\ndesigned to measure uncertainty in the image space, we also introduce\nPrompt-based UNCertainty Estimation for T2I models (PUNC), a novel method\nleveraging Large Vision-Language Models (LVLMs) to better address uncertainties\narising from the semantics of the prompt and generated images. PUNC utilizes a\nLVLM to caption a generated image, and then compares the caption with the\noriginal prompt in the more semantically meaningful text space. PUNC also\nenables the disentanglement of both aleatoric and epistemic uncertainties via\nprecision and recall, which image-space approaches are unable to do. Extensive\nexperiments demonstrate that PUNC outperforms state-of-the-art uncertainty\nestimation techniques across various settings. Uncertainty quantification in\ntext-to-image generation models can be used on various applications including\nbias detection, copyright protection, and OOD detection. 
We also introduce a\ncomprehensive dataset of text prompts and generation pairs to foster further\nresearch in uncertainty quantification for generative models. Our findings\nillustrate that PUNC not only achieves competitive performance but also enables\nnovel applications in evaluating and improving the trustworthiness of\ntext-to-image models.\n","authors":["Gianni Franchi","Dat Nguyen Trong","Nacim Belkhir","Guoxuan Xia","Andrea Pilzer"],"pdf_url":"https://arxiv.org/pdf/2412.03178v1.pdf","comment":"28 pages and 22 figures"},{"id":"http://arxiv.org/abs/2412.03177v1","updated":"2024-12-04T09:59:43Z","published":"2024-12-04T09:59:43Z","title":"PatchDPO: Patch-level DPO for Finetuning-free Personalized Image\n Generation","summary":" Finetuning-free personalized image generation can synthesize customized\nimages without test-time finetuning, attracting wide research interest owing to\nits high efficiency. Current finetuning-free methods simply adopt a single\ntraining stage with a simple image reconstruction task, and they typically\ngenerate low-quality images inconsistent with the reference images during\ntest-time. To mitigate this problem, inspired by the recent DPO (i.e., direct\npreference optimization) technique, this work proposes an additional training\nstage to improve the pre-trained personalized generation models. However,\ntraditional DPO only determines the overall superiority or inferiority of two\nsamples, which is not suitable for personalized image generation because the\ngenerated images are commonly inconsistent with the reference images only in\nsome local image patches. To tackle this problem, this work proposes PatchDPO\nthat estimates the quality of image patches within each generated image and\naccordingly trains the model. To this end, PatchDPO first leverages the\npre-trained vision model with a proposed self-supervised training method to\nestimate the patch quality. 
Next, PatchDPO adopts a weighted training approach\nto train the model with the estimated patch quality, which rewards the image\npatches with high quality while penalizing the image patches with low quality.\nExperiment results demonstrate that PatchDPO significantly improves the\nperformance of multiple pre-trained personalized generation models, and\nachieves state-of-the-art performance on both single-object and multi-object\npersonalized image generation. Our code is available at\nhttps://github.com/hqhQAQ/PatchDPO.\n","authors":["Qihan Huang","Long Chan","Jinlong Liu","Wanggui He","Hao Jiang","Mingli Song","Jie Song"],"pdf_url":"https://arxiv.org/pdf/2412.03177v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15017v4","updated":"2024-12-04T09:54:59Z","published":"2024-07-22T06:15:59Z","title":"Knowledge Mechanisms in Large Language Models: A Survey and Perspective","summary":" Understanding knowledge mechanisms in Large Language Models (LLMs) is crucial\nfor advancing towards trustworthy AGI. This paper reviews knowledge mechanism\nanalysis from a novel taxonomy including knowledge utilization and evolution.\nKnowledge utilization delves into the mechanism of memorization, comprehension\nand application, and creation. Knowledge evolution focuses on the dynamic\nprogression of knowledge within individual and group LLMs. Moreover, we discuss\nwhat knowledge LLMs have learned, the reasons for the fragility of parametric\nknowledge, and the potential dark knowledge (hypothesis) that will be\nchallenging to address. 
We hope this work can help understand knowledge in LLMs\nand provide insights for future research.\n","authors":["Mengru Wang","Yunzhi Yao","Ziwen Xu","Shuofei Qiao","Shumin Deng","Peng Wang","Xiang Chen","Jia-Chen Gu","Yong Jiang","Pengjun Xie","Fei Huang","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.15017v4.pdf","comment":"EMNLP 2024 Findings; 39 pages (v4)"},{"id":"http://arxiv.org/abs/2412.03173v1","updated":"2024-12-04T09:53:09Z","published":"2024-12-04T09:53:09Z","title":"IRisPath: Enhancing Off-Road Navigation with Robust IR-RGB Fusion for\n Improved Day and Night Traversability","summary":" Autonomous off-road navigation is required for applications in agriculture,\nconstruction, search and rescue and defence. Traditional on-road autonomous\nmethods struggle with dynamic terrains, leading to poor vehicle control on\noff-road. Recent deep-learning models have used perception sensors along with\nkinesthetic feedback for navigation on such terrains. However, this approach\nhas out-of-domain uncertainty. Factors like change in weather and time of day\nimpacts the performance of the model. We propose a multi modal fusion network\nFuseIsPath capable of using LWIR and RGB images to provide robustness against\ndynamic weather and light conditions. To aid further works in this domain, we\nalso open-source a day-night dataset with LWIR and RGB images along with\npseudo-labels for traversability. 
In order to co-register the two images we\ndeveloped a novel method for targetless extrinsic calibration of LWIR, LiDAR\nand RGB cameras with translation accuracy of 1.7cm and rotation accuracy of\n0.827degree.\n","authors":["Saksham Sharma","Akshit Raizada","Suresh Sundaram"],"pdf_url":"https://arxiv.org/pdf/2412.03173v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.01289v2","updated":"2024-12-04T09:51:16Z","published":"2024-12-02T09:02:28Z","title":"Enhancing Perception Capabilities of Multimodal LLMs with Training-Free\n Fusion","summary":" Multimodal LLMs (MLLMs) equip language models with visual capabilities by\naligning vision encoders with language models. Existing methods to enhance the\nvisual perception of MLLMs often involve designing more powerful vision\nencoders, which requires exploring a vast design space and re-aligning each\npotential encoder with the language model, resulting in prohibitively high\ntraining costs. In this paper, we introduce VisionFuse, a novel integration\nframework that efficiently utilizes multiple vision encoders from off-the-shelf\nMLLMs to enhance visual perception without requiring additional training. Our\napproach is motivated by the observation that different MLLMs tend to focus on\ndistinct regions given the same query and image. Moreover, we find that the\nfeature distributions of vision encoders within an MLLM family, a group of\nMLLMs sharing the same pretrained LLM, are highly aligned. Building on these\ninsights, VisionFuse enriches the visual context by concatenating the tokens\ngenerated by the vision encoders of selected MLLMs within a family. By merging\nthe parameters of language models from these MLLMs, VisionFuse allows a single\nlanguage model to align with various vision encoders, significantly reducing\ndeployment overhead. We conduct comprehensive evaluations across multiple\nmultimodal benchmarks using various MLLM combinations, demonstrating\nsubstantial improvements in multimodal tasks. 
Notably, when integrating\nMiniGemini-8B and SLIME-8B, VisionFuse achieves an average performance increase\nof over 4%.\n","authors":["Zhuokun Chen","Jinwu Hu","Zeshuai Deng","Yufeng Wang","Bohan Zhuang","Mingkui Tan"],"pdf_url":"https://arxiv.org/pdf/2412.01289v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03166v1","updated":"2024-12-04T09:46:41Z","published":"2024-12-04T09:46:41Z","title":"Are Explanations Helpful? A Comparative Analysis of Explainability\n Methods in Skin Lesion Classifiers","summary":" Deep Learning has shown outstanding results in computer vision tasks;\nhealthcare is no exception. However, there is no straightforward way to expose\nthe decision-making process of DL models. Good accuracy is not enough for skin\ncancer predictions. Understanding the model's behavior is crucial for clinical\napplication and reliable outcomes. In this work, we identify desiderata for\nexplanations in skin-lesion models. We analyzed seven methods, four based on\npixel-attribution (Grad-CAM, Score-CAM, LIME, SHAP) and three on high-level\nconcepts (ACE, ICE, CME), for a deep neural network trained on the\nInternational Skin Imaging Collaboration Archive. Our findings indicate that\nwhile these techniques reveal biases, there is room for improving the\ncomprehensiveness of explanations to achieve transparency in skin-lesion\nmodels.\n","authors":["Rosa Y. G. Paccotacya-Yanque","Alceu Bissoto","Sandra Avila"],"pdf_url":"https://arxiv.org/pdf/2412.03166v1.pdf","comment":"6 pages. 
Paper accepted at 20th International Symposium on Medical\n Information Processing and Analysis (SIPAIM)"},{"id":"http://arxiv.org/abs/2402.14400v3","updated":"2024-12-04T09:44:26Z","published":"2024-02-22T09:34:48Z","title":"Learning Developmental Age from 3D Infant Kinetics Using Adaptive Graph\n Neural Networks","summary":" Reliable methods for the neurodevelopmental assessment of infants are\nessential for early detection of problems that may need prompt interventions.\nSpontaneous motor activity, or 'kinetics', is shown to provide a powerful\nsurrogate measure of upcoming neurodevelopment. However, its assessment is by\nand large qualitative and subjective, focusing on visually identified,\nage-specific gestures. In this work, we introduce Kinetic Age (KA), a novel\ndata-driven metric that quantifies neurodevelopmental maturity by predicting an\ninfant's age based on their movement patterns. KA offers an interpretable and\ngeneralizable proxy for motor development. Our method leverages 3D video\nrecordings of infants, processed with pose estimation to extract\nspatio-temporal series of anatomical landmarks, which are released as a new\nopenly available dataset. These data are modeled using adaptive graph\nconvolutional networks, able to capture the spatio-temporal dependencies in\ninfant movements. We also show that our data-driven approach achieves\nimprovement over traditional machine learning baselines based on manually\nengineered features.\n","authors":["Daniel Holmberg","Manu Airaksinen","Viviana Marchi","Andrea Guzzetta","Anna Kivi","Leena Haataja","Sampsa Vanhatalo","Teemu Roos"],"pdf_url":"https://arxiv.org/pdf/2402.14400v3.pdf","comment":"15 pages, 9 figures. 
Code repository available via\n https://github.com/deinal/infant-aagcn"},{"id":"http://arxiv.org/abs/2412.01064v2","updated":"2024-12-04T09:43:18Z","published":"2024-12-02T02:50:07Z","title":"FLOAT: Generative Motion Latent Flow Matching for Audio-driven Talking\n Portrait","summary":" With the rapid advancement of diffusion-based generative models, portrait\nimage animation has achieved remarkable results. However, it still faces\nchallenges in temporally consistent video generation and fast sampling due to\nits iterative sampling nature. This paper presents FLOAT, an audio-driven\ntalking portrait video generation method based on flow matching generative\nmodel. We shift the generative modeling from the pixel-based latent space to a\nlearned motion latent space, enabling efficient design of temporally consistent\nmotion. To achieve this, we introduce a transformer-based vector field\npredictor with a simple yet effective frame-wise conditioning mechanism.\nAdditionally, our method supports speech-driven emotion enhancement, enabling a\nnatural incorporation of expressive motions. Extensive experiments demonstrate\nthat our method outperforms state-of-the-art audio-driven talking portrait\nmethods in terms of visual quality, motion fidelity, and efficiency.\n","authors":["Taekyung Ki","Dongchan Min","Gyeongsu Chae"],"pdf_url":"https://arxiv.org/pdf/2412.01064v2.pdf","comment":"Project page: https://deepbrainai-research.github.io/float/"},{"id":"http://arxiv.org/abs/2406.00758v3","updated":"2024-12-04T09:36:56Z","published":"2024-06-02T14:22:09Z","title":"Once-for-All: Controllable Generative Image Compression with Dynamic\n Granularity Adaption","summary":" Although recent generative image compression methods have demonstrated\nimpressive potential in optimizing the rate-distortion-perception trade-off,\nthey still face the critical challenge of flexible rate adaption to diverse\ncompression necessities and scenarios. 
To overcome this challenge, this paper\nproposes a Controllable Generative Image Compression framework, termed\nControl-GIC, the first capable of fine-grained bitrate adaption across a broad\nspectrum while ensuring high-fidelity and generality compression. Control-GIC\nis grounded in a VQGAN framework that encodes an image as a sequence of\nvariable-length codes (i.e. VQ-indices), which can be losslessly compressed and\nexhibits a direct positive correlation with the bitrates. Drawing inspiration\nfrom the classical coding principle, we correlate the information density of\nlocal image patches with their granular representations. Hence, we can flexibly\ndetermine a proper allocation of granularity for the patches to achieve dynamic\nadjustment for VQ-indices, resulting in desirable compression rates. We further\ndevelop a probabilistic conditional decoder capable of retrieving historic\nencoded multi-granularity representations according to transmitted codes, and\nthen reconstruct hierarchical granular features in the formalization of\nconditional probability, enabling more informative aggregation to improve\nreconstruction realism. Our experiments show that Control-GIC allows highly\nflexible and controllable bitrate adaption where the results demonstrate its\nsuperior performance over recent state-of-the-art methods.\n","authors":["Anqi Li","Feng Li","Yuxi Liu","Runmin Cong","Yao Zhao","Huihui Bai"],"pdf_url":"https://arxiv.org/pdf/2406.00758v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03159v1","updated":"2024-12-04T09:36:24Z","published":"2024-12-04T09:36:24Z","title":"Multi-Level Correlation Network For Few-Shot Image Classification","summary":" Few-shot image classification(FSIC) aims to recognize novel classes given few\nlabeled images from base classes. Recent works have achieved promising\nclassification performance, especially for metric-learning methods, where a\nmeasure at only image feature level is usually used. 
In this paper, we argue\nthat measure at such a level may not be effective enough to generalize from\nbase to novel classes when using only a few images. Instead, a multi-level\ndescriptor of an image is taken for consideration in this paper. We propose a\nmulti-level correlation network (MLCN) for FSIC to tackle this problem by\neffectively capturing local information. Concretely, we present the\nself-correlation module and cross-correlation module to learn the semantic\ncorrespondence relation of local information based on learned representations.\nMoreover, we propose a pattern-correlation module to capture the pattern of\nfine-grained images and find relevant structural patterns between base classes\nand novel classes. Extensive experiments and analysis show the effectiveness of\nour proposed method on four widely-used FSIC benchmarks. The code for our\napproach is available at: https://github.com/Yunkai696/MLCN.\n","authors":["Yunkai Dang","Min Zhang","Zhengyu Chen","Xinliang Zhang","Zheng Wang","Meijun Sun","Donglin Wang"],"pdf_url":"https://arxiv.org/pdf/2412.03159v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03150v1","updated":"2024-12-04T09:17:47Z","published":"2024-12-04T09:17:47Z","title":"Appearance Matching Adapter for Exemplar-based Semantic Image Synthesis","summary":" Exemplar-based semantic image synthesis aims to generate images aligned with\ngiven semantic content while preserving the appearance of an exemplar image.\nConventional structure-guidance models, such as ControlNet, are limited in that\nthey cannot directly utilize exemplar images as input, relying instead solely\non text prompts to control appearance. Recent tuning-free approaches address\nthis limitation by transferring local appearance from the exemplar image to the\nsynthesized image through implicit cross-image matching in the augmented\nself-attention mechanism of pre-trained diffusion models. 
However, these\nmethods face challenges when applied to content-rich scenes with significant\ngeometric deformations, such as driving scenes. In this paper, we propose the\nAppearance Matching Adapter (AM-Adapter), a learnable framework that enhances\ncross-image matching within augmented self-attention by incorporating semantic\ninformation from segmentation maps. To effectively disentangle generation and\nmatching processes, we adopt a stage-wise training approach. Initially, we\ntrain the structure-guidance and generation networks, followed by training the\nAM-Adapter while keeping the other networks frozen. During inference, we\nintroduce an automated exemplar retrieval method to efficiently select exemplar\nimage-segmentation pairs. Despite utilizing a limited number of learnable\nparameters, our method achieves state-of-the-art performance, excelling in both\nsemantic alignment preservation and local appearance fidelity. Extensive\nablation studies further validate our design choices. Code and pre-trained\nweights will be publicly available.: https://cvlab-kaist.github.io/AM-Adapter/\n","authors":["Siyoon Jin","Jisu Nam","Jiyoung Kim","Dahyun Chung","Yeong-Seok Kim","Joonhyung Park","Heonjeong Chu","Seungryong Kim"],"pdf_url":"https://arxiv.org/pdf/2412.03150v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.17576v2","updated":"2024-12-04T08:58:53Z","published":"2024-11-26T16:41:09Z","title":"A Distractor-Aware Memory for Visual Object Tracking with SAM2","summary":" Memory-based trackers are video object segmentation methods that form the\ntarget model by concatenating recently tracked frames into a memory buffer and\nlocalize the target by attending the current image to the buffered frames.\nWhile already achieving top performance on many benchmarks, it was the recent\nrelease of SAM2 that placed memory-based trackers into focus of the visual\nobject tracking community. Nevertheless, modern trackers still struggle in the\npresence of distractors. 
We argue that a more sophisticated memory model is\nrequired, and propose a new distractor-aware memory model for SAM2 and an\nintrospection-based update strategy that jointly addresses the segmentation\naccuracy as well as tracking robustness. The resulting tracker is denoted as\nSAM2.1++. We also propose a new distractor-distilled DiDi dataset to study the\ndistractor problem better. SAM2.1++ outperforms SAM2.1 and related SAM memory\nextensions on seven benchmarks and sets a solid new state-of-the-art on six of\nthem.\n","authors":["Jovana Videnovic","Alan Lukezic","Matej Kristan"],"pdf_url":"https://arxiv.org/pdf/2411.17576v2.pdf","comment":"Under review. Code available on Github:\n https://github.com/jovanavidenovic/DAM4SAM"},{"id":"http://arxiv.org/abs/2403.16539v4","updated":"2024-12-04T08:56:17Z","published":"2024-03-25T08:31:14Z","title":"Data-Efficient 3D Visual Grounding via Order-Aware Referring","summary":" 3D visual grounding aims to identify the target object within a 3D point\ncloud scene referred to by a natural language description. Previous works\nusually require significant data relating to point color and their descriptions\nto exploit the corresponding complicated verbo-visual relations. In our work,\nwe introduce Vigor, a novel Data-Efficient 3D Visual Grounding framework via\nOrder-aware Referring. Vigor leverages LLM to produce a desirable referential\norder from the input description for 3D visual grounding. With the proposed\nstacked object-referring blocks, the predicted anchor objects in the above\norder allow one to locate the target object progressively without supervision\non the identities of anchor objects or exact relations between anchor/target\nobjects. In addition, we present an order-aware warm-up training strategy,\nwhich augments referential orders for pre-training the visual grounding\nframework. This allows us to better capture the complex verbo-visual relations\nand benefit the desirable data-efficient learning scheme. 
Experimental results\non the NR3D and ScanRefer datasets demonstrate our superiority in low-resource\nscenarios. In particular, Vigor surpasses current state-of-the-art frameworks\nby 9.3% and 7.6% grounding accuracy under 1% data and 10% data settings on the\nNR3D dataset, respectively.\n","authors":["Tung-Yu Wu","Sheng-Yu Huang","Yu-Chiang Frank Wang"],"pdf_url":"https://arxiv.org/pdf/2403.16539v4.pdf","comment":"accepted to WACV 2025"},{"id":"http://arxiv.org/abs/2404.00335v3","updated":"2024-12-04T08:54:16Z","published":"2024-03-30T12:10:34Z","title":"Learning Trimaps via Clicks for Image Matting","summary":" Despite significant advancements in image matting, existing models heavily\ndepend on manually-drawn trimaps for accurate results in natural image\nscenarios. However, the process of obtaining trimaps is time-consuming, lacking\nuser-friendliness and device compatibility. This reliance greatly limits the\npractical application of all trimap-based matting methods. To address this\nissue, we introduce Click2Trimap, an interactive model capable of predicting\nhigh-quality trimaps and alpha mattes with minimal user click inputs. Through\nanalyzing real users' behavioral logic and characteristics of trimaps, we\nsuccessfully propose a powerful iterative three-class training strategy and a\ndedicated simulation function, making Click2Trimap exhibit versatility across\nvarious scenarios. Quantitative and qualitative assessments on synthetic and\nreal-world matting datasets demonstrate Click2Trimap's superior performance\ncompared to all existing trimap-free matting methods. 
Especially, in the user\nstudy, Click2Trimap achieves high-quality trimap and matting predictions in\njust an average of 5 seconds per image, demonstrating its substantial practical\nvalue in real-world applications.\n","authors":["Chenyi Zhang","Yihan Hu","Henghui Ding","Humphrey Shi","Yao Zhao","Yunchao Wei"],"pdf_url":"https://arxiv.org/pdf/2404.00335v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.07663v2","updated":"2024-12-04T08:47:23Z","published":"2024-10-10T07:12:46Z","title":"TDDSR: Single-Step Diffusion with Two Discriminators for Super\n Resolution","summary":" Super-resolution methods are increasingly becoming popular for both\nreal-world and face-specific tasks. Many existing approaches, however, rely on\nsimplistic degradation models, which limits their ability to handle complex and\nunknown degradation patterns effectively. While diffusion-based\nsuper-resolution techniques have recently shown impressive results, they are\nstill constrained by the need for numerous inference steps. To address this, we\npropose TDDSR, an efficient single-step diffusion-based super-resolution\nmethod. Our method, distilled from a pre-trained teacher model and based on a\ndiffusion network, performs super-resolution in a single step. It integrates a\nlearnable diffusion-based downsampler to capture diverse degradation patterns\nand employs two discriminators, one for high-resolution and one for\nlow-resolution images, to enhance the overall performance. 
Experimental results\ndemonstrate its effectiveness across real-world and face-specific SR tasks,\nachieving performance beyond other state-of-the-art models and comparable to\nprevious diffusion methods with multiple sampling steps.\n","authors":["Sohwi Kim","Tae-Kyun Kim"],"pdf_url":"https://arxiv.org/pdf/2410.07663v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.16473v3","updated":"2024-12-04T08:40:41Z","published":"2024-02-26T10:42:25Z","title":"DCVSMNet: Double Cost Volume Stereo Matching Network","summary":" We introduce Double Cost Volume Stereo Matching Network(DCVSMNet) which is a\nnovel architecture characterised by by two small upper (group-wise) and lower\n(norm correlation) cost volumes. Each cost volume is processed separately, and\na coupling module is proposed to fuse the geometry information extracted from\nthe upper and lower cost volumes. DCVSMNet is a fast stereo matching network\nwith a 67 ms inference time and strong generalization ability which can produce\ncompetitive results compared to state-of-the-art methods. The results on\nseveral bench mark datasets show that DCVSMNet achieves better accuracy than\nmethods such as CGI-Stereo and BGNet at the cost of greater inference time.\n","authors":["Mahmoud Tahmasebi","Saif Huq","Kevin Meehan","Marion McAfee"],"pdf_url":"https://arxiv.org/pdf/2402.16473v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03121v1","updated":"2024-12-04T08:40:11Z","published":"2024-12-04T08:40:11Z","title":"Splats in Splats: Embedding Invisible 3D Watermark within Gaussian\n Splatting","summary":" 3D Gaussian splatting (3DGS) has demonstrated impressive 3D reconstruction\nperformance with explicit scene representations. Given the widespread\napplication of 3DGS in 3D reconstruction and generation tasks, there is an\nurgent need to protect the copyright of 3DGS assets. 
However, existing\ncopyright protection techniques for 3DGS overlook the usability of 3D assets,\nposing challenges for practical deployment. Here we describe WaterGS, the first\n3DGS watermarking framework that embeds 3D content in 3DGS itself without\nmodifying any attributes of the vanilla 3DGS. To achieve this, we take a deep\ninsight into spherical harmonics (SH) and devise an importance-graded SH\ncoefficient encryption strategy to embed the hidden SH coefficients.\nFurthermore, we employ a convolutional autoencoder to establish a mapping\nbetween the original Gaussian primitives' opacity and the hidden Gaussian\nprimitives' opacity. Extensive experiments indicate that WaterGS significantly\noutperforms existing 3D steganography techniques, with 5.31% higher scene\nfidelity and 3X faster rendering speed, while ensuring security, robustness,\nand user experience. Codes and data will be released at\nhttps://water-gs.github.io.\n","authors":["Yijia Guo","Wenkai Huang","Yang Li","Gaolei Li","Hang Zhang","Liwen Hu","Jianhua Li","Tiejun Huang","Lei Ma"],"pdf_url":"https://arxiv.org/pdf/2412.03121v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03118v1","updated":"2024-12-04T08:38:45Z","published":"2024-12-04T08:38:45Z","title":"ObjectFinder: Open-Vocabulary Assistive System for Interactive Object\n Search by Blind People","summary":" Assistive technology can be leveraged by blind people when searching for\nobjects in their daily lives. We created ObjectFinder, an open-vocabulary\ninteractive object-search prototype, which combines object detection with scene\ndescription and navigation. It enables blind persons to detect and navigate to\nobjects of their choice. Our approach used co-design for the development of the\nprototype. 
We further conducted need-finding interviews to better understand\nchallenges in object search, followed by a study with the ObjectFinder\nprototype in a laboratory setting simulating a living room and an office, with\neight blind users. Additionally, we compared the prototype with BeMyEyes and\nLookout for object search. We found that most participants felt more\nindependent with ObjectFinder and preferred it over the baselines when deployed\non more efficient hardware, as it enhances mental mapping and allows for active\ntarget definition. Moreover, we identified factors for future directions for\nthe development of object-search systems.\n","authors":["Ruiping Liu","Jiaming Zhang","Angela Schön","Karin Müller","Junwei Zheng","Kailun Yang","Kathrin Gerling","Rainer Stiefelhagen"],"pdf_url":"https://arxiv.org/pdf/2412.03118v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02158v2","updated":"2024-12-04T08:34:49Z","published":"2024-12-03T04:34:23Z","title":"Agri-LLaVA: Knowledge-Infused Large Multimodal Assistant on Agricultural\n Pests and Diseases","summary":" In the general domain, large multimodal models (LMMs) have achieved\nsignificant advancements, yet challenges persist in applying them to specific\nfields, especially agriculture. As the backbone of the global economy,\nagriculture confronts numerous challenges, with pests and diseases being\nparticularly concerning due to their complexity, variability, rapid spread, and\nhigh resistance. This paper specifically addresses these issues. We construct\nthe first multimodal instruction-following dataset in the agricultural domain,\ncovering over 221 types of pests and diseases with approximately 400,000 data\nentries. This dataset aims to explore and address the unique challenges in pest\nand disease control. Based on this dataset, we propose a knowledge-infused\ntraining method to develop Agri-LLaVA, an agricultural multimodal conversation\nsystem. 
To accelerate progress in this field and inspire more researchers to\nengage, we design a diverse and challenging evaluation benchmark for\nagricultural pests and diseases. Experimental results demonstrate that\nAgri-LLaVA excels in agricultural multimodal conversation and visual\nunderstanding, providing new insights and approaches to address agricultural\npests and diseases. By open-sourcing our dataset and model, we aim to promote\nresearch and development in LMMs within the agricultural domain and make\nsignificant contributions to tackle the challenges of agricultural pests and\ndiseases. All resources can be found at https://github.com/Kki2Eve/Agri-LLaVA.\n","authors":["Liqiong Wang","Teng Jin","Jinyu Yang","Ales Leonardis","Fangyi Wang","Feng Zheng"],"pdf_url":"https://arxiv.org/pdf/2412.02158v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06744v2","updated":"2024-12-04T08:16:01Z","published":"2024-04-10T05:10:05Z","title":"YOLO based Ocean Eddy Localization with AWS SageMaker","summary":" Ocean eddies play a significant role both on the sea surface and beneath it,\ncontributing to the sustainability of marine life dependent on oceanic\nbehaviors. Therefore, it is crucial to investigate ocean eddies to monitor\nchanges in the Earth, particularly in the oceans, and their impact on climate.\nThis study aims to pinpoint ocean eddies using AWS cloud services, specifically\nSageMaker. The primary objective is to detect small-scale (<20km) ocean eddies\nfrom satellite remote images and assess the feasibility of utilizing SageMaker,\nwhich offers tools for deploying AI applications. Moreover, this research not\nonly explores the deployment of cloud-based services for remote sensing of\nEarth data but also evaluates several YOLO (You Only Look Once) models using\nsingle and multi-GPU-based services in the cloud. 
Furthermore, this study\nunderscores the potential of these services, their limitations, challenges\nrelated to deployment and resource management, and their user-riendliness for\nEarth science projects.\n","authors":["Seraj Al Mahmud Mostafa","Jinbo Wang","Benjamin Holt","Jianwu Wang"],"pdf_url":"https://arxiv.org/pdf/2404.06744v2.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2412.03105v1","updated":"2024-12-04T08:10:48Z","published":"2024-12-04T08:10:48Z","title":"Few-Shot Learning with Adaptive Weight Masking in Conditional GANs","summary":" Deep learning has revolutionized various fields, yet its efficacy is hindered\nby overfitting and the requirement of extensive annotated data, particularly in\nfew-shot learning scenarios where limited samples are available. This paper\nintroduces a novel approach to few-shot learning by employing a Residual Weight\nMasking Conditional Generative Adversarial Network (RWM-CGAN) for data\naugmentation. The proposed model integrates residual units within the generator\nto enhance network depth and sample quality, coupled with a weight mask\nregularization technique in the discriminator to improve feature learning from\nsmall-sample categories. This method addresses the core issues of robustness\nand generalization in few-shot learning by providing a controlled and clear\naugmentation of the sample space. Extensive experiments demonstrate that\nRWM-CGAN not only expands the sample space effectively but also enriches the\ndiversity and quality of generated samples, leading to significant improvements\nin detection and classification accuracy on public datasets. 
The paper\ncontributes to the advancement of few-shot learning by offering a practical\nsolution to the challenges posed by data scarcity and the need for rapid\ngeneralization to new tasks or categories.\n","authors":["Jiacheng Hu","Zhen Qi","Jianjun Wei","Jiajing Chen","Runyuan Bao","Xinyu Qiu"],"pdf_url":"https://arxiv.org/pdf/2412.03105v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03103v1","updated":"2024-12-04T08:06:06Z","published":"2024-12-04T08:06:06Z","title":"MultiGO: Towards Multi-level Geometry Learning for Monocular 3D Textured\n Human Reconstruction","summary":" This paper investigates the research task of reconstructing the 3D clothed\nhuman body from a monocular image. Due to the inherent ambiguity of single-view\ninput, existing approaches leverage pre-trained SMPL(-X) estimation models or\ngenerative models to provide auxiliary information for human reconstruction.\nHowever, these methods capture only the general human body geometry and\noverlook specific geometric details, leading to inaccurate skeleton\nreconstruction, incorrect joint positions, and unclear cloth wrinkles. In\nresponse to these issues, we propose a multi-level geometry learning framework.\nTechnically, we design three key components: skeleton-level enhancement,\njoint-level augmentation, and wrinkle-level refinement modules. Specifically,\nwe effectively integrate the projected 3D Fourier features into a Gaussian\nreconstruction model, introduce perturbations to improve joint depth estimation\nduring training, and refine the human coarse wrinkles by resembling the\nde-noising process of diffusion model. 
Extensive quantitative and qualitative\nexperiments on two out-of-distribution test sets show the superior performance\nof our approach compared to state-of-the-art (SOTA) methods.\n","authors":["Gangjian Zhang","Nanjie Yao","Shunsi Zhang","Hanfeng Zhao","Guoliang Pang","Jian Shu","Hao Wang"],"pdf_url":"https://arxiv.org/pdf/2412.03103v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03102v1","updated":"2024-12-04T08:04:14Z","published":"2024-12-04T08:04:14Z","title":"Lightweight Multiplane Images Network for Real-Time Stereoscopic\n Conversion from Planar Video","summary":" With the rapid development of stereoscopic display technologies, especially\nglasses-free 3D screens, and virtual reality devices, stereoscopic conversion\nhas become an important task to address the lack of high-quality stereoscopic\nimage and video resources. Current stereoscopic conversion algorithms typically\nstruggle to balance reconstruction performance and inference efficiency. This\npaper proposes a planar video real-time stereoscopic conversion network based\non multi-plane images (MPI), which consists of a detail branch for generating\nMPI and a depth-semantic branch for perceiving depth information. Unlike models\nthat depend on explicit depth map inputs, the proposed method employs a\nlightweight depth-semantic branch to extract depth-aware features implicitly.\nTo optimize the lightweight branch, a heavy training but light inference\nstrategy is adopted, which involves designing a coarse-to-fine auxiliary branch\nthat is only used during the training stage. In addition, the proposed method\nsimplifies the MPI rendering process for stereoscopic conversion scenarios to\nfurther accelerate the inference. Experimental results demonstrate that the\nproposed method can achieve comparable performance to some state-of-the-art\n(SOTA) models and support real-time inference at 2K resolution. 
Compared to the\nSOTA TMPI algorithm, the proposed method obtains similar subjective quality\nwhile achieving over $40\\times$ inference acceleration.\n","authors":["Shanding Diao","Yang Zhao","Yuan Chen","Zhao Zhang","Wei Jia","Ronggang Wang"],"pdf_url":"https://arxiv.org/pdf/2412.03102v1.pdf","comment":"9 pages, 7 figures"},{"id":"http://arxiv.org/abs/2412.02687v2","updated":"2024-12-04T08:01:47Z","published":"2024-12-03T18:56:32Z","title":"SNOOPI: Supercharged One-step Diffusion Distillation with Proper\n Guidance","summary":" Recent approaches have yielded promising results in distilling multi-step\ntext-to-image diffusion models into one-step ones. The state-of-the-art\nefficient distillation technique, i.e., SwiftBrushv2 (SBv2), even surpasses the\nteacher model's performance with limited resources. However, our study reveals\nits instability when handling different diffusion model backbones due to using\na fixed guidance scale within the Variational Score Distillation (VSD) loss.\nAnother weakness of the existing one-step diffusion models is the missing\nsupport for negative prompt guidance, which is crucial in practical image\ngeneration. This paper presents SNOOPI, a novel framework designed to address\nthese limitations by enhancing the guidance in one-step diffusion models during\nboth training and inference. First, we effectively enhance training stability\nthrough Proper Guidance-SwiftBrush (PG-SB), which employs a random-scale\nclassifier-free guidance approach. By varying the guidance scale of both\nteacher models, we broaden their output distributions, resulting in a more\nrobust VSD loss that enables SB to perform effectively across diverse backbones\nwhile maintaining competitive performance. Second, we propose a training-free\nmethod called Negative-Away Steer Attention (NASA), which integrates negative\nprompts into one-step diffusion models via cross-attention to suppress\nundesired elements in generated images. 
Our experimental results show that our\nproposed methods significantly improve baseline models across various metrics.\nRemarkably, we achieve an HPSv2 score of 31.08, setting a new state-of-the-art\nbenchmark for one-step diffusion models.\n","authors":["Viet Nguyen","Anh Nguyen","Trung Dao","Khoi Nguyen","Cuong Pham","Toan Tran","Anh Tran"],"pdf_url":"https://arxiv.org/pdf/2412.02687v2.pdf","comment":"18 pages, 9 figures"},{"id":"http://arxiv.org/abs/2407.17843v2","updated":"2024-12-04T07:50:27Z","published":"2024-07-25T07:57:55Z","title":"DragText: Rethinking Text Embedding in Point-based Image Editing","summary":" Point-based image editing enables accurate and flexible control through\ncontent dragging. However, the role of text embedding during the editing\nprocess has not been thoroughly investigated. A significant aspect that remains\nunexplored is the interaction between text and image embeddings. During the\nprogressive editing in a diffusion model, the text embedding remains constant.\nAs the image embedding increasingly diverges from its initial state, the\ndiscrepancy between the image and text embeddings presents a significant\nchallenge. In this study, we found that the text prompt significantly\ninfluences the dragging process, particularly in maintaining content integrity\nand achieving the desired manipulation. Upon these insights, we propose\nDragText, which optimizes text embedding in conjunction with the dragging\nprocess to pair with the modified image embedding. Simultaneously, we\nregularize the text optimization process to preserve the integrity of the\noriginal text prompt. 
Our approach can be seamlessly integrated with existing\ndiffusion-based drag methods, enhancing performance with only a few lines of\ncode.\n","authors":["Gayoon Choi","Taejin Jeong","Sujung Hong","Seong Jae Hwang"],"pdf_url":"https://arxiv.org/pdf/2407.17843v2.pdf","comment":"Accepted at WACV 2025; Code is released at\n https://github.com/MICV-yonsei/DragText"},{"id":"http://arxiv.org/abs/2410.01544v3","updated":"2024-12-04T07:47:11Z","published":"2024-10-02T13:30:32Z","title":"Boosting Weakly-Supervised Referring Image Segmentation via Progressive\n Comprehension","summary":" This paper explores the weakly-supervised referring image segmentation (WRIS)\nproblem, and focuses on a challenging setup where target localization is\nlearned directly from image-text pairs. We note that the input text description\ntypically already contains detailed information on how to localize the target\nobject, and we also observe that humans often follow a step-by-step\ncomprehension process (\\ie, progressively utilizing target-related attributes\nand relations as cues) to identify the target object. Hence, we propose a novel\nProgressive Comprehension Network (PCNet) to leverage target-related textual\ncues from the input description for progressively localizing the target object.\nSpecifically, we first use a Large Language Model (LLM) to decompose the input\ntext description into short phrases. These short phrases are taken as\ntarget-related cues and fed into a Conditional Referring Module (CRM) in\nmultiple stages, to allow updating the referring text embedding and enhance the\nresponse map for target localization in a multi-stage manner. Based on the CRM,\nwe then propose a Region-aware Shrinking (RaS) loss to constrain the visual\nlocalization to be conducted progressively in a coarse-to-fine manner across\ndifferent stages. 
Finally, we introduce an Instance-aware Disambiguation (IaD)\nloss to suppress instance localization ambiguity by differentiating overlapping\nresponse maps generated by different referring texts on the same image.\nExtensive experiments show that our method outperforms SOTA methods on three\ncommon benchmarks.\n","authors":["Zaiquan Yang","Yuhao Liu","Jiaying Lin","Gerhard Hancke","Rynson W. H. Lau"],"pdf_url":"https://arxiv.org/pdf/2410.01544v3.pdf","comment":"Accepted to NeurIPS2024"},{"id":"http://arxiv.org/abs/2404.16331v2","updated":"2024-12-04T07:47:10Z","published":"2024-04-25T04:37:35Z","title":"IMWA: Iterative Model Weight Averaging Benefits Class-Imbalanced\n Learning Tasks","summary":" Model Weight Averaging (MWA) is a technique that seeks to enhance model's\nperformance by averaging the weights of multiple trained models. This paper\nfirst empirically finds that 1) the vanilla MWA can benefit the\nclass-imbalanced learning, and 2) performing model averaging in the early\nepochs of training yields a greater performance improvement than doing that in\nlater epochs. Inspired by these two observations, in this paper we propose a\nnovel MWA technique for class-imbalanced learning tasks named Iterative Model\nWeight Averaging (IMWA). Specifically, IMWA divides the entire training stage\ninto multiple episodes. Within each episode, multiple models are concurrently\ntrained from the same initialized model weight, and subsequently averaged into\na singular model. Then, the weight of this average model serves as a fresh\ninitialization for the ensuing episode, thus establishing an iterative learning\nparadigm. Compared to vanilla MWA, IMWA achieves higher performance\nimprovements with the same computational cost. Moreover, IMWA can further\nenhance the performance of those methods employing EMA strategy, demonstrating\nthat IMWA and EMA can complement each other. 
Extensive experiments on various\nclass-imbalanced learning tasks, i.e., class-imbalanced image classification,\nsemi-supervised class-imbalanced image classification and semi-supervised\nobject detection tasks showcase the effectiveness of our IMWA.\n","authors":["Zitong Huang","Ze Chen","Bowen Dong","Chaoqi Liang","Erjin Zhou","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2404.16331v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03093v1","updated":"2024-12-04T07:44:58Z","published":"2024-12-04T07:44:58Z","title":"Expanding Event Modality Applications through a Robust CLIP-Based\n Encoder","summary":" This paper introduces a powerful encoder that transfers CLIP`s capabilities\nto event-based data, enhancing its utility and expanding its applicability\nacross diverse domains. While large-scale datasets have significantly advanced\nimage-based models, the scarcity of comprehensive event datasets has limited\nperformance potential in event modality. To address this challenge, we adapt\nCLIP`s architecture to align event embeddings with image embeddings, supporting\nzero-shot learning and preserving text alignment while mitigating catastrophic\nforgetting. Our encoder achieves strong performance in object recognition, with\ncompetitive results in zero-shot and few-shot learning tasks. Notably, it\ngeneralizes effectively to events extracted from video data without requiring\nadditional training, highlighting its versatility. Additionally, we integrate\nthis encoder within a cross-modality framework that facilitates interaction\nacross five modalities-Image, Event, Text, Sound, and Depth-expanding the\npossibilities for cross-modal applications. 
Overall, this work underscores the\ntransformative potential of a robust event encoder, broadening the scope and\nutility of event-based data across various fields.\n","authors":["Sungheon Jeong","Hanning Chen","Sanggeon Yun","Suhyeon Cho","Wenjun Huang","Xiangjian Liu","Mohsen Imani"],"pdf_url":"https://arxiv.org/pdf/2412.03093v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.01598v3","updated":"2024-12-04T07:36:46Z","published":"2024-01-03T07:59:17Z","title":"Learning Prompt with Distribution-Based Feature Replay for Few-Shot\n Class-Incremental Learning","summary":" Few-shot Class-Incremental Learning (FSCIL) aims to continuously learn new\nclasses based on very limited training data without forgetting the old ones\nencountered. Existing studies solely relied on pure visual networks, while in\nthis paper we solved FSCIL by leveraging the Vision-Language model (e.g., CLIP)\nand propose a simple yet effective framework, named Learning Prompt with\nDistribution-based Feature Replay (LP-DiF). We observe that simply using CLIP\nfor zero-shot evaluation can substantially outperform the most influential\nmethods. Then, prompt tuning technique is involved to further improve its\nadaptation ability, allowing the model to continually capture specific\nknowledge from each session. To prevent the learnable prompt from forgetting\nold knowledge in the new session, we propose a pseudo-feature replay approach.\nSpecifically, we preserve the old knowledge of each class by maintaining a\nfeature-level Gaussian distribution with a diagonal covariance matrix, which is\nestimated by the image features of training images and synthesized features\ngenerated from a VAE. When progressing to a new session, pseudo-features are\nsampled from old-class distributions combined with training images of the\ncurrent session to optimize the prompt, thus enabling the model to learn new\nknowledge while retaining old knowledge. 
Experiments on three prevalent\nbenchmarks, i.e., CIFAR100, mini-ImageNet, CUB-200, and two more challenging\nbenchmarks, i.e., SUN-397 and CUB-200$^*$ proposed in this paper showcase the\nsuperiority of LP-DiF, achieving new state-of-the-art (SOTA) in FSCIL. Code is\npublicly available at https://github.com/1170300714/LP-DiF.\n","authors":["Zitong Huang","Ze Chen","Zhixing Chen","Erjin Zhou","Xinxing Xu","Rick Siow Mong Goh","Yong Liu","Wangmeng Zuo","Chunmei Feng"],"pdf_url":"https://arxiv.org/pdf/2401.01598v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.20044v2","updated":"2024-12-04T07:35:37Z","published":"2024-05-30T13:25:25Z","title":"A Point-Neighborhood Learning Framework for Nasal Endoscope Image\n Segmentation","summary":" Lesion segmentation on nasal endoscopic images is challenging due to its\ncomplex lesion features. Fully-supervised deep learning methods achieve\npromising performance with pixel-level annotations but impose a significant\nannotation burden on experts. Although weakly supervised or semi-supervised\nmethods can reduce the labelling burden, their performance is still limited.\nSome weakly semi-supervised methods employ a novel annotation strategy that\nlabels weak single-point annotations for the entire training set while\nproviding pixel-level annotations for a small subset of the data. However, the\nrelevant weakly semi-supervised methods only mine the limited information of\nthe point itself, while ignoring its label property and surrounding reliable\ninformation. This paper proposes a simple yet efficient weakly semi-supervised\nmethod called the Point-Neighborhood Learning (PNL) framework. PNL incorporates\nthe surrounding area of the point, referred to as the point-neighborhood, into\nthe learning process. In PNL, we propose a point-neighborhood supervision loss\nand a pseudo-label scoring mechanism to explicitly guide the model's training.\nMeanwhile, we proposed a more reliable data augmentation scheme. 
The proposed\nmethod significantly improves performance without increasing the parameters of\nthe segmentation neural network. Extensive experiments on the NPC-LES dataset\ndemonstrate that PNL outperforms existing methods by a significant margin.\nAdditional validation on colonoscopic polyp segmentation datasets confirms the\ngeneralizability of the proposed PNL.\n","authors":["Pengyu Jie","Wanquan Liu","Chenqiang Gao","Yihui Wen","Rui He","Weiping Wen","Pengcheng Li","Jintao Zhang","Deyu Meng"],"pdf_url":"https://arxiv.org/pdf/2405.20044v2.pdf","comment":"10 pages, 10 figures,"},{"id":"http://arxiv.org/abs/2410.11374v2","updated":"2024-12-04T07:35:20Z","published":"2024-10-15T08:12:54Z","title":"Preserve or Modify? Context-Aware Evaluation for Balancing Preservation\n and Modification in Text-Guided Image Editing","summary":" The development of vision-language and generative models has significantly\nadvanced text-guided image editing, which seeks the \\textit{preservation} of\ncore elements in the source image while implementing \\textit{modifications}\nbased on the target text. However, existing metrics have a\n\\textbf{context-blindness} problem, indiscriminately applying the same\nevaluation criteria on completely different pairs of source image and target\ntext, biasing towards either modification or preservation. Directional CLIP\nsimilarity, the only metric that considers both source image and target text,\nis also biased towards modification aspects and attends to irrelevant editing\nregions of the image. We propose \\texttt{AugCLIP}, a \\textbf{context-aware}\nmetric that adaptively coordinates preservation and modification aspects,\ndepending on the specific context of a given source image and target text. This\nis done by deriving the CLIP representation of an ideally edited image, that\npreserves the source image with necessary modifications to align with target\ntext. 
More specifically, using a multi-modal large language model,\n\\texttt{AugCLIP} augments the textual descriptions of the source and target,\nthen calculates a modification vector through a hyperplane that separates\nsource and target attributes in CLIP space. Extensive experiments on five\nbenchmark datasets, encompassing a diverse range of editing scenarios, show\nthat \\texttt{AugCLIP} aligns remarkably well with human evaluation standards,\noutperforming existing metrics. The code will be open-sourced for community\nuse.\n","authors":["Yoonjeon Kim","Soohyun Ryu","Yeonsung Jung","Hyunkoo Lee","Joowon Kim","June Yong Yang","Jaeryong Hwang","Eunho Yang"],"pdf_url":"https://arxiv.org/pdf/2410.11374v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2412.03085v1","updated":"2024-12-04T07:26:44Z","published":"2024-12-04T07:26:44Z","title":"Mimir: Improving Video Diffusion Models for Precise Text Understanding","summary":" Text serves as the key control signal in video generation due to its\nnarrative nature. To render text descriptions into video clips, current video\ndiffusion models borrow features from text encoders yet struggle with limited\ntext comprehension. The recent success of large language models (LLMs)\nshowcases the power of decoder-only transformers, which offers three clear\nbenefits for text-to-video (T2V) generation, namely, precise text understanding\nresulting from the superior scalability, imagination beyond the input text\nenabled by next token prediction, and flexibility to prioritize user interests\nthrough instruction tuning. Nevertheless, the feature distribution gap emerging\nfrom the two different text modeling paradigms hinders the direct use of LLMs\nin established T2V models. This work addresses this challenge with Mimir, an\nend-to-end training framework featuring a carefully tailored token fuser to\nharmonize the outputs from text encoders and LLMs. 
Such a design allows the T2V\nmodel to fully leverage learned video priors while capitalizing on the\ntext-related capability of LLMs. Extensive quantitative and qualitative results\ndemonstrate the effectiveness of Mimir in generating high-quality videos with\nexcellent text comprehension, especially when processing short captions and\nmanaging shifting motions. Project page:\nhttps://lucaria-academy.github.io/Mimir/\n","authors":["Shuai Tan","Biao Gong","Yutong Feng","Kecheng Zheng","Dandan Zheng","Shuwei Shi","Yujun Shen","Jingdong Chen","Ming Yang"],"pdf_url":"https://arxiv.org/pdf/2412.03085v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03084v1","updated":"2024-12-04T07:26:36Z","published":"2024-12-04T07:26:36Z","title":"Hybrid deep learning-based strategy for the hepatocellular carcinoma\n cancer grade classification of H&E stained liver histopathology images","summary":" Hepatocellular carcinoma (HCC) is a common type of liver cancer whose\nearly-stage diagnosis is a common challenge, mainly due to the manual\nassessment of hematoxylin and eosin-stained whole slide images, which is a\ntime-consuming process and may lead to variability in decision-making. For\naccurate detection of HCC, we propose a hybrid deep learning-based architecture\nthat uses transfer learning to extract the features from pre-trained\nconvolutional neural network (CNN) models and a classifier made up of a\nsequence of fully connected layers. This study uses a publicly available The\nCancer Genome Atlas Hepatocellular Carcinoma (TCGA-LIHC)database (n=491) for\nmodel development and database of Kasturba Gandhi Medical College (KMC), India\nfor validation. The pre-processing step involves patch extraction, colour\nnormalization, and augmentation that results in 3920 patches for the TCGA\ndataset. 
The developed hybrid deep neural network consisting of a CNN-based\npre-trained feature extractor and a customized artificial neural network-based\nclassifier is trained using five-fold cross-validation. For this study, eight\ndifferent state-of-the-art models are trained and tested as feature extractors\nfor the proposed hybrid model. The proposed hybrid model with ResNet50-based\nfeature extractor provided the sensitivity, specificity, F1-score, accuracy,\nand AUC of 100.00%, 100.00%, 100.00%, 100.00%, and 1.00, respectively on the\nTCGA database. On the KMC database, EfficientNetb3 resulted in the optimal\nchoice of the feature extractor giving sensitivity, specificity, F1-score,\naccuracy, and AUC of 96.97, 98.85, 96.71, 96.71, and 0.99, respectively. The\nproposed hybrid models showed improvement in accuracy of 2% and 4% over the\npre-trained models in TCGA-LIHC and KMC databases.\n","authors":["Ajinkya Deshpande","Deep Gupta","Ankit Bhurane","Nisha Meshram","Sneha Singh","Petia Radeva"],"pdf_url":"https://arxiv.org/pdf/2412.03084v1.pdf","comment":"14 figure, 9 tables"},{"id":"http://arxiv.org/abs/2412.03079v1","updated":"2024-12-04T07:09:59Z","published":"2024-12-04T07:09:59Z","title":"Align3R: Aligned Monocular Depth Estimation for Dynamic Videos","summary":" Recent developments in monocular depth estimation methods enable high-quality\ndepth estimation of single-view images but fail to estimate consistent video\ndepth across different frames. Recent works address this problem by applying a\nvideo diffusion model to generate video depth conditioned on the input video,\nwhich is training-expensive and can only produce scale-invariant depth values\nwithout camera poses. In this paper, we propose a novel video-depth estimation\nmethod called Align3R to estimate temporal consistent depth maps for a dynamic\nvideo. Our key idea is to utilize the recent DUSt3R model to align estimated\nmonocular depth maps of different timesteps. 
First, we fine-tune the DUSt3R\nmodel with additional estimated monocular depth as inputs for the dynamic\nscenes. Then, we apply optimization to reconstruct both depth maps and camera\nposes. Extensive experiments demonstrate that Align3R estimates consistent\nvideo depth and camera poses for a monocular video with superior performance\nthan baseline methods.\n","authors":["Jiahao Lu","Tianyu Huang","Peng Li","Zhiyang Dou","Cheng Lin","Zhiming Cui","Zhen Dong","Sai-Kit Yeung","Wenping Wang","Yuan Liu"],"pdf_url":"https://arxiv.org/pdf/2412.03079v1.pdf","comment":"Project Page: https://igl-hkust.github.io/Align3R.github.io/"},{"id":"http://arxiv.org/abs/2412.03077v1","updated":"2024-12-04T07:02:49Z","published":"2024-12-04T07:02:49Z","title":"RoDyGS: Robust Dynamic Gaussian Splatting for Casual Videos","summary":" Dynamic view synthesis (DVS) has advanced remarkably in recent years,\nachieving high-fidelity rendering while reducing computational costs. Despite\nthe progress, optimizing dynamic neural fields from casual videos remains\nchallenging, as these videos do not provide direct 3D information, such as\ncamera trajectories or the underlying scene geometry. In this work, we present\nRoDyGS, an optimization pipeline for dynamic Gaussian Splatting from casual\nvideos. It effectively learns motion and underlying geometry of scenes by\nseparating dynamic and static primitives, and ensures that the learned motion\nand geometry are physically plausible by incorporating motion and geometric\nregularization terms. We also introduce a comprehensive benchmark, Kubric-MRig,\nthat provides extensive camera and object motion along with simultaneous\nmulti-view captures, features that are absent in previous benchmarks.\nExperimental results demonstrate that the proposed method significantly\noutperforms previous pose-free dynamic neural fields and achieves competitive\nrendering quality compared to existing pose-free static neural fields. 
The code\nand data are publicly available at https://rodygs.github.io/.\n","authors":["Yoonwoo Jeong","Junmyeong Lee","Hoseung Choi","Minsu Cho"],"pdf_url":"https://arxiv.org/pdf/2412.03077v1.pdf","comment":"Project Page: https://rodygs.github.io/"},{"id":"http://arxiv.org/abs/2412.03069v1","updated":"2024-12-04T06:46:55Z","published":"2024-12-04T06:46:55Z","title":"TokenFlow: Unified Image Tokenizer for Multimodal Understanding and\n Generation","summary":" We present TokenFlow, a novel unified image tokenizer that bridges the\nlong-standing gap between multimodal understanding and generation. Prior\nresearch attempt to employ a single reconstruction-targeted Vector Quantization\n(VQ) encoder for unifying these two tasks. We observe that understanding and\ngeneration require fundamentally different granularities of visual information.\nThis leads to a critical trade-off, particularly compromising performance in\nmultimodal understanding tasks. TokenFlow addresses this challenge through an\ninnovative dual-codebook architecture that decouples semantic and pixel-level\nfeature learning while maintaining their alignment via a shared mapping\nmechanism. This design enables direct access to both high-level semantic\nrepresentations crucial for understanding tasks and fine-grained visual\nfeatures essential for generation through shared indices. Our extensive\nexperiments demonstrate TokenFlow's superiority across multiple dimensions.\nLeveraging TokenFlow, we demonstrate for the first time that discrete visual\ninput can surpass LLaVA-1.5 13B in understanding performance, achieving a 7.2\\%\naverage improvement. For image reconstruction, we achieve a strong FID score of\n0.63 at 384*384 resolution. 
Moreover, TokenFlow establishes state-of-the-art\nperformance in autoregressive image generation with a GenEval score of 0.55 at\n256*256 resolution, achieving comparable results to SDXL.\n","authors":["Liao Qu","Huichao Zhang","Yiheng Liu","Xu Wang","Yi Jiang","Yiming Gao","Hu Ye","Daniel K. Du","Zehuan Yuan","Xinglong Wu"],"pdf_url":"https://arxiv.org/pdf/2412.03069v1.pdf","comment":"https://byteflow-ai.github.io/TokenFlow/"},{"id":"http://arxiv.org/abs/2407.19652v2","updated":"2024-12-04T06:44:18Z","published":"2024-07-29T02:34:51Z","title":"SALVE: A 3D Reconstruction Benchmark of Wounds from Consumer-grade\n Videos","summary":" Managing chronic wounds is a global challenge that can be alleviated by the\nadoption of automatic systems for clinical wound assessment from consumer-grade\nvideos. While 2D image analysis approaches are insufficient for handling the 3D\nfeatures of wounds, existing approaches utilizing 3D reconstruction methods\nhave not been thoroughly evaluated. To address this gap, this paper presents a\ncomprehensive study on 3D wound reconstruction from consumer-grade videos.\nSpecifically, we introduce the SALVE dataset, comprising video recordings of\nrealistic wound phantoms captured with different cameras. Using this dataset,\nwe assess the accuracy and precision of state-of-the-art methods for 3D\nreconstruction, ranging from traditional photogrammetry pipelines to advanced\nneural rendering approaches. In our experiments, we observe that photogrammetry\napproaches do not provide smooth surfaces suitable for precise clinical\nmeasurements of wounds. 
Neural rendering approaches show promise in addressing\nthis issue, advancing the use of this technology in wound care practices.\n","authors":["Remi Chierchia","Leo Lebrat","David Ahmedt-Aristizabal","Olivier Salvado","Clinton Fookes","Rodrigo Santa Cruz"],"pdf_url":"https://arxiv.org/pdf/2407.19652v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.01820v2","updated":"2024-12-04T06:38:22Z","published":"2024-12-02T18:58:04Z","title":"Towards Universal Soccer Video Understanding","summary":" As a globally celebrated sport, soccer has attracted widespread interest from\nfans all over the world. This paper aims to develop a comprehensive multi-modal\nframework for soccer video understanding. Specifically, we make the following\ncontributions in this paper: (i) we introduce SoccerReplay-1988, the largest\nmulti-modal soccer dataset to date, featuring videos and detailed annotations\nfrom 1,988 complete matches, with an automated annotation pipeline; (ii) we\npresent the first visual-language foundation model in the soccer domain,\nMatchVision, which leverages spatiotemporal information across soccer videos\nand excels in various downstream tasks; (iii) we conduct extensive experiments\nand ablation studies on event classification, commentary generation, and\nmulti-view foul recognition. MatchVision demonstrates state-of-the-art\nperformance on all of them, substantially outperforming existing models, which\nhighlights the superiority of our proposed data and model. 
We believe that this\nwork will offer a standard paradigm for sports understanding research.\n","authors":["Jiayuan Rao","Haoning Wu","Hao Jiang","Ya Zhang","Yanfeng Wang","Weidi Xie"],"pdf_url":"https://arxiv.org/pdf/2412.01820v2.pdf","comment":"Technical Report; Project Page: https://jyrao.github.io/UniSoccer/"},{"id":"http://arxiv.org/abs/2402.06251v2","updated":"2024-12-04T06:36:28Z","published":"2024-02-09T08:59:37Z","title":"Insomnia Identification via Electroencephalography","summary":" Insomnia is a serious sleep disorder caused by abnormal or excessive neural\nactivity in the brain. An estimated 50 million people worldwide are thought to\nbe affected by this condition, which is the second most severe neurological\ndisease after stroke. In order to ensure a quick recovery, an early and\naccurate diagnosis of insomnia enables more effective drug and treatment\nadministration. This study proposes a method that uses deep learning to\nautomatically identify patients with insomnia. A set of optimal features are\nextracted from spectral and temporal domains, including the relative power of\n{\\sigma}, \\b{eta} and {\\gamma} bands, the total power, the absolute slow wave\npower, the power ratios of {\\theta}, {\\alpha}, {\\gamma}, \\b{eta},\n{\\theta}/{\\alpha}, {\\theta}/\\b{eta}, {\\alpha}/{\\gamma} and {\\alpha}/\\b{eta},\nmean, zero crossing rate, mobility, complexity, sleep efficiency and total\nsleep time, to accurately quantify the differences between insomnia patients\nand healthy subjects and develops a 1D CNN model for the classification\nprocess. With the experiments use Fp2 and C4 EEG channels with 50 insomnia\npatients and 50 healthy subjects, the proposed model arrives 99.34% accuracy\nwithout sleep stage annotation. 
Using the features only from a single channel,\nthe study proposes a smart solution for insomnia patients which allows machine\nlearning to be to simplify current sleep monitoring hardware and improve\nin-home ambulatory monitoring.\n","authors":["Olviya Udeshika","Dilshan Lakshitha","Nilantha Premakumara","Surangani Bandara"],"pdf_url":"https://arxiv.org/pdf/2402.06251v2.pdf","comment":"This submission was made without all co-authors consent"},{"id":"http://arxiv.org/abs/2412.03061v1","updated":"2024-12-04T06:33:27Z","published":"2024-12-04T06:33:27Z","title":"Lightweight Stochastic Video Prediction via Hybrid Warping","summary":" Accurate video prediction by deep neural networks, especially for dynamic\nregions, is a challenging task in computer vision for critical applications\nsuch as autonomous driving, remote working, and telemedicine. Due to inherent\nuncertainties, existing prediction models often struggle with the complexity of\nmotion dynamics and occlusions. In this paper, we propose a novel stochastic\nlong-term video prediction model that focuses on dynamic regions by employing a\nhybrid warping strategy. By integrating frames generated through forward and\nbackward warpings, our approach effectively compensates for the weaknesses of\neach technique, improving the prediction accuracy and realism of moving regions\nin videos while also addressing uncertainty by making stochastic predictions\nthat account for various motions. Furthermore, considering real-time\npredictions, we introduce a MobileNet-based lightweight architecture into our\nmodel. 
Our model, called SVPHW, achieves state-of-the-art performance on two\nbenchmark datasets.\n","authors":["Kazuki Kotoyori","Shota Hirose","Heming Sun","Jiro Katto"],"pdf_url":"https://arxiv.org/pdf/2412.03061v1.pdf","comment":"IEEE VCIP 2024"},{"id":"http://arxiv.org/abs/2412.03059v1","updated":"2024-12-04T06:26:12Z","published":"2024-12-04T06:26:12Z","title":"CLAP: Unsupervised 3D Representation Learning for Fusion 3D Perception\n via Curvature Sampling and Prototype Learning","summary":" Unsupervised 3D representation learning via masked-and-reconstruction with\ndifferentiable rendering is promising to reduce the labeling burden for fusion\n3D perception. However, previous literature conduct pre-training for different\nmodalities separately because of the hight GPU memory consumption.\nConsequently, the interaction between the two modalities (images and point\nclouds) is neglected during pre-training. In this paper, we explore joint\nunsupervised pre-training for fusion 3D perception via differentiable rendering\nand propose CLAP, short for Curvature sampLing and swApping Prototype\nassignment prediction. The contributions are three-fold. 1) To overcome the GPU\nmemory consumption problem, we propose Curvature Sampling to sample the more\ninformative points/pixels for pre-training. 2) We propose to use learnable\nprototypes to represent parts of the scenes in a common feature space and bring\nthe idea of swapping prototype assignment prediction to learn the interaction\nbetween the two modalities. 3) To further optimize learnable prototypes, we\npropose an Expectation-Maximization training scheme to maximize the similarity\nbetween embeddings and prototypes, followed by a Gram Matrix Regularization\nLoss to avoid collapse. Experiment results on NuScenes show that CLAP achieves\n300% more performance gain as compared to previous SOTA 3D pre-training method\nvia differentiable rendering. 
Codes and models will be released.\n","authors":["Runjian Chen","Hang Zhang","Avinash Ravichandran","Wenqi Shao","Alex Wong","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2412.03059v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03058v1","updated":"2024-12-04T06:25:26Z","published":"2024-12-04T06:25:26Z","title":"Revisiting Energy-Based Model for Out-of-Distribution Detection","summary":" Out-of-distribution (OOD) detection is an essential approach to robustifying\ndeep learning models, enabling them to identify inputs that fall outside of\ntheir trained distribution. Existing OOD detection methods usually depend on\ncrafted data, such as specific outlier datasets or elaborate data\naugmentations. While this is reasonable, the frequent mismatch between crafted\ndata and OOD data limits model robustness and generalizability. In response to\nthis issue, we introduce Outlier Exposure by Simple Transformations (OEST), a\nframework that enhances OOD detection by leveraging \"peripheral-distribution\"\n(PD) data. Specifically, PD data are samples generated through simple data\ntransformations, thus providing an efficient alternative to manually curated\noutliers.\n We adopt energy-based models (EBMs) to study PD data. We recognize the\n\"energy barrier\" in OOD detection, which characterizes the energy difference\nbetween in-distribution (ID) and OOD samples and eases detection. PD data are\nintroduced to establish the energy barrier during training. Furthermore, this\nenergy barrier concept motivates a theoretically grounded energy-barrier loss\nto replace the classical energy-bounded loss, leading to an improved paradigm,\nOEST*, which achieves a more effective and theoretically sound separation\nbetween ID and OOD samples. 
We perform empirical validation of our proposal,\nand extensive experiments across various benchmarks demonstrate that OEST*\nachieves better or similar accuracy compared with state-of-the-art methods.\n","authors":["Yifan Wu","Xichen Ye","Songmin Dai","Dengye Pan","Xiaoqiang Li","Weizhong Zhang","Yifan Chen"],"pdf_url":"https://arxiv.org/pdf/2412.03058v1.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2412.03056v1","updated":"2024-12-04T06:20:51Z","published":"2024-12-04T06:20:51Z","title":"Point-GN: A Non-Parametric Network Using Gaussian Positional Encoding\n for Point Cloud Classification","summary":" This paper introduces Point-GN, a novel non-parametric network for efficient\nand accurate 3D point cloud classification. Unlike conventional deep learning\nmodels that rely on a large number of trainable parameters, Point-GN leverages\nnon-learnable components-specifically, Farthest Point Sampling (FPS), k-Nearest\nNeighbors (k-NN), and Gaussian Positional Encoding (GPE)-to extract both local\nand global geometric features. This design eliminates the need for additional\ntraining while maintaining high performance, making Point-GN particularly\nsuited for real-time, resource-constrained applications. We evaluate Point-GN\non two benchmark datasets, ModelNet40 and ScanObjectNN, achieving\nclassification accuracies of 85.29% and 85.89%, respectively, while\nsignificantly reducing computational complexity. Point-GN outperforms existing\nnon-parametric methods and matches the performance of fully trained models, all\nwith zero learnable parameters. 
Our results demonstrate that Point-GN is a\npromising solution for 3D point cloud classification in practical, real-time\nenvironments.\n","authors":["Marzieh Mohammadi","Amir Salarpour"],"pdf_url":"https://arxiv.org/pdf/2412.03056v1.pdf","comment":"This paper has been accepted for presentation at the IEEE Winter\n Conference on Applications of Computer Vision (WACV) 2025"},{"id":"http://arxiv.org/abs/2412.03055v1","updated":"2024-12-04T06:20:36Z","published":"2024-12-04T06:20:36Z","title":"Real-Time AIoT for UAV Antenna Interference Detection via Edge-Cloud\n Collaboration","summary":" In the fifth-generation (5G) era, eliminating communication interference\nsources is crucial for maintaining network performance. Interference often\noriginates from unauthorized or malfunctioning antennas, and radio monitoring\nagencies must address numerous sources of such antennas annually. Unmanned\naerial vehicles (UAVs) can improve inspection efficiency. However, the data\ntransmission delay in the existing cloud-only (CO) artificial intelligence (AI)\nmode fails to meet the low latency requirements for real-time performance.\nTherefore, we propose a computer vision-based AI of Things (AIoT) system to\ndetect antenna interference sources for UAVs. The system adopts an optimized\nedge-cloud collaboration (ECC+) mode, combining a keyframe selection algorithm\n(KSA), focusing on reducing end-to-end latency (E2EL) and ensuring reliable\ndata transmission, which aligns with the core principles of ultra-reliable\nlow-latency communication (URLLC). At the core of our approach is an end-to-end\nantenna localization scheme based on the tracking-by-detection (TBD) paradigm,\nincluding a detector (EdgeAnt) and a tracker (AntSort). EdgeAnt achieves\nstate-of-the-art (SOTA) performance with a mean average precision (mAP) of\n42.1% on our custom antenna interference source dataset, requiring only 3\nmillion parameters and 14.7 GFLOPs. 
On the COCO dataset, EdgeAnt achieves 38.9%\nmAP with 5.4 GFLOPs. We deployed EdgeAnt on Jetson Xavier NX (TRT) and\nRaspberry Pi 4B (NCNN), achieving real-time inference speeds of 21.1 (1088) and\n4.8 (640) frames per second (FPS), respectively. Compared with CO mode, the\nECC+ mode reduces E2EL by 88.9%, increases accuracy by 28.2%. Additionally, the\nsystem offers excellent scalability for coordinated multiple UAVs inspections.\nThe detector code is publicly available at\nhttps://github.com/SCNU-RISLAB/EdgeAnt.\n","authors":["Jun Dong","Jintao Cheng","Jin Wu","Chengxi Zhang","Shunyi Zhao","Xiaoyu Tang"],"pdf_url":"https://arxiv.org/pdf/2412.03055v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.18407v2","updated":"2024-12-04T06:18:16Z","published":"2024-05-28T17:47:19Z","title":"Phased Consistency Models","summary":" Consistency Models (CMs) have made significant progress in accelerating the\ngeneration of diffusion models. However, their application to high-resolution,\ntext-conditioned image generation in the latent space remains unsatisfactory.\nIn this paper, we identify three key flaws in the current design of Latent\nConsistency Models (LCMs). We investigate the reasons behind these limitations\nand propose Phased Consistency Models (PCMs), which generalize the design space\nand address the identified limitations. Our evaluations demonstrate that PCMs\noutperform LCMs across 1--16 step generation settings. While PCMs are\nspecifically designed for multi-step refinement, they achieve comparable 1-step\ngeneration results to previously state-of-the-art specifically designed 1-step\nmethods. Furthermore, we show the methodology of PCMs is versatile and\napplicable to video generation, enabling us to train the state-of-the-art\nfew-step text-to-video generator. 
Our code is available at\nhttps://github.com/G-U-N/Phased-Consistency-Model.\n","authors":["Fu-Yun Wang","Zhaoyang Huang","Alexander William Bergman","Dazhong Shen","Peng Gao","Michael Lingelbach","Keqiang Sun","Weikang Bian","Guanglu Song","Yu Liu","Xiaogang Wang","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2405.18407v2.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2412.03054v1","updated":"2024-12-04T06:17:24Z","published":"2024-12-04T06:17:24Z","title":"TREND: Unsupervised 3D Representation Learning via Temporal Forecasting\n for LiDAR Perception","summary":" Labeling LiDAR point clouds is notoriously time-and-energy-consuming, which\nspurs recent unsupervised 3D representation learning methods to alleviate the\nlabeling burden in LiDAR perception via pretrained weights. Almost all existing\nwork focus on a single frame of LiDAR point cloud and neglect the temporal\nLiDAR sequence, which naturally accounts for object motion (and their\nsemantics). Instead, we propose TREND, namely Temporal REndering with Neural\nfielD, to learn 3D representation via forecasting the future observation in an\nunsupervised manner. Unlike existing work that follows conventional contrastive\nlearning or masked auto encoding paradigms, TREND integrates forecasting for 3D\npre-training through a Recurrent Embedding scheme to generate 3D embedding\nacross time and a Temporal Neural Field to represent the 3D scene, through\nwhich we compute the loss using differentiable rendering. To our best\nknowledge, TREND is the first work on temporal forecasting for unsupervised 3D\nrepresentation learning. We evaluate TREND on downstream 3D object detection\ntasks on popular datasets, including NuScenes, Once and Waymo. 
Experiment\nresults show that TREND brings up to 90% more improvement as compared to\nprevious SOTA unsupervised 3D pre-training methods and generally improve\ndifferent downstream models across datasets, demonstrating that indeed temporal\nforecasting brings improvement for LiDAR perception. Codes and models will be\nreleased.\n","authors":["Runjian Chen","Hyoungseob Park","Bo Zhang","Wenqi Shao","Ping Luo","Alex Wong"],"pdf_url":"https://arxiv.org/pdf/2412.03054v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03052v1","updated":"2024-12-04T06:12:19Z","published":"2024-12-04T06:12:19Z","title":"Point-GR: Graph Residual Point Cloud Network for 3D Object\n Classification and Segmentation","summary":" In recent years, the challenge of 3D shape analysis within point cloud data\nhas gathered significant attention in computer vision. Addressing the\ncomplexities of effective 3D information representation and meaningful feature\nextraction for classification tasks remains crucial. This paper presents\nPoint-GR, a novel deep learning architecture designed explicitly to transform\nunordered raw point clouds into higher dimensions while preserving local\ngeometric features. It introduces residual-based learning within the network to\nmitigate the point permutation issues in point cloud data. 
The proposed\nPoint-GR network significantly reduced the number of network parameters in\nClassification and Part-Segmentation compared to baseline graph-based networks.\nNotably, the Point-GR model achieves a state-of-the-art scene segmentation mean\nIoU of 73.47% on the S3DIS benchmark dataset, showcasing its effectiveness.\nFurthermore, the model shows competitive results in Classification and\nPart-Segmentation tasks.\n","authors":["Md Meraz","Md Afzal Ansari","Mohammed Javed","Pavan Chakraborty"],"pdf_url":"https://arxiv.org/pdf/2412.03052v1.pdf","comment":"ICPR 2024 G2SP-CV Workshop, Dec 1-5, 2024 Kolkata, India"},{"id":"http://arxiv.org/abs/2312.17225v3","updated":"2024-12-04T06:11:50Z","published":"2023-12-28T18:53:39Z","title":"4DGen: Grounded 4D Content Generation with Spatial-temporal Consistency","summary":" Aided by text-to-image and text-to-video diffusion models, existing 4D\ncontent creation pipelines utilize score distillation sampling to optimize the\nentire dynamic 3D scene. However, as these pipelines generate 4D content from\ntext or image inputs directly, they are constrained by limited motion\ncapabilities and depend on unreliable prompt engineering for desired results.\nTo address these problems, this work introduces \\textbf{4DGen}, a novel\nframework for grounded 4D content creation. We identify monocular video\nsequences as a key component in constructing the 4D content. Our pipeline\nfacilitates controllable 4D generation, enabling users to specify the motion\nvia monocular video or adopt image-to-video generations, thus offering superior\ncontrol over content creation. Furthermore, we construct our 4D representation\nusing dynamic 3D Gaussians, which permits efficient, high-resolution\nsupervision through rendering during training, thereby facilitating\nhigh-quality 4D generation. 
Additionally, we employ spatial-temporal pseudo\nlabels on anchor frames, along with seamless consistency priors implemented\nthrough 3D-aware score distillation sampling and smoothness regularizations.\nCompared to existing video-to-4D baselines, our approach yields superior\nresults in faithfully reconstructing input signals and realistically inferring\nrenderings from novel viewpoints and timesteps. More importantly, compared to\nprevious image-to-4D and text-to-4D works, 4DGen supports grounded generation,\noffering users enhanced control and improved motion generation capabilities, a\nfeature difficult to achieve with previous methods. Project page:\nhttps://vita-group.github.io/4DGen/\n","authors":["Yuyang Yin","Dejia Xu","Zhangyang Wang","Yao Zhao","Yunchao Wei"],"pdf_url":"https://arxiv.org/pdf/2312.17225v3.pdf","comment":"Project page: https://vita-group.github.io/4DGen/"},{"id":"http://arxiv.org/abs/2312.05915v2","updated":"2024-12-04T05:56:29Z","published":"2023-12-10T15:28:56Z","title":"Diffusion for Natural Image Matting","summary":" We aim to leverage diffusion to address the challenging image matting task.\nHowever, the presence of high computational overhead and the inconsistency of\nnoise sampling between the training and inference processes pose significant\nobstacles to achieving this goal. In this paper, we present DiffMatte, a\nsolution designed to effectively overcome these challenges. First, DiffMatte\ndecouples the decoder from the intricately coupled matting network design,\ninvolving only one lightweight decoder in the iterations of the diffusion\nprocess. With such a strategy, DiffMatte mitigates the growth of computational\noverhead as the number of samples increases. Second, we employ a self-aligned\ntraining strategy with uniform time intervals, ensuring a consistent noise\nsampling between training and inference across the entire time domain. 
Our\nDiffMatte is designed with flexibility in mind and can seamlessly integrate\ninto various modern matting architectures. Extensive experimental results\ndemonstrate that DiffMatte not only reaches the state-of-the-art level on the\nComposition-1k test set, surpassing the best methods in the past by 5% and 15%\nin the SAD metric and MSE metric respectively, but also show stronger\ngeneralization ability in other benchmarks.\n","authors":["Yihan Hu","Yiheng Lin","Wei Wang","Yao Zhao","Yunchao Wei","Humphrey Shi"],"pdf_url":"https://arxiv.org/pdf/2312.05915v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.14868v2","updated":"2024-12-04T05:53:39Z","published":"2024-11-22T11:34:18Z","title":"Defective Edge Detection Using Cascaded Ensemble Canny Operator","summary":" Edge detection has been one of the most difficult challenges in computer\nvision because of the difficulty in identifying the borders and edges from the\nreal-world images including objects of varying kinds and sizes. Methods based\non ensemble learning, which use a combination of backbones and attention\nmodules, outperformed more conventional approaches, such as Sobel and Canny\nedge detection. Nevertheless, these algorithms are still challenged when faced\nwith complicated scene photos. In addition, the identified edges utilizing the\ncurrent methods are not refined and often include incorrect edges. In this\nwork, we used a Cascaded Ensemble Canny operator to solve these problems and\ndetect the object edges. The most difficult Fresh and Rotten and Berkeley\ndatasets are used to test the suggested approach in Python. 
In terms of\nperformance metrics and output picture quality, the acquired results outperform\nthe specified edge detection networks\n","authors":["Anjali Nambiyar Rajkumar Kannan"],"pdf_url":"https://arxiv.org/pdf/2411.14868v2.pdf","comment":"2 Pages and 2 Figures"},{"id":"http://arxiv.org/abs/2412.03044v1","updated":"2024-12-04T05:43:53Z","published":"2024-12-04T05:43:53Z","title":"Frequency-Guided Diffusion Model with Perturbation Training for\n Skeleton-Based Video Anomaly Detection","summary":" Video anomaly detection is an essential yet challenging open-set task in\ncomputer vision, often addressed by leveraging reconstruction as a proxy task.\nHowever, existing reconstruction-based methods encounter challenges in two main\naspects: (1) limited model robustness for open-set scenarios, (2) and an\noveremphasis on, but restricted capacity for, detailed motion reconstruction.\nTo this end, we propose a novel frequency-guided diffusion model with\nperturbation training, which enhances the model robustness by perturbation\ntraining and emphasizes the principal motion components guided by motion\nfrequencies. Specifically, we first use a trainable generator to produce\nperturbative samples for perturbation training of the diffusion model. During\nthe perturbation training phase, the model robustness is enhanced and the\ndomain of the reconstructed model is broadened by training against this\ngenerator. Subsequently, perturbative samples are introduced for inference,\nwhich impacts the reconstruction of normal and abnormal motions differentially,\nthereby enhancing their separability. Considering that motion details originate\nfrom high-frequency information, we propose a masking method based on 2D\ndiscrete cosine transform to separate high-frequency information and\nlow-frequency information. 
Guided by the high-frequency information from\nobserved motion, the diffusion model can focus on generating low-frequency\ninformation, and thus reconstructing the motion accurately. Experimental\nresults on five video anomaly detection datasets, including human-related and\nopen-set benchmarks, demonstrate the effectiveness of the proposed method. Our\ncode is available at https://github.com/Xiaofeng-Tan/FGDMAD-Code.\n","authors":["Xiaofeng Tan","Hongsong Wang","Xin Geng"],"pdf_url":"https://arxiv.org/pdf/2412.03044v1.pdf","comment":null}],"Artificial Intelligence":[{"id":"http://arxiv.org/abs/2412.03572v1","updated":"2024-12-04T18:59:45Z","published":"2024-12-04T18:59:45Z","title":"Navigation World Models","summary":" Navigation is a fundamental skill of agents with visual-motor capabilities.\nWe introduce a Navigation World Model (NWM), a controllable video generation\nmodel that predicts future visual observations based on past observations and\nnavigation actions. To capture complex environment dynamics, NWM employs a\nConditional Diffusion Transformer (CDiT), trained on a diverse collection of\negocentric videos of both human and robotic agents, and scaled up to 1 billion\nparameters. In familiar environments, NWM can plan navigation trajectories by\nsimulating them and evaluating whether they achieve the desired goal. Unlike\nsupervised navigation policies with fixed behavior, NWM can dynamically\nincorporate constraints during planning. Experiments demonstrate its\neffectiveness in planning trajectories from scratch or by ranking trajectories\nsampled from an external policy. 
Furthermore, NWM leverages its learned visual\npriors to imagine trajectories in unfamiliar environments from a single input\nimage, making it a flexible and powerful tool for next-generation navigation\nsystems.\n","authors":["Amir Bar","Gaoyue Zhou","Danny Tran","Trevor Darrell","Yann LeCun"],"pdf_url":"https://arxiv.org/pdf/2412.03572v1.pdf","comment":"project page: https://www.amirbar.net/nwm/"},{"id":"http://arxiv.org/abs/2412.03568v1","updated":"2024-12-04T18:59:05Z","published":"2024-12-04T18:59:05Z","title":"The Matrix: Infinite-Horizon World Generation with Real-Time Moving\n Control","summary":" We present The Matrix, the first foundational realistic world simulator\ncapable of generating continuous 720p high-fidelity real-scene video streams\nwith real-time, responsive control in both first- and third-person\nperspectives, enabling immersive exploration of richly dynamic environments.\nTrained on limited supervised data from AAA games like Forza Horizon 5 and\nCyberpunk 2077, complemented by large-scale unsupervised footage from\nreal-world settings like Tokyo streets, The Matrix allows users to traverse\ndiverse terrains -- deserts, grasslands, water bodies, and urban landscapes --\nin continuous, uncut hour-long sequences. Operating at 16 FPS, the system\nsupports real-time interactivity and demonstrates zero-shot generalization,\ntranslating virtual game environments to real-world contexts where collecting\ncontinuous movement data is often infeasible. For example, The Matrix can\nsimulate a BMW X3 driving through an office setting--an environment present in\nneither gaming data nor real-world sources. 
This approach showcases the\npotential of AAA game data to advance robust world models, bridging the gap\nbetween simulations and real-world applications in scenarios with limited data.\n","authors":["Ruili Feng","Han Zhang","Zhantao Yang","Jie Xiao","Zhilei Shu","Zhiheng Liu","Andy Zheng","Yukun Huang","Yu Liu","Hongyang Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.03568v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03561v1","updated":"2024-12-04T18:56:04Z","published":"2024-12-04T18:56:04Z","title":"FLAIR: VLM with Fine-grained Language-informed Image Representations","summary":" CLIP has shown impressive results in aligning images and texts at scale.\nHowever, its ability to capture detailed visual features remains limited\nbecause CLIP matches images and texts at a global level. To address this issue,\nwe propose FLAIR, Fine-grained Language-informed Image Representations, an\napproach that utilizes long and detailed image descriptions to learn localized\nimage embeddings. By sampling diverse sub-captions that describe fine-grained\ndetails about an image, we train our vision-language model to produce not only\nglobal embeddings but also text-specific image representations. Our model\nintroduces text-conditioned attention pooling on top of local image tokens to\nproduce fine-grained image representations that excel at retrieving detailed\nimage content. We achieve state-of-the-art performance on both, existing\nmultimodal retrieval benchmarks, as well as, our newly introduced fine-grained\nretrieval task which evaluates vision-language models' ability to retrieve\npartial image content. Furthermore, our experiments demonstrate the\neffectiveness of FLAIR trained on 30M image-text pairs in capturing\nfine-grained visual information, including zero-shot semantic segmentation,\noutperforming models trained on billions of pairs. 
Code is available at\nhttps://github.com/ExplainableML/flair .\n","authors":["Rui Xiao","Sanghwan Kim","Mariana-Iuliana Georgescu","Zeynep Akata","Stephan Alaniz"],"pdf_url":"https://arxiv.org/pdf/2412.03561v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03556v1","updated":"2024-12-04T18:51:32Z","published":"2024-12-04T18:51:32Z","title":"Best-of-N Jailbreaking","summary":" We introduce Best-of-N (BoN) Jailbreaking, a simple black-box algorithm that\njailbreaks frontier AI systems across modalities. BoN Jailbreaking works by\nrepeatedly sampling variations of a prompt with a combination of augmentations\n- such as random shuffling or capitalization for textual prompts - until a\nharmful response is elicited. We find that BoN Jailbreaking achieves high\nattack success rates (ASRs) on closed-source language models, such as 89% on\nGPT-4o and 78% on Claude 3.5 Sonnet when sampling 10,000 augmented prompts.\nFurther, it is similarly effective at circumventing state-of-the-art\nopen-source defenses like circuit breakers. BoN also seamlessly extends to\nother modalities: it jailbreaks vision language models (VLMs) such as GPT-4o\nand audio language models (ALMs) like Gemini 1.5 Pro, using modality-specific\naugmentations. BoN reliably improves when we sample more augmented prompts.\nAcross all modalities, ASR, as a function of the number of samples (N),\nempirically follows power-law-like behavior for many orders of magnitude. BoN\nJailbreaking can also be composed with other black-box algorithms for even more\neffective attacks - combining BoN with an optimized prefix attack achieves up\nto a 35% increase in ASR. 
Overall, our work indicates that, despite their\ncapability, language models are sensitive to seemingly innocuous changes to\ninputs, which attackers can exploit across modalities.\n","authors":["John Hughes","Sara Price","Aengus Lynch","Rylan Schaeffer","Fazl Barez","Sanmi Koyejo","Henry Sleight","Erik Jones","Ethan Perez","Mrinank Sharma"],"pdf_url":"https://arxiv.org/pdf/2412.03556v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03548v1","updated":"2024-12-04T18:45:35Z","published":"2024-12-04T18:45:35Z","title":"Perception Tokens Enhance Visual Reasoning in Multimodal Language Models","summary":" Multimodal language models (MLMs) still face challenges in fundamental visual\nperception tasks where specialized models excel. Tasks requiring reasoning\nabout 3D structures benefit from depth estimation, and reasoning about 2D\nobject instances benefits from object detection. Yet, MLMs can not produce\nintermediate depth or boxes to reason over. Finetuning MLMs on relevant data\ndoesn't generalize well and outsourcing computation to specialized vision tools\nis too compute-intensive and memory-inefficient. To address this, we introduce\nPerception Tokens, intrinsic image representations designed to assist reasoning\ntasks where language is insufficient. Perception tokens act as auxiliary\nreasoning tokens, akin to chain-of-thought prompts in language models. For\nexample, in a depth-related task, an MLM augmented with perception tokens can\nreason by generating a depth map as tokens, enabling it to solve the problem\neffectively. We propose AURORA, a training method that augments MLMs with\nperception tokens for improved reasoning over visual inputs. AURORA leverages a\nVQVAE to transform intermediate image representations, such as depth maps into\na tokenized format and bounding box tokens, which is then used in a multi-task\ntraining framework. 
AURORA achieves notable improvements across counting\nbenchmarks: +10.8% on BLINK, +11.3% on CVBench, and +8.3% on SEED-Bench,\noutperforming finetuning approaches in generalization across datasets. It also\nimproves on relative depth: over +6% on BLINK. With perception tokens, AURORA\nexpands the scope of MLMs beyond language-based reasoning, paving the way for\nmore effective visual reasoning capabilities.\n","authors":["Mahtab Bigverdi","Zelun Luo","Cheng-Yu Hsieh","Ethan Shen","Dongping Chen","Linda G. Shapiro","Ranjay Krishna"],"pdf_url":"https://arxiv.org/pdf/2412.03548v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.19379v2","updated":"2024-12-04T18:40:24Z","published":"2024-11-28T21:10:20Z","title":"Marconi: Prefix Caching for the Era of Hybrid LLMs","summary":" Hybrid models that combine the language modeling capabilities of Attention\nlayers with the efficiency of Recurrent layers (e.g., State Space Models) have\ngained traction in practically supporting long contexts in Large Language Model\nserving. Yet, the unique properties of these models complicate the usage of\ncomplementary efficiency optimizations such as prefix caching that skip\nredundant computations across requests. Most notably, their use of in-place\nstate updates for recurrent layers precludes rolling back cache entries for\npartial sequence overlaps, and instead mandates only exact-match cache hits;\nthe effect is a deluge of (large) cache entries per sequence, most of which\nyield minimal reuse opportunities. We present Marconi, the first system that\nsupports efficient prefix caching with Hybrid LLMs. Key to Marconi are its\nnovel admission and eviction policies that more judiciously assess potential\ncache entries based not only on recency, but also on (1) forecasts of their\nreuse likelihood across a taxonomy of different hit scenarios, and (2) the\ncompute savings that hits deliver relative to memory footprints. 
Across diverse\nworkloads and Hybrid models, Marconi achieves up to 34.4$\\times$ higher token\nhit rates (71.1% or 617 ms lower TTFT) compared to state-of-the-art prefix\ncaching systems.\n","authors":["Rui Pan","Zhuang Wang","Zhen Jia","Can Karakus","Luca Zancato","Tri Dao","Yida Wang","Ravi Netravali"],"pdf_url":"https://arxiv.org/pdf/2411.19379v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03539v1","updated":"2024-12-04T18:36:09Z","published":"2024-12-04T18:36:09Z","title":"NODE-AdvGAN: Improving the transferability and perceptual similarity of\n adversarial examples by dynamic-system-driven adversarial generative model","summary":" Understanding adversarial examples is crucial for improving the model's\nrobustness, as they introduce imperceptible perturbations that deceive models.\nEffective adversarial examples, therefore, offer the potential to train more\nrobust models by removing their singularities. We propose NODE-AdvGAN, a novel\napproach that treats adversarial generation as a continuous process and employs\na Neural Ordinary Differential Equation (NODE) for simulating the dynamics of\nthe generator. By mimicking the iterative nature of traditional gradient-based\nmethods, NODE-AdvGAN generates smoother and more precise perturbations that\npreserve high perceptual similarity when added to benign images. 
We also\npropose a new training strategy, NODE-AdvGAN-T, which enhances transferability\nin black-box attacks by effectively tuning noise parameters during training.\nExperiments demonstrate that NODE-AdvGAN and NODE-AdvGAN-T generate more\neffective adversarial examples that achieve higher attack success rates while\npreserving better perceptual quality than traditional GAN-based methods.\n","authors":["Xinheng Xie","Yue Wu","Cuiyu He"],"pdf_url":"https://arxiv.org/pdf/2412.03539v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03537v1","updated":"2024-12-04T18:32:42Z","published":"2024-12-04T18:32:42Z","title":"Evaluating Gender Bias Transfer between Pre-trained and Prompt-Adapted\n Language Models","summary":" Large language models (LLMs) are increasingly being adapted to achieve\ntask-specificity for deployment in real-world decision systems. Several\nprevious works have investigated the bias transfer hypothesis (BTH) by studying\nthe effect of the fine-tuning adaptation strategy on model fairness to find\nthat fairness in pre-trained masked language models have limited effect on the\nfairness of models when adapted using fine-tuning. In this work, we expand the\nstudy of BTH to causal models under prompt adaptations, as prompting is an\naccessible, and compute-efficient way to deploy models in real-world systems.\nIn contrast to previous works, we establish that intrinsic biases in\npre-trained Mistral, Falcon and Llama models are strongly correlated (rho >=\n0.94) with biases when the same models are zero- and few-shot prompted, using a\npronoun co-reference resolution task. Further, we find that bias transfer\nremains strongly correlated even when LLMs are specifically prompted to exhibit\nfair or biased behavior (rho >= 0.92), and few-shot length and stereotypical\ncomposition are varied (rho >= 0.97). 
Our findings highlight the importance of\nensuring fairness in pre-trained LLMs, especially when they are later used to\nperform downstream tasks via prompt adaptation.\n","authors":["Natalie Mackraz","Nivedha Sivakumar","Samira Khorshidi","Krishna Patel","Barry-John Theobald","Luca Zappella","Nicholas Apostoloff"],"pdf_url":"https://arxiv.org/pdf/2412.03537v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.11556v2","updated":"2024-12-04T18:31:44Z","published":"2023-12-17T08:07:32Z","title":"StarVector: Generating Scalable Vector Graphics Code from Images and\n Text","summary":" Scalable Vector Graphics (SVGs) are vital for modern image rendering due to\ntheir scalability and versatility. Previous SVG generation methods have focused\non curve-based vectorization, lacking semantic understanding, often producing\nartifacts, and struggling with SVG primitives beyond path curves. To address\nthese issues, we introduce StarVector, a multimodal large language model for\nSVG generation. It performs image vectorization by understanding image\nsemantics and using SVG primitives for compact, precise outputs. Unlike\ntraditional methods, StarVector works directly in the SVG code space,\nleveraging visual understanding to apply accurate SVG primitives. To train\nStarVector, we create SVG-Stack, a diverse dataset of 2M samples that enables\ngeneralization across vectorization tasks and precise use of primitives like\nellipses, polygons, and text. We address challenges in SVG evaluation, showing\nthat pixel-based metrics like MSE fail to capture the unique qualities of\nvector graphics. We introduce SVG-Bench, a benchmark across 10 datasets, and 3\ntasks: Image-to-SVG, Text-to-SVG generation, and diagram generation. Using this\nsetup, StarVector achieves state-of-the-art performance, producing more compact\nand semantically rich SVGs.\n","authors":["Juan A. Rodriguez","Abhay Puri","Shubham Agarwal","Issam H. 
Laradji","Pau Rodriguez","Sai Rajeswar","David Vazquez","Christopher Pal","Marco Pedersoli"],"pdf_url":"https://arxiv.org/pdf/2312.11556v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03526v1","updated":"2024-12-04T18:15:06Z","published":"2024-12-04T18:15:06Z","title":"Feed-Forward Bullet-Time Reconstruction of Dynamic Scenes from Monocular\n Videos","summary":" Recent advancements in static feed-forward scene reconstruction have\ndemonstrated significant progress in high-quality novel view synthesis.\nHowever, these models often struggle with generalizability across diverse\nenvironments and fail to effectively handle dynamic content. We present BTimer\n(short for BulletTimer), the first motion-aware feed-forward model for\nreal-time reconstruction and novel view synthesis of dynamic scenes. Our\napproach reconstructs the full scene in a 3D Gaussian Splatting representation\nat a given target ('bullet') timestamp by aggregating information from all the\ncontext frames. Such a formulation allows BTimer to gain scalability and\ngeneralization by leveraging both static and dynamic scene datasets. 
Given a\ncasual monocular dynamic video, BTimer reconstructs a bullet-time scene within\n150ms while reaching state-of-the-art performance on both static and dynamic\nscene datasets, even compared with optimization-based approaches.\n","authors":["Hanxue Liang","Jiawei Ren","Ashkan Mirzaei","Antonio Torralba","Ziwei Liu","Igor Gilitschenski","Sanja Fidler","Cengiz Oztireli","Huan Ling","Zan Gojcic","Jiahui Huang"],"pdf_url":"https://arxiv.org/pdf/2412.03526v1.pdf","comment":"Project website:\n https://research.nvidia.com/labs/toronto-ai/bullet-timer/"},{"id":"http://arxiv.org/abs/2412.03516v1","updated":"2024-12-04T17:57:39Z","published":"2024-12-04T17:57:39Z","title":"You're (Not) My Type -- Can LLMs Generate Feedback of Specific Types for\n Introductory Programming Tasks?","summary":" Background: Feedback as one of the most influential factors for learning has\nbeen subject to a great body of research. It plays a key role in the\ndevelopment of educational technology systems and is traditionally rooted in\ndeterministic feedback defined by experts and their experience. However, with\nthe rise of generative AI and especially Large Language Models (LLMs), we\nexpect feedback as part of learning systems to transform, especially for the\ncontext of programming. In the past, it was challenging to automate feedback\nfor learners of programming. LLMs may create new possibilities to provide\nricher, and more individual feedback than ever before.\n Objectives: This paper aims to generate specific types of feedback for\nintroductory programming tasks using LLMs. We revisit existing feedback\ntaxonomies to capture the specifics of the generated feedback, such as\nrandomness, uncertainty, and degrees of variation.\n Methods: We iteratively designed prompts for the generation of specific\nfeedback types (as part of existing feedback taxonomies) in response to\nauthentic student programs. 
We then evaluated the generated output and\ndetermined to what extent it reflected certain feedback types.\n Results and Conclusion: The present work provides a better understanding of\ndifferent feedback dimensions and characteristics. The results have\nimplications for future feedback research with regard to, for example, feedback\neffects and learners' informational needs. It further provides a basis for the\ndevelopment of new tools and learning systems for novice programmers including\nfeedback generated by AI.\n","authors":["Dominic Lohr","Hieke Keuning","Natalie Kiesler"],"pdf_url":"https://arxiv.org/pdf/2412.03516v1.pdf","comment":"Accepted at Journal of Computer Assisted Learning (2024)"},{"id":"http://arxiv.org/abs/2407.08152v2","updated":"2024-12-04T17:56:57Z","published":"2024-07-11T03:10:27Z","title":"Privacy-Preserving Data Deduplication for Enhancing Federated Learning\n of Language Models (Extended Version)","summary":" Deduplication is a vital preprocessing step that enhances machine learning\nmodel performance and saves training time and energy. However, enhancing\nfederated learning through deduplication poses challenges, especially regarding\nscalability and potential privacy violations if deduplication involves sharing\nall clients' data. In this paper, we address the problem of deduplication in a\nfederated setup by introducing a pioneering protocol, Efficient\nPrivacy-Preserving Multi-Party Deduplication (EP-MPD). It efficiently removes\nduplicates from multiple clients' datasets without compromising data privacy.\nEP-MPD is constructed in a modular fashion, utilizing two novel variants of the\nPrivate Set Intersection protocol. Our extensive experiments demonstrate the\nsignificant benefits of deduplication in federated learning of large language\nmodels. For instance, we observe up to 19.62\\% improvement in perplexity and up\nto 27.95\\% reduction in running time while varying the duplication level\nbetween 10\\% and 30\\%. 
EP-MPD effectively balances privacy and performance in\nfederated learning, making it a valuable solution for large-scale applications.\n","authors":["Aydin Abadi","Vishnu Asutosh Dasu","Sumanta Sarkar"],"pdf_url":"https://arxiv.org/pdf/2407.08152v2.pdf","comment":"Accepted at the Network and Distributed Systems Security (NDSS)\n Symposium, 2025"},{"id":"http://arxiv.org/abs/2412.03513v1","updated":"2024-12-04T17:56:49Z","published":"2024-12-04T17:56:49Z","title":"KKLIP: Knowledge Distillation Exploiting K-means Clustering for\n Language-Image Pre-Training","summary":" Recently, CLIP has emerged as a valuable model for aligning image and text\ninformation in multi-modal scenarios. However, researchers have observed\nlimitations in the ability of CLIP's text and image encoders to extract\ndetailed knowledge from caption-image pairs. In response, this paper introduces\nKKLIP, a novel approach designed to enhance the quality of CLIP by\nincorporating a new knowledge distillation (KD) method derived from Llama 2.\nOur method comprises three objectives: Text Embedding Distillation, Concept\nLearning, and Contrastive Learning. Firstly, Text Embedding Distillation\ninvolves training the KKLIP text encoder to emulate the teacher model, Llama 2.\nSecondly, Concept Learning assigns a soft concept label to each caption-image\npair through offline k-means clustering of text information from Llama 2,\nallowing KKLIP to learn from these soft concept labels. Finally, Contrastive\nLearning harmonizes text and image embeddings. 
Our experimental results\ndemonstrate that KKLIP enhances the quality of both text and image encoders.\n","authors":["Kuei-Chun Kao"],"pdf_url":"https://arxiv.org/pdf/2412.03513v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03498v1","updated":"2024-12-04T17:39:55Z","published":"2024-12-04T17:39:55Z","title":"A Bidirectional Siamese Recurrent Neural Network for Accurate Gait\n Recognition Using Body Landmarks","summary":" Gait recognition is a significant biometric technique for person\nidentification, particularly in scenarios where other physiological biometrics\nare impractical or ineffective. In this paper, we address the challenges\nassociated with gait recognition and present a novel approach to improve its\naccuracy and reliability. The proposed method leverages advanced techniques,\nincluding sequential gait landmarks obtained through the Mediapipe pose\nestimation model, Procrustes analysis for alignment, and a Siamese\nbiGRU-dualStack Neural Network architecture for capturing temporal\ndependencies. Extensive experiments were conducted on large-scale cross-view\ndatasets to demonstrate the effectiveness of the approach, achieving high\nrecognition accuracy compared to other models. The model demonstrated\naccuracies of 95.7%, 94.44%, 87.71%, and 86.6% on CASIA-B, SZU RGB-D, OU-MVLP,\nand Gait3D datasets respectively. The results highlight the potential\napplications of the proposed method in various practical domains, indicating\nits significant contribution to the field of gait recognition.\n","authors":["Proma Hossain Progga","Md. Jobayer Rahman","Swapnil Biswas","Md. 
Shakil Ahmed","Arif Reza Anwary","Swakkhar Shatabda"],"pdf_url":"https://arxiv.org/pdf/2412.03498v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02509v2","updated":"2024-12-04T17:35:48Z","published":"2024-12-03T15:48:33Z","title":"FCL-ViT: Task-Aware Attention Tuning for Continual Learning","summary":" Continual Learning (CL) involves adapting the prior Deep Neural Network (DNN)\nknowledge to new tasks, without forgetting the old ones. However, modern CL\ntechniques focus on provisioning memory capabilities to existing DNN models\nrather than designing new ones that are able to adapt according to the task at\nhand. This paper presents the novel Feedback Continual Learning Vision\nTransformer (FCL-ViT) that uses a feedback mechanism to generate real-time\ndynamic attention features tailored to the current task. The FCL-ViT operates\nin two Phases. In phase 1, the generic image features are produced and\ndetermine where the Transformer should attend on the current image. In phase 2,\ntask-specific image features are generated that leverage dynamic attention. To\nthis end, Tunable self-Attention Blocks (TABs) and Task Specific Blocks (TSBs)\nare introduced that operate in both phases and are responsible for tuning the\nTABs attention, respectively. The FCL-ViT surpasses state-of-the-art\nperformance on Continual Learning compared to benchmark methods, while\nretaining a small number of trainable DNN parameters.\n","authors":["Anestis Kaimakamidis","Ioannis Pitas"],"pdf_url":"https://arxiv.org/pdf/2412.02509v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03487v1","updated":"2024-12-04T17:24:35Z","published":"2024-12-04T17:24:35Z","title":"Flow Matching with General Discrete Paths: A Kinetic-Optimal Perspective","summary":" The design space of discrete-space diffusion or flow generative models are\nsignificantly less well-understood than their continuous-space counterparts,\nwith many works focusing only on a simple masked construction. 
In this work, we\naim to take a holistic approach to the construction of discrete generative\nmodels based on continuous-time Markov chains, and for the first time, allow\nthe use of arbitrary discrete probability paths, or colloquially, corruption\nprocesses. Through the lens of optimizing the symmetric kinetic energy, we\npropose velocity formulas that can be applied to any given probability path,\ncompletely decoupling the probability and velocity, and giving the user the\nfreedom to specify any desirable probability path based on expert knowledge\nspecific to the data domain. Furthermore, we find that a special construction\nof mixture probability paths optimizes the symmetric kinetic energy for the\ndiscrete case. We empirically validate the usefulness of this new design space\nacross multiple modalities: text generation, inorganic material generation, and\nimage generation. We find that we can outperform the mask construction even in\ntext with kinetic-optimal mixture paths, while we can make use of\ndomain-specific constructions of the probability path over the visual domain.\n","authors":["Neta Shaul","Itai Gat","Marton Havasi","Daniel Severo","Anuroop Sriram","Peter Holderrieth","Brian Karrer","Yaron Lipman","Ricky T. Q. Chen"],"pdf_url":"https://arxiv.org/pdf/2412.03487v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.13492v3","updated":"2024-12-04T17:05:02Z","published":"2024-07-18T13:20:53Z","title":"Enhancing Biomedical Knowledge Discovery for Diseases: An Open-Source\n Framework Applied on Rett Syndrome and Alzheimer's Disease","summary":" The ever-growing volume of biomedical publications creates a critical need\nfor efficient knowledge discovery. In this context, we introduce an open-source\nend-to-end framework designed to construct knowledge around specific diseases\ndirectly from raw text. 
To facilitate research in disease-related knowledge\ndiscovery, we create two annotated datasets focused on Rett syndrome and\nAlzheimer's disease, enabling the identification of semantic relations between\nbiomedical entities. Extensive benchmarking explores various ways to represent\nrelations and entity representations, offering insights into optimal modeling\nstrategies for semantic relation detection and highlighting language models'\ncompetence in knowledge discovery. We also conduct probing experiments using\ndifferent layer representations and attention scores to explore transformers'\nability to capture semantic relations.\n","authors":["Christos Theodoropoulos","Andrei Catalin Coman","James Henderson","Marie-Francine Moens"],"pdf_url":"https://arxiv.org/pdf/2407.13492v3.pdf","comment":"Published in IEEE Access, doi: 10.1109/ACCESS.2024.3509714"},{"id":"http://arxiv.org/abs/2412.03467v1","updated":"2024-12-04T16:56:20Z","published":"2024-12-04T16:56:20Z","title":"Training-Free Mitigation of Language Reasoning Degradation After\n Multimodal Instruction Tuning","summary":" Multimodal models typically combine a powerful large language model (LLM)\nwith a vision encoder and are then trained on multimodal data via instruction\ntuning. While this process adapts LLMs to multimodal settings, it remains\nunclear whether this adaptation compromises their original language reasoning\ncapabilities. In this work, we explore the effects of multimodal instruction\ntuning on language reasoning performance. We focus on LLaVA, a leading\nmultimodal framework that integrates LLMs such as Vicuna or Mistral with the\nCLIP vision encoder. We compare the performance of the original LLMs with their\nmultimodal-adapted counterparts across eight language reasoning tasks. Our\nexperiments yield several key insights. 
First, the impact of multimodal\nlearning varies between Vicuna and Mistral: we observe a degradation in\nlanguage reasoning for Mistral but improvements for Vicuna across most tasks.\nSecond, while multimodal instruction learning consistently degrades performance\non mathematical reasoning tasks (e.g., GSM8K), it enhances performance on\ncommonsense reasoning tasks (e.g., CommonsenseQA). Finally, we demonstrate that\na training-free model merging technique can effectively mitigate the language\nreasoning degradation observed in multimodal-adapted Mistral and even improve\nperformance on visual tasks.\n","authors":["Neale Ratzlaff","Man Luo","Xin Su","Vasudev Lal","Phillip Howard"],"pdf_url":"https://arxiv.org/pdf/2412.03467v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08181v2","updated":"2024-12-04T16:55:18Z","published":"2024-11-12T20:57:12Z","title":"Challenges in Guardrailing Large Language Models for Science","summary":" The rapid development in large language models (LLMs) has transformed the\nlandscape of natural language processing and understanding (NLP/NLU), offering\nsignificant benefits across various domains. However, when applied to\nscientific research, these powerful models exhibit critical failure modes\nrelated to scientific integrity and trustworthiness. Existing general-purpose\nLLM guardrails are insufficient to address these unique challenges in the\nscientific domain. We provide comprehensive guidelines for deploying LLM\nguardrails in the scientific domain. We identify specific challenges --\nincluding time sensitivity, knowledge contextualization, conflict resolution,\nand intellectual property concerns -- and propose a guideline framework for the\nguardrails that can align with scientific needs. These guardrail dimensions\ninclude trustworthiness, ethics & bias, safety, and legal aspects. 
We also\noutline in detail the implementation strategies that employ white-box,\nblack-box, and gray-box methodologies that can be enforced within scientific\ncontexts.\n","authors":["Nishan Pantha","Muthukumaran Ramasubramanian","Iksha Gurung","Manil Maskey","Rahul Ramachandran"],"pdf_url":"https://arxiv.org/pdf/2411.08181v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03465v1","updated":"2024-12-04T16:54:58Z","published":"2024-12-04T16:54:58Z","title":"YT-30M: A multi-lingual multi-category dataset of YouTube comments","summary":" This paper introduces two large-scale multilingual comment datasets, YT-30M\n(and YT-100K) from YouTube. The analysis in this paper is performed on a\nsmaller sample (YT-100K) of YT-30M. Both the datasets: YT-30M (full) and\nYT-100K (randomly selected 100K sample from YT-30M) are publicly released for\nfurther research. YT-30M (YT-100K) contains 32236173 (108694) comments posted\nby YouTube channel that belong to YouTube categories. Each comment is\nassociated with a video ID, comment ID, commentor name, commentor channel ID,\ncomment text, upvotes, original channel ID and category of the YouTube channel\n(e.g., 'News & Politics', 'Science & Technology', etc.).\n","authors":["Hridoy Sankar Dutta"],"pdf_url":"https://arxiv.org/pdf/2412.03465v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.06967v2","updated":"2024-12-04T16:54:03Z","published":"2023-05-11T16:48:58Z","title":"Data quality dimensions for fair AI","summary":" Artificial Intelligence (AI) systems are not intrinsically neutral and biases\ntrickle in any type of technological tool. In particular when dealing with\npeople, the impact of AI algorithms' technical errors originating with\nmislabeled data is undeniable. As they feed wrong and discriminatory\nclassifications, these systems are not systematically guarded against bias. In\nthis article we consider the problem of bias in AI systems from the point of\nview of data quality dimensions. 
We highlight the limited model construction of\nbias mitigation tools based on accuracy strategy, illustrating potential\nimprovements of a specific tool in gender classification errors occurring in\ntwo typically difficult contexts: the classification of non-binary individuals,\nfor which the label set becomes incomplete with respect to the dataset; and the\nclassification of transgender individuals, for which the dataset becomes\ninconsistent with respect to the label set. Using formal methods for reasoning\nabout the behavior of the classification system in presence of a changing\nworld, we propose to reconsider the fairness of the classification task in\nterms of completeness, consistency, timeliness and reliability, and offer some\ntheoretical results.\n","authors":["Camilla Quaresmini","Giuseppe Primiero"],"pdf_url":"https://arxiv.org/pdf/2305.06967v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.01083v3","updated":"2024-12-04T16:39:07Z","published":"2024-09-02T09:11:28Z","title":"Affordance-based Robot Manipulation with Flow Matching","summary":" We present a framework for assistive robot manipulation, which focuses on two\nfundamental challenges: first, efficiently adapting large-scale models to\ndownstream scene affordance understanding tasks, especially in daily living\nscenarios where gathering multi-task data involving humans requires strenuous\neffort; second, effectively learning robot trajectories by grounding the visual\naffordance model. We tackle the first challenge by employing a\nparameter-efficient prompt tuning method that prepends learnable text prompts\nto the frozen vision model to predict manipulation affordances in multi-task\nscenarios. Then we propose to learn robot trajectories guided by affordances in\na supervised Flow Matching method. Flow matching represents a robot visuomotor\npolicy as a conditional process of flowing random waypoints to desired robot\ntrajectories. 
Finally, we introduce a real-world dataset with 10 tasks across\nActivities of Daily Living to test our framework. Our extensive evaluation\nhighlights that the proposed prompt tuning method for learning manipulation\naffordance with language prompter achieves competitive performance and even\noutperforms other finetuning protocols across data scales, while satisfying\nparameter efficiency. Learning multi-task robot trajectories with flow matching\npolicy also leads to consistently better results than alternative behavior\ncloning methods, including marginally better generalization performance and\nprominently faster inference than diffusion policy with DDPM. Our framework\nseamlessly unifies affordance model learning and trajectory generation with\nflow matching for robot manipulation.\n","authors":["Fan Zhang","Michael Gienger"],"pdf_url":"https://arxiv.org/pdf/2409.01083v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03766v2","updated":"2024-12-04T16:39:04Z","published":"2024-11-06T08:59:44Z","title":"Number Cookbook: Number Understanding of Language Models and How to\n Improve It","summary":" Large language models (LLMs) can solve an increasing number of complex\nreasoning tasks while making surprising mistakes in basic numerical\nunderstanding and processing (such as 9.11 > 9.9). The latter ability is\nessential for tackling complex arithmetic and mathematical problems and serves\nas a foundation for most reasoning tasks, but previous work paid little\nattention to it or only discussed several restricted tasks (like integer\naddition). In this paper, we comprehensively investigate the numerical\nunderstanding and processing ability (NUPA) of LLMs. Firstly, we introduce a\nbenchmark covering four common numerical representations and 17 distinct\nnumerical tasks in four major categories, resulting in 41 meaningful\ncombinations in total. 
These tasks are derived from primary and secondary\neducation curricula, encompassing nearly all everyday numerical understanding\nand processing scenarios, and the rules of these tasks are very simple and\nclear. Through the benchmark, we find that current LLMs fail frequently in many\nof the tasks. To study the problem, we train small models with existing and\npotential techniques for enhancing NUPA (such as tokenizers, PEs, and number\nformats), comprehensively evaluating their effectiveness using our testbed. We\nalso finetune practical-scale LLMs on our proposed NUPA tasks and find that 1)\nnaive finetuning can improve NUPA a lot on many but not all tasks, and 2)\nsurprisingly, techniques designed to enhance NUPA prove ineffective for\nfinetuning pretrained models. We further explore the impact of chain-of-thought\ntechniques on NUPA. Our work provides a more detailed and comprehensive\nunderstanding of NUPA in LLMs. Our benchmark and code are released at\nhttps://github.com/GraphPKU/number_cookbook.\n","authors":["Haotong Yang","Yi Hu","Shijia Kang","Zhouchen Lin","Muhan Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.03766v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03446v1","updated":"2024-12-04T16:34:35Z","published":"2024-12-04T16:34:35Z","title":"From Words to Workflows: Automating Business Processes","summary":" As businesses increasingly rely on automation to streamline operations, the\nlimitations of Robotic Process Automation (RPA) have become apparent,\nparticularly its dependence on expert knowledge and inability to handle complex\ndecision-making tasks. Recent advancements in Artificial Intelligence (AI),\nparticularly Generative AI (GenAI) and Large Language Models (LLMs), have paved\nthe way for Intelligent Automation (IA), which integrates cognitive\ncapabilities to overcome the shortcomings of RPA. This paper introduces\nText2Workflow, a novel method that automatically generates workflows from\nnatural language user requests. 
Unlike traditional automation approaches,\nText2Workflow offers a generalized solution for automating any business\nprocess, translating user inputs into a sequence of executable steps\nrepresented in JavaScript Object Notation (JSON) format. Leveraging the\ndecision-making and instruction-following capabilities of LLMs, this method\nprovides a scalable, adaptable framework that enables users to visualize and\nexecute workflows with minimal manual intervention. This research outlines the\nText2Workflow methodology and its broader implications for automating complex\nbusiness processes.\n","authors":["Laura Minkova","Jessica López Espejel","Taki Eddine Toufik Djaidja","Walid Dahhane","El Hassane Ettifouri"],"pdf_url":"https://arxiv.org/pdf/2412.03446v1.pdf","comment":"Under review at Elsevier's Engineering Applications of Artificial\n Intelligence"},{"id":"http://arxiv.org/abs/2412.03441v1","updated":"2024-12-04T16:30:03Z","published":"2024-12-04T16:30:03Z","title":"PBP: Post-training Backdoor Purification for Malware Classifiers","summary":" In recent years, the rise of machine learning (ML) in cybersecurity has\nbrought new challenges, including the increasing threat of backdoor poisoning\nattacks on ML malware classifiers. For instance, adversaries could inject\nmalicious samples into public malware repositories, contaminating the training\ndata and potentially misclassifying malware by the ML model. Current\ncountermeasures predominantly focus on detecting poisoned samples by leveraging\ndisagreements within the outputs of a diverse set of ensemble models on\ntraining data points. However, these methods are not suitable for scenarios\nwhere Machine Learning-as-a-Service (MLaaS) is used or when users aim to remove\nbackdoors from a model after it has been trained. 
Addressing this scenario, we\nintroduce PBP, a post-training defense for malware classifiers that mitigates\nvarious types of backdoor embeddings without assuming any specific backdoor\nembedding mechanism. Our method exploits the influence of backdoor attacks on\nthe activation distribution of neural networks, independent of the\ntrigger-embedding method. In the presence of a backdoor attack, the activation\ndistribution of each layer is distorted into a mixture of distributions. By\nregulating the statistics of the batch normalization layers, we can guide a\nbackdoored model to perform similarly to a clean one. Our method demonstrates\nsubstantial advantages over several state-of-the-art methods, as evidenced by\nexperiments on two datasets, two types of backdoor methods, and various attack\nconfigurations. Notably, our approach requires only a small portion of the\ntraining data -- only 1\\% -- to purify the backdoor and reduce the attack\nsuccess rate from 100\\% to almost 0\\%, a 100-fold improvement over the baseline\nmethods. Our code is available at\n\\url{https://github.com/judydnguyen/pbp-backdoor-purification-official}.\n","authors":["Dung Thuy Nguyen","Ngoc N. Tran","Taylor T. Johnson","Kevin Leach"],"pdf_url":"https://arxiv.org/pdf/2412.03441v1.pdf","comment":"Accepted at NDSS 2025"},{"id":"http://arxiv.org/abs/2412.03434v1","updated":"2024-12-04T16:26:17Z","published":"2024-12-04T16:26:17Z","title":"BIMCaP: BIM-based AI-supported LiDAR-Camera Pose Refinement","summary":" This paper introduces BIMCaP, a novel method to integrate mobile 3D sparse\nLiDAR data and camera measurements with pre-existing building information\nmodels (BIMs), enhancing fast and accurate indoor mapping with affordable\nsensors. 
BIMCaP refines sensor poses by leveraging a 3D BIM and employing a\nbundle adjustment technique to align real-world measurements with the model.\nExperiments using real-world open-access data show that BIMCaP achieves\nsuperior accuracy, reducing translational error by over 4 cm compared to\ncurrent state-of-the-art methods. This advancement enhances the accuracy and\ncost-effectiveness of 3D mapping methodologies like SLAM. BIMCaP's improvements\nbenefit various fields, including construction site management and emergency\nresponse, by providing up-to-date, aligned digital maps for better\ndecision-making and productivity. Link to the repository:\nhttps://github.com/MigVega/BIMCaP\n","authors":["Miguel Arturo Vega Torres","Anna Ribic","Borja García de Soto","André Borrmann"],"pdf_url":"https://arxiv.org/pdf/2412.03434v1.pdf","comment":"10 pages, 24 figures, Conference: EG-ICE: 31st International Workshop\n on Intelligent Computing in Engineering"},{"id":"http://arxiv.org/abs/2412.03433v1","updated":"2024-12-04T16:24:41Z","published":"2024-12-04T16:24:41Z","title":"Genetic Algorithm Based System for Path Planning with Unmanned Aerial\n Vehicles Swarms in Cell-Grid Environments","summary":" Path Planning methods for autonomously controlling swarms of unmanned aerial\nvehicles (UAVs) are gaining momentum due to their operational advantages. An\nincreasing number of scenarios now require autonomous control of multiple UAVs,\nas autonomous operation can significantly reduce labor costs. Additionally,\nobtaining optimal flight paths can lower energy consumption, thereby extending\nbattery life for other critical operations. Many of these scenarios, however,\ninvolve obstacles such as power lines and trees, which complicate Path\nPlanning. This paper presents an evolutionary computation-based system\nemploying genetic algorithms to address this problem in environments with\nobstacles. 
The proposed approach aims to ensure complete coverage of areas with\nfixed obstacles, such as in field exploration tasks, while minimizing flight\ntime regardless of map size or the number of UAVs in the swarm. No specific\ngoal points or prior information beyond the provided map is required. The\nexperiments conducted in this study used five maps of varying sizes and\nobstacle densities, as well as a control map without obstacles, with different\nnumbers of UAVs. The results demonstrate that this method can determine optimal\npaths for all UAVs during full map traversal, thus minimizing resource\nconsumption. A comparative analysis with other state-of-the-art approach is\npresented to highlight the advantages and potential limitations of the proposed\nmethod.\n","authors":["Alejandro Puente-Castro","Enrique Fernandez-Blanco","Daniel Rivero"],"pdf_url":"https://arxiv.org/pdf/2412.03433v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03424v1","updated":"2024-12-04T16:14:02Z","published":"2024-12-04T16:14:02Z","title":"Tango*: Constrained synthesis planning using chemically informed value\n functions","summary":" Computer-aided synthesis planning (CASP) has made significant strides in\ngenerating retrosynthetic pathways for simple molecules in a non-constrained\nfashion. Recent work introduces a specialised bidirectional search algorithm\nwith forward and retro expansion to address the starting material-constrained\nsynthesis problem, allowing CASP systems to provide synthesis pathways from\nspecified starting materials, such as waste products or renewable feed-stocks.\nIn this work, we introduce a simple guided search which allows solving the\nstarting material-constrained synthesis planning problem using an existing,\nuni-directional search algorithm, Retro*. We show that by optimising a single\nhyperparameter, Tango* outperforms existing methods in terms of efficiency and\nsolve rate. 
We find the Tango* cost function catalyses strong improvements for\nthe bidirectional DESP methods. Our method also achieves lower wall clock times\nwhile proposing synthetic routes of similar length, a common metric for route\nquality. Finally, we highlight potential reasons for the strong performance of\nTango over neural guided search methods\n","authors":["Daniel Armstrong","Zlatko Joncev","Jeff Guo","Philippe Schwaller"],"pdf_url":"https://arxiv.org/pdf/2412.03424v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02205v2","updated":"2024-12-04T16:12:08Z","published":"2024-12-03T06:47:15Z","title":"DataLab: A Unified Platform for LLM-Powered Business Intelligence","summary":" Business intelligence (BI) transforms large volumes of data within modern\norganizations into actionable insights for informed decision-making. Recently,\nlarge language model (LLM)-based agents have streamlined the BI workflow by\nautomatically performing task planning, reasoning, and actions in executable\nenvironments based on natural language (NL) queries. However, existing\napproaches primarily focus on individual BI tasks such as NL2SQL and NL2VIS.\nThe fragmentation of tasks across different data roles and tools lead to\ninefficiencies and potential errors due to the iterative and collaborative\nnature of BI. In this paper, we introduce DataLab, a unified BI platform that\nintegrates a one-stop LLM-based agent framework with an augmented computational\nnotebook interface. DataLab supports a wide range of BI tasks for different\ndata roles by seamlessly combining LLM assistance with user customization\nwithin a single environment. To achieve this unification, we design a domain\nknowledge incorporation module tailored for enterprise-specific BI tasks, an\ninter-agent communication mechanism to facilitate information sharing across\nthe BI workflow, and a cell-based context management strategy to enhance\ncontext utilization efficiency in BI notebooks. 
Extensive experiments\ndemonstrate that DataLab achieves state-of-the-art performance on various BI\ntasks across popular research benchmarks. Moreover, DataLab maintains high\neffectiveness and efficiency on real-world datasets from Tencent, achieving up\nto a 58.58% increase in accuracy and a 61.65% reduction in token cost on\nenterprise-specific BI tasks.\n","authors":["Luoxuan Weng","Yinghao Tang","Yingchaojie Feng","Zhuo Chang","Peng Chen","Ruiqin Chen","Haozhe Feng","Chen Hou","Danqing Huang","Yang Li","Huaming Rao","Haonan Wang","Canshi Wei","Xiaofeng Yang","Yuhui Zhang","Yifeng Zheng","Xiuqi Huang","Minfeng Zhu","Yuxin Ma","Bin Cui","Wei Chen"],"pdf_url":"https://arxiv.org/pdf/2412.02205v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.17826v3","updated":"2024-12-04T16:03:04Z","published":"2024-02-27T19:00:01Z","title":"Prediction-Powered Ranking of Large Language Models","summary":" Large language models are often ranked according to their level of alignment\nwith human preferences -- a model is better than other models if its outputs\nare more frequently preferred by humans. One of the popular ways to elicit\nhuman preferences utilizes pairwise comparisons between the outputs provided by\ndifferent models to the same inputs. However, since gathering pairwise\ncomparisons by humans is costly and time-consuming, it has become a common\npractice to gather pairwise comparisons by a strong large language model -- a\nmodel strongly aligned with human preferences. Surprisingly, practitioners\ncannot currently measure the uncertainty that any mismatch between human and\nmodel preferences may introduce in the constructed rankings. In this work, we\ndevelop a statistical framework to bridge this gap. Given a (small) set of\npairwise comparisons by humans and a large set of pairwise comparisons by a\nmodel, our framework provides a rank-set -- a set of possible ranking positions\n-- for each of the models under comparison. 
Moreover, it guarantees that, with\na probability greater than or equal to a user-specified value, the rank-sets\ncover the true ranking consistent with the distribution of human pairwise\npreferences asymptotically. Using pairwise comparisons made by humans in the\nLMSYS Chatbot Arena platform and pairwise comparisons made by three strong\nlarge language models, we empirically demonstrate the effectivity of our\nframework and show that the rank-sets constructed using only pairwise\ncomparisons by the strong large language models are often inconsistent with\n(the distribution of) human pairwise preferences.\n","authors":["Ivi Chatzi","Eleni Straitouri","Suhas Thejaswi","Manuel Gomez Rodriguez"],"pdf_url":"https://arxiv.org/pdf/2402.17826v3.pdf","comment":"Published at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2412.03420v1","updated":"2024-12-04T16:00:14Z","published":"2024-12-04T16:00:14Z","title":"Automated Test-Case Generation for REST APIs Using Model Inference\n Search Heuristic","summary":" The rising popularity of the microservice architectural style has led to a\ngrowing demand for automated testing approaches tailored to these systems.\nEvoMaster is a state-of-the-art tool that uses Evolutionary Algorithms (EAs) to\nautomatically generate test cases for microservices' REST APIs. One limitation\nof these EAs is the use of unit-level search heuristics, such as branch\ndistances, which focus on fine-grained code coverage and may not effectively\ncapture the complex, interconnected behaviors characteristic of system-level\ntesting. To address this limitation, we propose a new search heuristic (MISH)\nthat uses real-time automaton learning to guide the test case generation\nprocess. We capture the sequential call patterns exhibited by a test case by\nlearning an automaton from the stream of log events outputted by different\nmicroservices within the same system. 
Therefore, MISH learns a representation\nof the systemwide behavior, allowing us to define the fitness of a test case\nbased on the path it traverses within the inferred automaton. We empirically\nevaluate MISH's effectiveness on six real-world benchmark microservice\napplications and compare it against a state-of-the-art technique, MOSA, for\ntesting REST APIs. Our evaluation shows promising results for using MISH to\nguide the automated test case generation within EvoMaster.\n","authors":["Clinton Cao","Annibale Panichella","Sicco Verwer"],"pdf_url":"https://arxiv.org/pdf/2412.03420v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2412.03417v1","updated":"2024-12-04T15:53:45Z","published":"2024-12-04T15:53:45Z","title":"Learning Semantic Association Rules from Internet of Things Data","summary":" Association Rule Mining (ARM) is the task of discovering commonalities in\ndata in the form of logical implications. ARM is used in the Internet of Things\n(IoT) for different tasks including monitoring and decision-making. However,\nexisting methods give limited consideration to IoT-specific requirements such\nas heterogeneity and volume. Furthermore, they do not utilize important static\ndomain-specific description data about IoT systems, which is increasingly\nrepresented as knowledge graphs. In this paper, we propose a novel ARM pipeline\nfor IoT data that utilizes both dynamic sensor data and static IoT system\nmetadata. Furthermore, we propose an Autoencoder-based Neurosymbolic ARM method\n(Aerial) as part of the pipeline to address the high volume of IoT data and\nreduce the total number of rules that are resource-intensive to process. Aerial\nlearns a neural representation of a given data and extracts association rules\nfrom this representation by exploiting the reconstruction (decoding) mechanism\nof an autoencoder. 
Extensive evaluations on 3 IoT datasets from 2 domains show\nthat ARM on both static and dynamic IoT data results in more generically\napplicable rules while Aerial can learn a more concise set of high-quality\nassociation rules than the state-of-the-art with full coverage over the\ndatasets.\n","authors":["Erkan Karabulut","Paul Groth","Victoria Degeler"],"pdf_url":"https://arxiv.org/pdf/2412.03417v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.09695v3","updated":"2024-12-04T15:35:48Z","published":"2024-10-13T02:10:26Z","title":"Can In-context Learning Really Generalize to Out-of-distribution Tasks?","summary":" In this work, we explore the mechanism of in-context learning (ICL) on\nout-of-distribution (OOD) tasks that were not encountered during training. To\nachieve this, we conduct synthetic experiments where the objective is to learn\nOOD mathematical functions through ICL using a GPT-2 model. We reveal that\nTransformers may struggle to learn OOD task functions through ICL.\nSpecifically, ICL performance resembles implementing a function within the\npretraining hypothesis space and optimizing it with gradient descent based on\nthe in-context examples. Additionally, we investigate ICL's well-documented\nability to learn unseen abstract labels in context. We demonstrate that such\nability only manifests in the scenarios without distributional shifts and,\ntherefore, may not serve as evidence of new-task-learning ability. Furthermore,\nwe assess ICL's performance on OOD tasks when the model is pretrained on\nmultiple tasks. Both empirical and theoretical analyses demonstrate the\nexistence of the \\textbf{low-test-error preference} of ICL, where it tends to\nimplement the pretraining function that yields low test error in the testing\ncontext. We validate this through numerical experiments. 
This new theoretical\nresult, combined with our empirical findings, elucidates the mechanism of ICL\nin addressing OOD tasks.\n","authors":["Qixun Wang","Yifei Wang","Yisen Wang","Xianghua Ying"],"pdf_url":"https://arxiv.org/pdf/2410.09695v3.pdf","comment":"Preprint, under review"},{"id":"http://arxiv.org/abs/2412.03401v1","updated":"2024-12-04T15:32:37Z","published":"2024-12-04T15:32:37Z","title":"Benchmarking Pretrained Attention-based Models for Real-Time Recognition\n in Robot-Assisted Esophagectomy","summary":" Esophageal cancer is among the most common types of cancer worldwide. It is\ntraditionally treated using open esophagectomy, but in recent years,\nrobot-assisted minimally invasive esophagectomy (RAMIE) has emerged as a\npromising alternative. However, robot-assisted surgery can be challenging for\nnovice surgeons, as they often suffer from a loss of spatial orientation.\nComputer-aided anatomy recognition holds promise for improving surgical\nnavigation, but research in this area remains limited. In this study, we\ndeveloped a comprehensive dataset for semantic segmentation in RAMIE, featuring\nthe largest collection of vital anatomical structures and surgical instruments\nto date. Handling this diverse set of classes presents challenges, including\nclass imbalance and the recognition of complex structures such as nerves. This\nstudy aims to understand the challenges and limitations of current\nstate-of-the-art algorithms on this novel dataset and problem. Therefore, we\nbenchmarked eight real-time deep learning models using two pretraining\ndatasets. We assessed both traditional and attention-based networks,\nhypothesizing that attention-based networks better capture global patterns and\naddress challenges such as occlusion caused by blood or other tissues. The\nbenchmark includes our RAMIE dataset and the publicly available CholecSeg8k\ndataset, enabling a thorough assessment of surgical segmentation tasks. 
Our\nfindings indicate that pretraining on ADE20k, a dataset for semantic\nsegmentation, is more effective than pretraining on ImageNet. Furthermore,\nattention-based models outperform traditional convolutional neural networks,\nwith SegNeXt and Mask2Former achieving higher Dice scores, and Mask2Former\nadditionally excelling in average symmetric surface distance.\n","authors":["Ronald L. P. D. de Jong","Yasmina al Khalil","Tim J. M. Jaspers","Romy C. van Jaarsveld","Gino M. Kuiper","Yiping Li","Richard van Hillegersberg","Jelle P. Ruurda","Marcel Breeuwer","Fons van der Sommen"],"pdf_url":"https://arxiv.org/pdf/2412.03401v1.pdf","comment":"Accepted for presentation at the SPIE Medical Imaging Conference,\n 2025"},{"id":"http://arxiv.org/abs/2412.03390v1","updated":"2024-12-04T15:19:01Z","published":"2024-12-04T15:19:01Z","title":"Enhancing Supply Chain Visibility with Generative AI: An Exploratory\n Case Study on Relationship Prediction in Knowledge Graphs","summary":" A key stumbling block in effective supply chain risk management for companies\nand policymakers is a lack of visibility on interdependent supply network\nrelationships. Relationship prediction, also called link prediction is an\nemergent area of supply chain surveillance research that aims to increase the\nvisibility of supply chains using data-driven techniques. Existing methods have\nbeen successful for predicting relationships but struggle to extract the\ncontext in which these relationships are embedded - such as the products being\nsupplied or locations they are supplied from. Lack of context prevents\npractitioners from distinguishing transactional relations from established\nsupply chain relations, hindering accurate estimations of risk. 
In this work,\nwe develop a new Generative Artificial Intelligence (Gen AI) enhanced machine\nlearning framework that leverages pre-trained language models as embedding\nmodels combined with machine learning models to predict supply chain\nrelationships within knowledge graphs. By integrating Generative AI techniques,\nour approach captures the nuanced semantic relationships between entities,\nthereby improving supply chain visibility and facilitating more precise risk\nmanagement. Using data from a real case study, we show that GenAI-enhanced link\nprediction surpasses all benchmarks, and demonstrate how GenAI models can be\nexplored and effectively used in supply chain risk management.\n","authors":["Ge Zheng","Alexandra Brintrup"],"pdf_url":"https://arxiv.org/pdf/2412.03390v1.pdf","comment":"18 pages, 5 figures"},{"id":"http://arxiv.org/abs/2412.03388v1","updated":"2024-12-04T15:17:25Z","published":"2024-12-04T15:17:25Z","title":"DiffStyleTTS: Diffusion-based Hierarchical Prosody Modeling for\n Text-to-Speech with Diverse and Controllable Styles","summary":" Human speech exhibits rich and flexible prosodic variations. To address the\none-to-many mapping problem from text to prosody in a reasonable and flexible\nmanner, we propose DiffStyleTTS, a multi-speaker acoustic model based on a\nconditional diffusion module and an improved classifier-free guidance, which\nhierarchically models speech prosodic features, and controls different prosodic\nstyles to guide prosody prediction. Experiments show that our method\noutperforms all baselines in naturalness and achieves superior synthesis speed\ncompared to three diffusion-based baselines. 
Additionally, by adjusting the\nguiding scale, DiffStyleTTS effectively controls the guidance intensity of the\nsynthetic prosody.\n","authors":["Jiaxuan Liu","Zhaoci Liu","Yajun Hu","Yingying Gao","Shilei Zhang","Zhenhua Ling"],"pdf_url":"https://arxiv.org/pdf/2412.03388v1.pdf","comment":"COLING 2025"},{"id":"http://arxiv.org/abs/2404.08093v2","updated":"2024-12-04T14:45:23Z","published":"2024-04-11T19:15:45Z","title":"Towards a Robust Soft Baby Robot With Rich Interaction Ability for\n Advanced Machine Learning Algorithms","summary":" Advanced machine learning algorithms require platforms that are extremely\nrobust and equipped with rich sensory feedback to handle extensive\ntrial-and-error learning without relying on strong inductive biases.\nTraditional robotic designs, while well-suited for their specific use cases,\nare often fragile when used with these algorithms. To address this gap -- and\ninspired by the vision of enabling curiosity-driven baby robots -- we present a\nnovel robotic limb designed from scratch. Our design has a hybrid soft-hard\nstructure, high redundancy with rich non-contact sensors (exclusively cameras),\nand easily replaceable failure points. Proof-of-concept experiments using two\ncontemporary reinforcement learning algorithms on a physical prototype\ndemonstrate that our design is able to succeed in a simple target-finding task\neven under simulated sensor failures, all with minimal human oversight during\nextended learning periods. We believe this design represents a concrete step\ntoward more tailored robotic designs for achieving general-purpose, generally\nintelligent robots.\n","authors":["Mohannad Alhakami","Dylan R. 
Ashley","Joel Dunham","Yanning Dai","Francesco Faccio","Eric Feron","Jürgen Schmidhuber"],"pdf_url":"https://arxiv.org/pdf/2404.08093v2.pdf","comment":"6 pages in main text + 2 pages of references, 8 figures in main text,\n 1 table in main text; source code available at\n https://github.com/dylanashley/robot-limb-testai"},{"id":"http://arxiv.org/abs/2412.03359v1","updated":"2024-12-04T14:45:09Z","published":"2024-12-04T14:45:09Z","title":"WiS Platform: Enhancing Evaluation of LLM-Based Multi-Agent Systems\n Through Game-Based Analysis","summary":" Recent advancements in autonomous multi-agent systems (MAS) based on large\nlanguage models (LLMs) have enhanced the application scenarios and improved the\ncapability of LLMs to handle complex tasks. Despite demonstrating\neffectiveness, existing studies still evidently struggle to evaluate, analysis,\nand reproducibility of LLM-based MAS. In this paper, to facilitate the research\non LLM-based MAS, we introduce an open, scalable, and real-time updated\nplatform for accessing and analyzing the LLM-based MAS based on the games Who\nis Spy?\" (WiS). Our platform is featured with three main worths: (1) a unified\nmodel evaluate interface that supports models available on Hugging Face; (2)\nreal-time updated leaderboard for model evaluation; (3) a comprehensive\nevaluation covering game-winning rates, attacking, defense strategies, and\nreasoning of LLMs. To rigorously test WiS, we conduct extensive experiments\ncoverage of various open- and closed-source LLMs, we find that different agents\nexhibit distinct and intriguing behaviors in the game. The experimental results\ndemonstrate the effectiveness and efficiency of our platform in evaluating\nLLM-based MAS. 
Our platform and its documentation are publicly available at\n\\url{https://whoisspy.ai/}\n","authors":["Chengwei Hu","Jianhui Zheng","Yancheng He","Hangyu Guo","Junguang Jiang","Han Zhu","Kai Sun","Yuning Jiang","Wenbo Su","Bo Zheng"],"pdf_url":"https://arxiv.org/pdf/2412.03359v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03352v1","updated":"2024-12-04T14:35:06Z","published":"2024-12-04T14:35:06Z","title":"Intuitive Axial Augmentation Using Polar-Sine-Based Piecewise Distortion\n for Medical Slice-Wise Segmentation","summary":" Most data-driven models for medical image analysis rely on universal\naugmentations to improve performance. Experimental evidence has confirmed their\neffectiveness, but the unclear mechanism underlying them poses a barrier to the\nwidespread acceptance and trust in such methods within the medical community.\nWe revisit and acknowledge the unique characteristics of medical images apart\nfrom traditional digital images, and consequently, proposed a medical-specific\naugmentation algorithm that is more elastic and aligns well with radiology scan\nprocedure. The method performs piecewise affine with sinusoidal distorted ray\naccording to radius on polar coordinates, thus simulating uncertain postures of\nhuman lying flat on the scanning table. Our method could generate human\nvisceral distribution without affecting the fundamental relative position on\naxial plane. Two non-adaptive algorithms, namely Meta-based Scan Table Removal\nand Similarity-Guided Parameter Search, are introduced to bolster robustness of\nour augmentation method. 
Experiments show our method improves accuracy across\nmultiple famous segmentation frameworks without requiring more data samples.\nOur preview code is available in: https://github.com/MGAMZ/PSBPD.\n","authors":["Yiqin Zhang","Qingkui Chen","Chen Huang","Zhengjie Zhang","Meiling Chen","Zhibing Fu"],"pdf_url":"https://arxiv.org/pdf/2412.03352v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.16995v2","updated":"2024-12-04T14:33:44Z","published":"2024-06-24T08:36:40Z","title":"tcrLM: a lightweight protein language model for predicting T cell\n receptor and epitope binding specificity","summary":" The anti-cancer immune response relies on the bindings between T-cell\nreceptors (TCRs) and antigens, which elicits adaptive immunity to eliminate\ntumor cells. This ability of the immune system to respond to novel various\nneoantigens arises from the immense diversity of TCR repository. However, TCR\ndiversity poses a significant challenge on accurately predicting antigen-TCR\nbindings. In this study, we introduce a lightweight masked language model,\ntermed tcrLM, to address this challenge. Our approach involves randomly masking\nsegments of TCR sequences and training tcrLM to infer the masked segments,\nthereby enabling the extraction of expressive features from TCR sequences. To\nfurther enhance robustness, we incorporate virtual adversarial training into\ntcrLM. We construct the largest TCR CDR3 sequence set with more than 100\nmillion distinct sequences, and pretrain tcrLM on these sequences. The\npre-trained encoder is subsequently applied to predict TCR-antigen binding\nspecificity. We evaluate model performance on three test datasets: independent,\nexternal, and COVID-19 test set. The results demonstrate that tcrLM not only\nsurpasses existing TCR-antigen binding prediction methods, but also outperforms\nother mainstream protein language models. 
More interestingly, tcrLM effectively\ncaptures the biochemical properties and positional preference of amino acids\nwithin TCR sequences. Additionally, the predicted TCR-neoantigen binding scores\nindicates the immunotherapy responses and clinical outcomes in a melanoma\ncohort. These findings demonstrate the potential of tcrLM in predicting\nTCR-antigen binding specificity, with significant implications for advancing\nimmunotherapy and personalized medicine.\n","authors":["Xing Fang","Chenpeng Yu","Shiye Tian","Hui Liu"],"pdf_url":"https://arxiv.org/pdf/2406.16995v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03347v1","updated":"2024-12-04T14:28:43Z","published":"2024-12-04T14:28:43Z","title":"DIVE: Taming DINO for Subject-Driven Video Editing","summary":" Building on the success of diffusion models in image generation and editing,\nvideo editing has recently gained substantial attention. However, maintaining\ntemporal consistency and motion alignment still remains challenging. To address\nthese issues, this paper proposes DINO-guided Video Editing (DIVE), a framework\ndesigned to facilitate subject-driven editing in source videos conditioned on\neither target text prompts or reference images with specific identities. The\ncore of DIVE lies in leveraging the powerful semantic features extracted from a\npretrained DINOv2 model as implicit correspondences to guide the editing\nprocess. Specifically, to ensure temporal motion consistency, DIVE employs DINO\nfeatures to align with the motion trajectory of the source video. Extensive\nexperiments on diverse real-world videos demonstrate that our framework can\nachieve high-quality editing results with robust motion consistency,\nhighlighting the potential of DINO to contribute to video editing. 
For precise\nsubject editing, DIVE incorporates the DINO features of reference images into a\npretrained text-to-image model to learn Low-Rank Adaptations (LoRAs),\neffectively registering the target subject's identity. Project page:\nhttps://dino-video-editing.github.io\n","authors":["Yi Huang","Wei Xiong","He Zhang","Chaoqi Chen","Jianzhuang Liu","Mingfu Yan","Shifeng Chen"],"pdf_url":"https://arxiv.org/pdf/2412.03347v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.03644v2","updated":"2024-12-04T14:27:06Z","published":"2024-05-06T17:07:28Z","title":"When LLMs Meet Cybersecurity: A Systematic Literature Review","summary":" The rapid development of large language models (LLMs) has opened new avenues\nacross various fields, including cybersecurity, which faces an evolving threat\nlandscape and demand for innovative technologies. Despite initial explorations\ninto the application of LLMs in cybersecurity, there is a lack of a\ncomprehensive overview of this research area. This paper addresses this gap by\nproviding a systematic literature review, covering the analysis of over 300\nworks, encompassing 25 LLMs and more than 10 downstream scenarios. Our\ncomprehensive overview addresses three key research questions: the construction\nof cybersecurity-oriented LLMs, the application of LLMs to various\ncybersecurity tasks, the challenges and further research in this area. This\nstudy aims to shed light on the extensive potential of LLMs in enhancing\ncybersecurity practices and serve as a valuable resource for applying LLMs in\nthis field. 
We also maintain and regularly update a list of practical guides on\nLLMs for cybersecurity at https://github.com/tmylla/Awesome-LLM4Cybersecurity.\n","authors":["Jie Zhang","Haoyu Bu","Hui Wen","Yongji Liu","Haiqiang Fei","Rongrong Xi","Lun Li","Yun Yang","Hongsong Zhu","Dan Meng"],"pdf_url":"https://arxiv.org/pdf/2405.03644v2.pdf","comment":"We have updated the related papers up to Aug 31st, with 50+ new\n papers added"},{"id":"http://arxiv.org/abs/2412.03343v1","updated":"2024-12-04T14:23:16Z","published":"2024-12-04T14:23:16Z","title":"Improving Linguistic Diversity of Large Language Models with Possibility\n Exploration Fine-Tuning","summary":" While Large Language Models (LLMs) have made significant strides in\nreplicating human-like abilities, there are concerns about a reduction in the\nlinguistic diversity of their outputs. This results in the homogenization of\nviewpoints and perspectives, as well as the underrepresentation of specific\ndemographic groups. Although several fine-tuning and prompting techniques have\nbeen suggested to tackle the issue, they are often tailored to specific tasks\nor come with a substantial increase in computational cost and latency. This\nmakes them challenging to apply to applications that demand very low latency,\nsuch as chatbots and virtual assistants. We propose Possibility Exploration\nFine-Tuning (PEFT), a task-agnostic framework that enhances the text diversity\nof LLMs without increasing latency or computational cost. Given the same\nprompt, models fine-tuned with PEFT can simultaneously generate multiple\ndiverse responses, each corresponding with a controllable possibility number.\nExperiments on dialogue and story generation tasks demonstrate that PEFT\nsignificantly enhances the diversity of LLM outputs, as evidenced by lower\nsimilarity between candidate responses. Since PEFT emphasizes semantic\ndiversity over lexical diversity, it can also notably reduce demographic bias\nin dialogue systems. 
The implementations and datasets are available in our\nrepository: https://github.com/mailong25/peft_diversity\n","authors":["Long Mai","Julie Carson-Berndsen"],"pdf_url":"https://arxiv.org/pdf/2412.03343v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.01951v2","updated":"2024-12-04T14:20:21Z","published":"2024-12-02T20:24:17Z","title":"Self-Improvement in Language Models: The Sharpening Mechanism","summary":" Recent work in language modeling has raised the possibility of\nself-improvement, where a language models evaluates and refines its own\ngenerations to achieve higher performance without external feedback. It is\nimpossible for this self-improvement to create information that is not already\nin the model, so why should we expect that this will lead to improved\ncapabilities? We offer a new perspective on the capabilities of\nself-improvement through a lens we refer to as sharpening. Motivated by the\nobservation that language models are often better at verifying response quality\nthan they are at generating correct responses, we formalize self-improvement as\nusing the model itself as a verifier during post-training in order to\n``sharpen'' the model to one placing large mass on high-quality sequences,\nthereby amortizing the expensive inference-time computation of generating good\nsequences. We begin by introducing a new statistical framework for sharpening\nin which the learner aims to sharpen a pre-trained base policy via sample\naccess, and establish fundamental limits. Then we analyze two natural families\nof self-improvement algorithms based on SFT and RLHF. We find that (i) the\nSFT-based approach is minimax optimal whenever the initial model has sufficient\ncoverage, but (ii) the RLHF-based approach can improve over SFT-based\nself-improvement by leveraging online exploration, bypassing the need for\ncoverage. Finally, we empirically validate the sharpening mechanism via\ninference-time and amortization experiments. 
We view these findings as a\nstarting point toward a foundational understanding that can guide the design\nand evaluation of self-improvement algorithms.\n","authors":["Audrey Huang","Adam Block","Dylan J. Foster","Dhruv Rohatgi","Cyril Zhang","Max Simchowitz","Jordan T. Ash","Akshay Krishnamurthy"],"pdf_url":"https://arxiv.org/pdf/2412.01951v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03338v1","updated":"2024-12-04T14:13:38Z","published":"2024-12-04T14:13:38Z","title":"AI-Driven Day-to-Day Route Choice","summary":" Understanding travelers' route choices can help policymakers devise optimal\noperational and planning strategies for both normal and abnormal circumstances.\nHowever, existing choice modeling methods often rely on predefined assumptions\nand struggle to capture the dynamic and adaptive nature of travel behavior.\nRecently, Large Language Models (LLMs) have emerged as a promising alternative,\ndemonstrating remarkable ability to replicate human-like behaviors across\nvarious fields. Despite this potential, their capacity to accurately simulate\nhuman route choice behavior in transportation contexts remains doubtful. To\nsatisfy this curiosity, this paper investigates the potential of LLMs for route\nchoice modeling by introducing an LLM-empowered agent, \"LLMTraveler.\" This\nagent integrates an LLM as its core, equipped with a memory system that learns\nfrom past experiences and makes decisions by balancing retrieved data and\npersonality traits. 
The study systematically evaluates the LLMTraveler's\nability to replicate human-like decision-making through two stages: (1)\nanalyzing its route-switching behavior in single origin-destination (OD) pair\ncongestion game scenarios, where it demonstrates patterns align with laboratory\ndata but are not fully explained by traditional models, and (2) testing its\ncapacity to model day-to-day (DTD) adaptive learning behaviors on the Ortuzar\nand Willumsen (OW) network, producing results comparable to Multinomial Logit\n(MNL) and Reinforcement Learning (RL) models. These experiments demonstrate\nthat the framework can partially replicate human-like decision-making in route\nchoice while providing natural language explanations for its decisions. This\ncapability offers valuable insights for transportation policymaking, such as\nsimulating traveler responses to new policies or changes in the network.\n","authors":["Leizhen Wang","Peibo Duan","Zhengbing He","Cheng Lyu","Xin Chen","Nan Zheng","Li Yao","Zhenliang Ma"],"pdf_url":"https://arxiv.org/pdf/2412.03338v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03331v1","updated":"2024-12-04T14:02:12Z","published":"2024-12-04T14:02:12Z","title":"LuxEmbedder: A Cross-Lingual Approach to Enhanced Luxembourgish Sentence\n Embeddings","summary":" Sentence embedding models play a key role in various Natural Language\nProcessing tasks, such as in Topic Modeling, Document Clustering and\nRecommendation Systems. However, these models rely heavily on parallel data,\nwhich can be scarce for many low-resource languages, including Luxembourgish.\nThis scarcity results in suboptimal performance of monolingual and\ncross-lingual sentence embedding models for these languages. To address this\nissue, we compile a relatively small but high-quality human-generated\ncross-lingual parallel dataset to train \\tool, an enhanced sentence embedding\nmodel for Luxembourgish with strong cross-lingual capabilities. 
Additionally,\nwe present evidence suggesting that including low-resource languages in\nparallel training datasets can be more advantageous for other low-resource\nlanguages than relying solely on high-resource language pairs. Furthermore,\nrecognizing the lack of sentence embedding benchmarks for low-resource\nlanguages, we create a paraphrase detection benchmark specifically for\nLuxembourgish, aiming to partially fill this gap and promote further research.\n","authors":["Fred Philippy","Siwen Guo","Jacques Klein","Tegawendé F. Bissyandé"],"pdf_url":"https://arxiv.org/pdf/2412.03331v1.pdf","comment":"Accepted at COLING 2025"},{"id":"http://arxiv.org/abs/2412.03312v1","updated":"2024-12-04T13:44:56Z","published":"2024-12-04T13:44:56Z","title":"Path-Guided Particle-based Sampling","summary":" Particle-based Bayesian inference methods by sampling from a partition-free\ntarget (posterior) distribution, e.g., Stein variational gradient descent\n(SVGD), have attracted significant attention. We propose a path-guided\nparticle-based sampling~(PGPS) method based on a novel Log-weighted Shrinkage\n(LwS) density path linking an initial distribution to the target distribution.\nWe propose to utilize a Neural network to learn a vector field motivated by the\nFokker-Planck equation of the designed density path. Particles, initiated from\nthe initial distribution, evolve according to the ordinary differential\nequation defined by the vector field. The distribution of these particles is\nguided along a density path from the initial distribution to the target\ndistribution. The proposed LwS density path allows for an efficient search of\nmodes of the target distribution while canonical methods fail. We theoretically\nanalyze the Wasserstein distance of the distribution of the PGPS-generated\nsamples and the target distribution due to approximation and discretization\nerrors. 
Practically, the proposed PGPS-LwS method demonstrates higher Bayesian\ninference accuracy and better calibration ability in experiments conducted on\nboth synthetic and real-world Bayesian learning tasks, compared to baselines,\nsuch as SVGD and Langevin dynamics, etc.\n","authors":["Mingzhou Fan","Ruida Zhou","Chao Tian","Xiaoning Qian"],"pdf_url":"https://arxiv.org/pdf/2412.03312v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.04203v3","updated":"2024-12-04T13:43:10Z","published":"2023-04-09T10:08:38Z","title":"OpenDriver: An Open-Road Driver State Detection Dataset","summary":" Among numerous studies for driver state detection, wearable physiological\nmeasurements offer a practical method for real-time monitoring. However, there\nare few driver physiological datasets in open-road scenarios, and the existing\ndatasets suffer from issues such as poor signal quality, small sample sizes,\nand short data collection periods. Therefore, in this paper, a large-scale\nmultimodal driving dataset, OpenDriver, for driver state detection is\ndeveloped. The OpenDriver encompasses a total of 3,278 driving trips, with a\nsignal collection duration spanning approximately 4,600 hours. Two modalities\nof driving signals are enrolled in OpenDriver: electrocardiogram (ECG) signals\nand six-axis motion data of the steering wheel from a motion measurement unit\n(IMU), which were recorded from 81 drivers and their vehicles. Furthermore,\nthree challenging tasks are involved in our work, namely ECG signal quality\nassessment, individual biometric identification based on ECG signals, and\nphysiological signal analysis in complex driving environments. To facilitate\nresearch in these tasks, corresponding benchmarks have also been introduced.\nFirst, a noisy augmentation strategy is applied to generate a larger-scale ECG\nsignal dataset with realistic noise simulation for quality assessment. 
Second,\nan end-to-end contrastive learning framework is employed for individual\nbiometric identification. Finally, a comprehensive analysis of drivers' HRV\nfeatures under different driving conditions is conducted. Each benchmark\nprovides evaluation metrics and reference results. The OpenDriver dataset will\nbe publicly available at https://github.com/bdne/OpenDriver.\n","authors":["Delong Liu","Shichao Li","Tianyi Shi","Zhu Meng","Guanyu Chen","Yadong Huang","Jin Dong","Zhicheng Zhao"],"pdf_url":"https://arxiv.org/pdf/2304.04203v3.pdf","comment":"Considering that there are flaws in the statistical data of the\n dataset, all the authors agreed to withdraw the manuscript"},{"id":"http://arxiv.org/abs/2412.03307v1","updated":"2024-12-04T13:29:52Z","published":"2024-12-04T13:29:52Z","title":"Contextual Data Integration for Bike-sharing Demand Prediction with\n Graph Neural Networks in Degraded Weather Conditions","summary":" Demand for bike sharing is impacted by various factors, such as weather\nconditions, events, and the availability of other transportation modes. This\nimpact remains elusive due to the complex interdependence of these factors or\nlocationrelated user behavior variations. It is also not clear which factor is\nadditional information which are not already contained in the historical\ndemand. Intermodal dependencies between bike-sharing and other modes are also\nunderexplored, and the value of this information has not been studied in\ndegraded situations. The proposed study analyzes the impact of adding\ncontextual data, such as weather, time embedding, and road traffic flow, to\npredict bike-sharing Origin-Destination (OD) flows in atypical weather\nsituations Our study highlights a mild relationship between prediction quality\nof bike-sharing demand and road traffic flow, while the introduced time\nembedding allows outperforming state-of-the-art results, particularly in the\ncase of degraded weather conditions. 
Including weather data as an additional\ninput further improves our model with respect to the basic ST-ED-RMGC\nprediction model by reducing of more than 20% the prediction error in degraded\nweather condition.\n","authors":["Romain Rochas","Angelo Furno","Nour-Eddin El Faouzi"],"pdf_url":"https://arxiv.org/pdf/2412.03307v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03287v1","updated":"2024-12-04T12:58:55Z","published":"2024-12-04T12:58:55Z","title":"Integrating Generative AI into Art Therapy: A Technical Showcase","summary":" This paper explores the integration of generative AI into the field of art\ntherapy. Leveraging proven text-to-image models, we introduce a novel technical\ndesign to complement art therapy. The resulting AI-based tools shall enable\npatients to refine and customize their creative work, opening up new avenues of\nexpression and accessibility. Using three illustrative examples, we demonstrate\npotential outputs of our solution and evaluate them qualitatively. Furthermore,\nwe discuss the current limitations and ethical considerations associated with\nthis integration and provide an outlook into future research efforts. Our\nimplementations are publicly available at https://github.com/BFH-AMI/sds24.\n","authors":["Yannis Valentin Schmutz","Tetiana Kravchenko","Souhir Ben Souissi","Mascha Kurpicz-Briki"],"pdf_url":"https://arxiv.org/pdf/2412.03287v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03283v1","updated":"2024-12-04T12:57:17Z","published":"2024-12-04T12:57:17Z","title":"Black-Box Forgery Attacks on Semantic Watermarks for Diffusion Models","summary":" Integrating watermarking into the generation process of latent diffusion\nmodels (LDMs) simplifies detection and attribution of generated content.\nSemantic watermarks, such as Tree-Rings and Gaussian Shading, represent a novel\nclass of watermarking techniques that are easy to implement and highly robust\nagainst various perturbations. 
However, our work demonstrates a fundamental\nsecurity vulnerability of semantic watermarks. We show that attackers can\nleverage unrelated models, even with different latent spaces and architectures\n(UNet vs DiT), to perform powerful and realistic forgery attacks. Specifically,\nwe design two watermark forgery attacks. The first imprints a targeted\nwatermark into real images by manipulating the latent representation of an\narbitrary image in an unrelated LDM to get closer to the latent representation\nof a watermarked image. We also show that this technique can be used for\nwatermark removal. The second attack generates new images with the target\nwatermark by inverting a watermarked image and re-generating it with an\narbitrary prompt. Both attacks just need a single reference image with the\ntarget watermark. Overall, our findings question the applicability of semantic\nwatermarks by revealing that attackers can easily forge or remove these\nwatermarks under realistic conditions.\n","authors":["Andreas Müller","Denis Lukovnikov","Jonas Thietke","Asja Fischer","Erwin Quiring"],"pdf_url":"https://arxiv.org/pdf/2412.03283v1.pdf","comment":"23 pages, 21 figures, 6 tables"},{"id":"http://arxiv.org/abs/2406.08534v2","updated":"2024-12-04T12:53:37Z","published":"2024-06-12T16:47:45Z","title":"Optimizing Container Loading and Unloading through Dual-Cycling and\n Dockyard Rehandle Reduction Using a Hybrid Genetic Algorithm","summary":" This paper addresses the optimization of container unloading and loading\noperations at ports, integrating quay-crane dual-cycling with dockyard rehandle\nminimization. We present a unified model encompassing both operations: ship\ncontainer unloading and loading by quay crane, and the other is reducing\ndockyard rehandles while loading the ship. 
We recognize that optimizing one\naspect in isolation can lead to suboptimal outcomes due to interdependencies.\nSpecifically, optimizing unloading sequences for minimal operation time may\ninadvertently increase dockyard rehandles during loading and vice versa. To\naddress this NP-hard problem, we propose a hybrid genetic algorithm (GA)\nQCDC-DR-GA comprising one-dimensional and two-dimensional GA components. Our\nmodel, QCDC-DR-GA, consistently outperforms four state-of-the-art methods in\nmaximizing dual cycles and minimizing dockyard rehandles. Compared to those\nmethods, it reduced 15-20% of total operation time for large vessels.\nStatistical validation through a two-tailed paired t-test confirms the\nsuperiority of QCDC-DR-GA at a 5% significance level. The approach effectively\ncombines QCDC optimization with dockyard rehandle minimization, optimizing the\ntotal unloading-loading time. Results underscore the inefficiency of separately\noptimizing QCDC and dockyard rehandles. Fragmented approaches, such as QCDC\nScheduling Optimized by bi-level GA and GA-ILSRS (Scenario 2), show limited\nimprovement compared to QCDC-DR-GA. As in GA-ILSRS (Scenario 1), neglecting\ndual-cycle optimization leads to inferior performance than QCDC-DR-GA. This\nemphasizes the necessity of simultaneously considering both aspects for optimal\nresource utilization and overall operational efficiency.\n","authors":["Md. Mahfuzur Rahman","Md Abrar Jahin","Md. Saiful Islam","M. F. 
Mridha"],"pdf_url":"https://arxiv.org/pdf/2406.08534v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18555v3","updated":"2024-12-04T12:42:04Z","published":"2024-07-26T07:08:05Z","title":"How to Segment in 3D Using 2D Models: Automated 3D Segmentation of\n Prostate Cancer Metastatic Lesions on PET Volumes Using Multi-angle Maximum\n Intensity Projections and Diffusion Models","summary":" Prostate specific membrane antigen (PSMA) positron emission\ntomography/computed tomography (PET/CT) imaging provides a tremendously\nexciting frontier in visualization of prostate cancer (PCa) metastatic lesions.\nHowever, accurate segmentation of metastatic lesions is challenging due to low\nsignal-to-noise ratios and variable sizes, shapes, and locations of the\nlesions. This study proposes a novel approach for automated segmentation of\nmetastatic lesions in PSMA PET/CT 3D volumetric images using 2D denoising\ndiffusion probabilistic models (DDPMs). Instead of 2D trans-axial slices or 3D\nvolumes, the proposed approach segments the lesions on generated multi-angle\nmaximum intensity projections (MA-MIPs) of the PSMA PET images, then obtains\nthe final 3D segmentation masks from 3D ordered subset expectation maximization\n(OSEM) reconstruction of 2D MA-MIPs segmentations. Our proposed method achieved\nsuperior performance compared to state-of-the-art 3D segmentation approaches in\nterms of accuracy and robustness in detecting and segmenting small metastatic\nPCa lesions. 
The proposed method has significant potential as a tool for\nquantitative analysis of metastatic burden in PCa patients.\n","authors":["Amirhosein Toosi","Sara Harsini","François Bénard","Carlos Uribe","Arman Rahmim"],"pdf_url":"https://arxiv.org/pdf/2407.18555v3.pdf","comment":"11 pages, 2 figures, accepted in the DGM4MICCAI workshop, MICCAI,\n 2024"},{"id":"http://arxiv.org/abs/2405.01756v3","updated":"2024-12-04T12:37:40Z","published":"2024-05-02T21:46:13Z","title":"Segmentation-Free Outcome Prediction from Head and Neck Cancer PET/CT\n Images: Deep Learning-Based Feature Extraction from Multi-Angle Maximum\n Intensity Projections (MA-MIPs)","summary":" We introduce an innovative, simple, effective segmentation-free approach for\noutcome prediction in head \\& neck cancer (HNC) patients. By harnessing deep\nlearning-based feature extraction techniques and multi-angle maximum intensity\nprojections (MA-MIPs) applied to Fluorodeoxyglucose Positron Emission\nTomography (FDG-PET) volumes, our proposed method eliminates the need for\nmanual segmentations of regions-of-interest (ROIs) such as primary tumors and\ninvolved lymph nodes. Instead, a state-of-the-art object detection model is\ntrained to perform automatic cropping of the head and neck region on the PET\nvolumes. A pre-trained deep convolutional neural network backbone is then\nutilized to extract deep features from MA-MIPs obtained from 72 multi-angel\naxial rotations of the cropped PET volumes. These deep features extracted from\nmultiple projection views of the PET volumes are then aggregated and fused, and\nemployed to perform recurrence-free survival analysis on a cohort of 489 HNC\npatients. The proposed approach outperforms the best performing method on the\ntarget dataset for the task of recurrence-free survival analysis. 
By\ncircumventing the manual delineation of the malignancies on the FDG PET-CT\nimages, our approach eliminates the dependency on subjective interpretations\nand highly enhances the reproducibility of the proposed survival analysis\nmethod.\n","authors":["Amirhosein Toosi","Isaac Shiri","Habib Zaidi","Arman Rahmim"],"pdf_url":"https://arxiv.org/pdf/2405.01756v3.pdf","comment":"15 pages, 4 tables, 4 figures. Published in Cancers 2024, Volume 16,\n Issue 14, page 2538"},{"id":"http://arxiv.org/abs/2412.03270v1","updated":"2024-12-04T12:25:41Z","published":"2024-12-04T12:25:41Z","title":"Intent-driven In-context Learning for Few-shot Dialogue State Tracking","summary":" Dialogue state tracking (DST) plays an essential role in task-oriented\ndialogue systems. However, user's input may contain implicit information,\nposing significant challenges for DST tasks. Additionally, DST data includes\ncomplex information, which not only contains a large amount of noise unrelated\nto the current turn, but also makes constructing DST datasets expensive. To\naddress these challenges, we introduce Intent-driven In-context Learning for\nFew-shot DST (IDIC-DST). By extracting user's intent, we propose an\nIntent-driven Dialogue Information Augmentation module to augment the dialogue\ninformation, which can track dialogue states more effectively. Moreover, we\nmask noisy information from DST data and rewrite user's input in the\nIntent-driven Examples Retrieval module, where we retrieve similar examples. We\nthen utilize a pre-trained large language model to update the dialogue state\nusing the augmented dialogue information and examples. 
Experimental results\ndemonstrate that IDIC-DST achieves state-of-the-art performance in few-shot\nsettings on MultiWOZ 2.1 and MultiWOZ 2.4 datasets.\n","authors":["Zihao Yi","Zhe Xu","Ying Shen"],"pdf_url":"https://arxiv.org/pdf/2412.03270v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.01946v2","updated":"2024-12-04T12:19:35Z","published":"2024-12-02T20:14:46Z","title":"The Reality of AI and Biorisk","summary":" To accurately and confidently answer the question 'could an AI model or\nsystem increase biorisk', it is necessary to have both a sound theoretical\nthreat model for how AI models or systems could increase biorisk and a robust\nmethod for testing that threat model. This paper provides an analysis of\nexisting available research surrounding two AI and biorisk threat models: 1)\naccess to information and planning via large language models (LLMs), and 2) the\nuse of AI-enabled biological tools (BTs) in synthesizing novel biological\nartifacts. We find that existing studies around AI-related biorisk are nascent,\noften speculative in nature, or limited in terms of their methodological\nmaturity and transparency. 
The available literature suggests that current LLMs\nand BTs do not pose an immediate risk, and more work is needed to develop\nrigorous approaches to understanding how future models could increase biorisks.\nWe end with recommendations about how empirical work can be expanded to more\nprecisely target biorisk and ensure rigor and validity of findings.\n","authors":["Aidan Peppin","Anka Reuel","Stephen Casper","Elliot Jones","Andrew Strait","Usman Anwar","Anurag Agrawal","Sayash Kapoor","Sanmi Koyejo","Marie Pellat","Rishi Bommasani","Nick Frosst","Sara Hooker"],"pdf_url":"https://arxiv.org/pdf/2412.01946v2.pdf","comment":"Updated to correct author affiliations"},{"id":"http://arxiv.org/abs/2412.03267v1","updated":"2024-12-04T12:18:21Z","published":"2024-12-04T12:18:21Z","title":"Detecting abnormal heart sound using mobile phones and on-device IConNet","summary":" Given the global prevalence of cardiovascular diseases, there is a pressing\nneed for easily accessible early screening methods. Typically, this requires\nmedical practitioners to investigate heart auscultations for irregular sounds,\nfollowed by echocardiography and electrocardiography tests. To democratize\nearly diagnosis, we present a user-friendly solution for abnormal heart sound\ndetection, utilizing mobile phones and a lightweight neural network optimized\nfor on-device inference. Unlike previous approaches reliant on specialized\nstethoscopes, our method directly analyzes audio recordings, facilitated by a\nnovel architecture known as IConNet. IConNet, an Interpretable Convolutional\nNeural Network, harnesses insights from audio signal processing, enhancing\nefficiency and providing transparency in neural pattern extraction from raw\nwaveform signals. 
This is a significant step towards trustworthy AI in\nhealthcare, aiding in remote health monitoring efforts.\n","authors":["Linh Vu","Thu Tran"],"pdf_url":"https://arxiv.org/pdf/2412.03267v1.pdf","comment":"N2Women'24 Workshop, MobiSys 2024, Tokyo, Japan"},{"id":"http://arxiv.org/abs/2410.07980v3","updated":"2024-12-04T12:14:56Z","published":"2024-10-10T14:36:24Z","title":"D-Wave's Nonlinear-Program Hybrid Solver: Description and Performance\n Analysis","summary":" The development of advanced quantum-classical algorithms is among the most\nprominent strategies in quantum computing. Numerous hybrid solvers have been\nintroduced recently. Many of these methods are created ad hoc to address\nspecific use cases. However, several well-established schemes are frequently\nutilized to address optimization problems. In this context, D-Wave launched the\nHybrid Solver Service in 2020, offering a portfolio of methods designed to\naccelerate time-to-solution for users aiming to optimize performance and\noperational processes. Recently, a new technique has been added to this\nportfolio: the Nonlinear-Program Hybrid Solver. This paper describes this\nsolver and evaluates its performance through a benchmark of 45 instances across\nthree combinatorial optimization problems: the Traveling Salesman Problem, the\nKnapsack Problem, and the Maximum Cut Problem. 
To facilitate the use of this\nrelatively unexplored solver, we provide details of the implementation used to\nsolve these three optimization problems.\n","authors":["Eneko Osaba","Pablo Miranda-Rodriguez"],"pdf_url":"https://arxiv.org/pdf/2410.07980v3.pdf","comment":"13 pages, 9 figures and 7 tables"},{"id":"http://arxiv.org/abs/2412.01322v2","updated":"2024-12-04T11:53:32Z","published":"2024-12-02T09:40:03Z","title":"Explainable fault and severity classification for rolling element\n bearings using Kolmogorov-Arnold networks","summary":" Rolling element bearings are critical components of rotating machinery, with\ntheir performance directly influencing the efficiency and reliability of\nindustrial systems. At the same time, bearing faults are a leading cause of\nmachinery failures, often resulting in costly downtime, reduced productivity,\nand, in extreme cases, catastrophic damage. This study presents a methodology\nthat utilizes Kolmogorov-Arnold Networks to address these challenges through\nautomatic feature selection, hyperparameter tuning and interpretable fault\nanalysis within a unified framework. By training shallow network architectures\nand minimizing the number of selected features, the framework produces\nlightweight models that deliver explainable results through feature attribution\nand symbolic representations of their activation functions. Validated on two\nwidely recognized datasets for bearing fault diagnosis, the framework achieved\nperfect F1-Scores for fault detection and high performance in fault and\nseverity classification tasks, including 100% F1-Scores in most cases. Notably,\nit demonstrated adaptability by handling diverse fault types, such as imbalance\nand misalignment, within the same dataset. The symbolic representations\nenhanced model interpretability, while feature attribution offered insights\ninto the optimal feature types or signals for each studied task. 
These results\nhighlight the framework's potential for practical applications, such as\nreal-time machinery monitoring, and for scientific research requiring efficient\nand explainable models.\n","authors":["Spyros Rigas","Michalis Papachristou","Ioannis Sotiropoulos","Georgios Alexandridis"],"pdf_url":"https://arxiv.org/pdf/2412.01322v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03248v1","updated":"2024-12-04T11:47:57Z","published":"2024-12-04T11:47:57Z","title":"AIM: Adaptive Inference of Multi-Modal LLMs via Token Merging and\n Pruning","summary":" Large language models (LLMs) have enabled the creation of multi-modal LLMs\nthat exhibit strong comprehension of visual data such as images and videos.\nHowever, these models usually rely on extensive visual tokens from visual\nencoders, leading to high computational demands, which limits their\napplicability in resource-constrained environments and for long-context tasks.\nIn this work, we propose a training-free adaptive inference method for\nmulti-modal LLMs that can accommodate a broad range of efficiency requirements\nwith a minimum performance drop. Our method consists of a) iterative token\nmerging based on embedding similarity before LLMs, and b) progressive token\npruning within LLM layers based on multi-modal importance. With a minimalist\ndesign, our method can be applied to both video and image LLMs. Extensive\nexperiments on diverse video and image benchmarks demonstrate that, our method\nsubstantially reduces computation load (e.g., a $\\textbf{7-fold}$ reduction in\nFLOPs) while preserving the performance of video and image LLMs. Further, under\na similar computational cost, our method outperforms the state-of-the-art\nmethods in long video understanding (e.g., $\\textbf{+4.6}$ on MLVU).\nAdditionally, our in-depth analysis provides insights into token redundancy and\nLLM layer behaviors, offering guidance for future research in designing\nefficient multi-modal LLMs. 
Our code will be available at\nhttps://github.com/LaVi-Lab/AIM.\n","authors":["Yiwu Zhong","Zhuoming Liu","Yin Li","Liwei Wang"],"pdf_url":"https://arxiv.org/pdf/2412.03248v1.pdf","comment":"12 pages, 2 figures"},{"id":"http://arxiv.org/abs/2412.02626v2","updated":"2024-12-04T11:45:34Z","published":"2024-12-03T17:54:12Z","title":"Time-Reversal Provides Unsupervised Feedback to LLMs","summary":" Large Language Models (LLMs) are typically trained to predict in the forward\ndirection of time. However, recent works have shown that prompting these models\nto look back and critique their own generations can produce useful feedback.\nMotivated by this, we explore the question of whether LLMs can be empowered to\nthink (predict and score) backwards to provide unsupervised feedback that\ncomplements forward LLMs. Towards this, we introduce Time Reversed Language\nModels (TRLMs), which can score and generate queries when conditioned on\nresponses, effectively functioning in the reverse direction of time. Further,\nto effectively infer in the response to query direction, we pre-train and\nfine-tune a language model (TRLM-Ba) in the reverse token order from scratch.\nWe show empirically (and theoretically in a stylized setting) that\ntime-reversed models can indeed complement forward model predictions when used\nto score the query given response for re-ranking multiple forward generations.\nWe obtain up to 5\\% improvement on the widely used AlpacaEval Leaderboard over\nthe competent baseline of best-of-N re-ranking using self log-perplexity\nscores. We further show that TRLM scoring outperforms conventional forward\nscoring of response given query, resulting in significant gains in applications\nsuch as citation generation and passage retrieval. 
We next leverage the\ngenerative ability of TRLM to augment or provide unsupervised feedback to input\nsafety filters of LLMs, demonstrating a drastic reduction in false negative\nrate with negligible impact on false positive rates against several attacks\npublished on the popular JailbreakBench leaderboard.\n","authors":["Yerram Varun","Rahul Madhavan","Sravanti Addepalli","Arun Suggala","Karthikeyan Shanmugam","Prateek Jain"],"pdf_url":"https://arxiv.org/pdf/2412.02626v2.pdf","comment":"Accepted as a spotlight in NeurIPS 2024"},{"id":"http://arxiv.org/abs/2412.03235v1","updated":"2024-12-04T11:36:37Z","published":"2024-12-04T11:36:37Z","title":"Does Safety Training of LLMs Generalize to Semantically Related Natural\n Prompts?","summary":" Large Language Models (LLMs) are known to be susceptible to crafted\nadversarial attacks or jailbreaks that lead to the generation of objectionable\ncontent despite being aligned to human preferences using safety fine-tuning\nmethods. While the large dimensionality of input token space makes it\ninevitable to find adversarial prompts that can jailbreak these models, we aim\nto evaluate whether safety fine-tuned LLMs are safe against natural prompts\nwhich are semantically related to toxic seed prompts that elicit safe responses\nafter alignment. We surprisingly find that popular aligned LLMs such as GPT-4\ncan be compromised using naive prompts that are NOT even crafted with an\nobjective of jailbreaking the model. Furthermore, we empirically show that\ngiven a seed prompt that elicits a toxic response from an unaligned model, one\ncan systematically generate several semantically related natural prompts that\ncan jailbreak aligned LLMs. 
Towards this, we propose a method of Response\nGuided Question Augmentation (ReG-QA) to evaluate the generalization of safety\naligned LLMs to natural prompts, that first generates several toxic answers\ngiven a seed question using an unaligned LLM (Q to A), and further leverages an\nLLM to generate questions that are likely to produce these answers (A to Q). We\ninterestingly find that safety fine-tuned LLMs such as GPT-4o are vulnerable to\nproducing natural jailbreak questions from unsafe content (without denial) and\ncan thus be used for the latter (A to Q) step. We obtain attack success rates\nthat are comparable to/ better than leading adversarial attack methods on the\nJailbreakBench leaderboard, while being significantly more stable against\ndefenses such as Smooth-LLM and Synonym Substitution, which are effective\nagainst existing all attacks on the leaderboard.\n","authors":["Sravanti Addepalli","Yerram Varun","Arun Suggala","Karthikeyan Shanmugam","Prateek Jain"],"pdf_url":"https://arxiv.org/pdf/2412.03235v1.pdf","comment":"Accepted at the Safe Generative AI Workshop @ NeurIPS 2024"},{"id":"http://arxiv.org/abs/2404.11243v4","updated":"2024-12-04T11:23:37Z","published":"2024-04-17T10:49:00Z","title":"Multi-Sensor Diffusion-Driven Optical Image Translation for Large-Scale\n Applications","summary":" Comparing images captured by disparate sensors is a common challenge in\nremote sensing. This requires image translation -- converting imagery from one\nsensor domain to another while preserving the original content. Denoising\nDiffusion Implicit Models (DDIM) are potential state-of-the-art solutions for\nsuch domain translation due to their proven superiority in multiple\nimage-to-image translation tasks in computer vision. However, these models\nstruggle with reproducing radiometric features of large-scale multi-patch\nimagery, resulting in inconsistencies across the full image. This renders\ndownstream tasks like Heterogeneous Change Detection impractical. 
To overcome\nthese limitations, we propose a method that leverages denoising diffusion for\neffective multi-sensor optical image translation over large areas. Our approach\nsuper-resolves large-scale low spatial resolution images into high-resolution\nequivalents from disparate optical sensors, ensuring uniformity across hundreds\nof patches. Our contributions lie in new forward and reverse diffusion\nprocesses that address the challenges of large-scale image translation.\nExtensive experiments using paired Sentinel-II (10m) and Planet Dove (3m)\nimages demonstrate that our approach provides precise domain adaptation,\npreserving image content while improving radiometric accuracy and feature\nrepresentation. A thorough image quality assessment and comparisons with the\nstandard DDIM framework and five other leading methods are presented. We reach\na mean Learned Perceptual Image Patch Similarity (mLPIPS) of 0.1884 and a\nFr\\'echet Inception Distance (FID) of 45.64, expressively outperforming all\ncompared methods, including DDIM, ShuffleMixer, and SwinIR. The usefulness of\nour approach is further demonstrated in two Heterogeneous Change Detection\ntasks.\n","authors":["João Gabriel Vinholi","Marco Chini","Anis Amziane","Renato Machado","Danilo Silva","Patrick Matgen"],"pdf_url":"https://arxiv.org/pdf/2404.11243v4.pdf","comment":"This is the accepted version of the manuscript published in IEEE\n Journal of Selected Topics in Applied Earth Observations and Remote Sensing\n (JSTARS). Please access the final version at IEEEXplore (Open Access). DOI\n 10.1109/JSTARS.2024.3506032. 
This technology is protected by a patent filed\n on 23 december 2023 at Office Luxembourgeois de la propri\\'et\\'e\n intellectuelle (LU505861)"},{"id":"http://arxiv.org/abs/2407.05650v2","updated":"2024-12-04T11:12:42Z","published":"2024-07-08T06:22:10Z","title":"The Cooperative Network Architecture: Learning Structured Networks as\n Representation of Sensory Patterns","summary":" Nets, cooperative networks of neurons, have been proposed as format for the\nrepresentation of sensory signals, as physical implementation of the Gestalt\nphenomenon and as solution to the neural binding problem, while the direct\ninteraction between nets by structure-sensitive matching has been proposed as\nbasis for object-global operations such as object detection. The nets are\nflexibly composed of overlapping net fragments, which are learned from\nstatistical regularities of sensory input. We here present the cooperative\nnetwork architecture (CNA), a concrete model that learns such net structure to\nrepresent input patterns and deals robustly with noise, deformation, and\nout-of-distribution data, thus laying the groundwork for a novel neural\narchitecture.\n","authors":["Pascal J. Sager","Jan M. Deriu","Benjamin F. Grewe","Thilo Stadelmann","Christoph von der Malsburg"],"pdf_url":"https://arxiv.org/pdf/2407.05650v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03213v1","updated":"2024-12-04T10:58:27Z","published":"2024-12-04T10:58:27Z","title":"ClusterKV: Manipulating LLM KV Cache in Semantic Space for Recallable\n Compression","summary":" Large Language Models (LLMs) have been widely deployed in a variety of\napplications, and the context length is rapidly increasing to handle tasks such\nas long-document QA and complex logical reasoning. 
However, long context poses\nsignificant challenges for inference efficiency, including high memory costs of\nkey-value (KV) cache and increased latency due to extensive memory accesses.\nRecent works have proposed compressing KV cache to approximate computation, but\nthese methods either evict tokens permanently, never recalling them for later\ninference, or recall previous tokens at the granularity of pages divided by\ntextual positions. Both approaches degrade the model accuracy and output\nquality. To achieve efficient and accurate recallable KV cache compression, we\nintroduce ClusterKV, which recalls tokens at the granularity of semantic\nclusters. We design and implement efficient algorithms and systems for\nclustering, selection, indexing and caching. Experiment results show that\nClusterKV attains negligible accuracy loss across various tasks with 32k\ncontext lengths, using only a 1k to 2k KV cache budget, and achieves up to a\n2$\\times$ speedup in latency and a 2.5$\\times$ improvement in decoding\nthroughput. Compared to SoTA recallable KV compression methods, ClusterKV\ndemonstrates higher model accuracy and output quality, while maintaining or\nexceeding inference efficiency.\n","authors":["Guangda Liu","Chengwei Li","Jieru Zhao","Chenqi Zhang","Minyi Guo"],"pdf_url":"https://arxiv.org/pdf/2412.03213v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03098v2","updated":"2024-12-04T10:52:25Z","published":"2024-11-05T13:44:25Z","title":"Local Lesion Generation is Effective for Capsule Endoscopy Image Data\n Augmentation in a Limited Data Setting","summary":" Limited medical imaging datasets challenge deep learning models by increasing\nrisks of overfitting and reduced generalization, particularly in Generative\nAdversarial Networks (GANs), where discriminators may overfit, leading to\ntraining divergence. This constraint also impairs classification models trained\non small datasets. 
Generative Data Augmentation (GDA) addresses this by\nexpanding training datasets with synthetic data, although it requires training\na generative model. We propose and evaluate two local lesion generation\napproaches to address the challenge of augmenting small medical image datasets.\nThe first approach employs the Poisson Image Editing algorithm, a classical\nimage processing technique, to create realistic image composites that\noutperform current state-of-the-art methods. The second approach introduces a\nnovel generative method, leveraging a fine-tuned Image Inpainting GAN to\nsynthesize realistic lesions within specified regions of real training images.\nA comprehensive comparison of the two proposed methods demonstrates that\neffective local lesion generation in a data-constrained setting allows for\nreaching new state-of-the-art results in capsule endoscopy lesion\nclassification. Combination of our techniques achieves a macro F1-score of\n33.07%, surpassing the previous best result by 7.84 percentage points (p.p.) on\nthe highly imbalanced Kvasir Capsule Dataset, a benchmark for capsule\nendoscopy. To the best of our knowledge, this work is the first to apply a\nfine-tuned Image Inpainting GAN for GDA in medical imaging, demonstrating that\nan image-conditional GAN can be adapted effectively to limited datasets to\ngenerate high-quality examples, facilitating effective data augmentation.\nAdditionally, we show that combining this GAN-based approach with classical\nimage processing techniques further improves the results.\n","authors":["Adrian B. Chłopowiec","Adam R. 
Chłopowiec","Krzysztof Galus","Wojciech Cebula","Martin Tabakov"],"pdf_url":"https://arxiv.org/pdf/2411.03098v2.pdf","comment":"54 pages, 35 figures"},{"id":"http://arxiv.org/abs/2411.00850v2","updated":"2024-12-04T10:45:41Z","published":"2024-10-30T11:16:04Z","title":"GWQ: Gradient-Aware Weight Quantization for Large Language Models","summary":" Large language models (LLMs) show impressive performance in solving complex\nlanguage tasks. However, its large number of parameters present significant\nchallenges for the deployment and application of the model on edge devices.\nCompressing large language models to low bits can enable them to run on\nresource-constrained devices, often leading to performance degradation. To\naddress this problem, we propose gradient-aware weight quantization (GWQ), the\nfirst quantization approach for low-bit weight quantization that leverages\ngradients to localize outliers, requiring only a minimal amount of calibration\ndata for outlier detection. GWQ retains the weights corresponding to the top 1%\noutliers preferentially at FP16 precision, while the remaining non-outlier\nweights are stored in a low-bit format. GWQ found experimentally that utilizing\nthe sensitive weights in the gradient localization model is more scientific\ncompared to utilizing the sensitive weights in the Hessian matrix localization\nmodel. Compared to current quantization methods, GWQ can be applied to multiple\nlanguage models and achieves lower PPL on the WikiText2 and C4 dataset. In the\nzero-shot task, GWQ quantized models have higher accuracy compared to other\nquantization methods. GWQ is also suitable for multimodal model quantization,\nand the quantized Qwen-VL family model is more accurate than other methods.\nZero-shot target detection task dataset RefCOCO outperforms the current\nstat-of-the-arts method SPQR. 
GWQ achieves 1.2 times inference speedup in\ncomparison to the original model, and effectively reduces the inference memory.\n","authors":["Yihua Shao","Siyu Liang","Zijian Ling","Minxi Yan","Haiyang Liu","Siyu Chen","Ziyang Yan","Chenyu Zhang","Haotong Qin","Michele Magno","Yang Yang","Zhen Lei","Yan Wang","Jingcai Guo","Ling Shao","Hao Tang"],"pdf_url":"https://arxiv.org/pdf/2411.00850v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03205v1","updated":"2024-12-04T10:44:50Z","published":"2024-12-04T10:44:50Z","title":"U-MATH: A University-Level Benchmark for Evaluating Mathematical Skills\n in LLMs","summary":" The current evaluation of mathematical skills in LLMs is limited, as existing\nbenchmarks are either relatively small, primarily focus on elementary and\nhigh-school problems, or lack diversity in topics. Additionally, the inclusion\nof visual elements in tasks remains largely under-explored.\n To address these gaps, we introduce U-MATH, a novel benchmark of 1,100\nunpublished open-ended university-level problems sourced from teaching\nmaterials. It is balanced across six core subjects, with 20% of multimodal\nproblems. Given the open-ended nature of U-MATH problems, we employ an LLM to\njudge the correctness of generated solutions. To this end, we release\n$\\mu$-MATH, a dataset to evaluate the LLMs' capabilities in judging solutions.\n The evaluation of general domain, math-specific, and multimodal LLMs\nhighlights the challenges presented by U-MATH. Our findings reveal that LLMs\nachieve a maximum accuracy of only 63% on text-based tasks, with even lower 45%\non visual problems. 
The solution assessment proves challenging for LLMs, with\nthe best LLM judge having an F1-score of 80% on $\\mu$-MATH.\n","authors":["Konstantin Chernyshev","Vitaliy Polshkov","Ekaterina Artemova","Alex Myasnikov","Vlad Stepanov","Alexei Miasnikov","Sergei Tilga"],"pdf_url":"https://arxiv.org/pdf/2412.03205v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18053v3","updated":"2024-12-04T10:35:51Z","published":"2024-09-26T16:58:04Z","title":"DualAD: Dual-Layer Planning for Reasoning in Autonomous Driving","summary":" We present a novel autonomous driving framework, DualAD, designed to imitate\nhuman reasoning during driving. DualAD comprises two layers: a rule-based\nmotion planner at the bottom layer that handles routine driving tasks requiring\nminimal reasoning, and an upper layer featuring a rule-based text encoder that\nconverts driving scenarios from absolute states into text description. This\ntext is then processed by a large language model (LLM) to make driving\ndecisions. The upper layer intervenes in the bottom layer's decisions when\npotential danger is detected, mimicking human reasoning in critical situations.\nClosed-loop experiments demonstrate that DualAD, using a zero-shot pre-trained\nmodel, significantly outperforms rule-based motion planners that lack reasoning\nabilities. Our experiments also highlight the effectiveness of the text\nencoder, which considerably enhances the model's scenario understanding.\nAdditionally, the integrated DualAD model improves with stronger LLMs,\nindicating the framework's potential for further enhancement. 
Code and\nbenchmarks are available at github.com/TUM-AVS/DualAD.\n","authors":["Dingrui Wang","Marc Kaufeld","Johannes Betz"],"pdf_url":"https://arxiv.org/pdf/2409.18053v3.pdf","comment":"Autonomous Driving, Large Language Models (LLMs), Human Reasoning,\n Critical Scenario"},{"id":"http://arxiv.org/abs/2403.08004v2","updated":"2024-12-04T10:35:25Z","published":"2024-03-12T18:12:50Z","title":"Leveraging LLMs for On-the-Fly Instruction Guided Image Editing","summary":" The combination of language processing and image processing keeps attracting\nincreased interest given recent impressive advances that leverage the combined\nstrengths of both domains of research. Among these advances, the task of\nediting an image on the basis solely of a natural language instruction stands\nout as a most challenging endeavour. While recent approaches for this task\nresort, in one way or other, to some form of preliminary preparation, training\nor fine-tuning, this paper explores a novel approach: We propose a\npreparation-free method that permits instruction-guided image editing on the\nfly. This approach is organized along three steps properly orchestrated that\nresort to image captioning and DDIM inversion, followed by obtaining the edit\ndirection embedding, followed by image editing proper. 
While dispensing with\npreliminary preparation, our approach demonstrates to be effective and\ncompetitive, outperforming recent, state of the art models for this task when\nevaluated on the MAGICBRUSH dataset.\n","authors":["Rodrigo Santos","João Silva","António Branco"],"pdf_url":"https://arxiv.org/pdf/2403.08004v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06209v3","updated":"2024-12-04T10:33:18Z","published":"2024-04-09T10:58:21Z","title":"Elephants Never Forget: Memorization and Learning of Tabular Data in\n Large Language Models","summary":" While many have shown how Large Language Models (LLMs) can be applied to a\ndiverse set of tasks, the critical issues of data contamination and\nmemorization are often glossed over. In this work, we address this concern for\ntabular data. Specifically, we introduce a variety of different techniques to\nassess whether a language model has seen a tabular dataset during training.\nThis investigation reveals that LLMs have memorized many popular tabular\ndatasets verbatim. We then compare the few-shot learning performance of LLMs on\ndatasets that were seen during training to the performance on datasets released\nafter training. We find that LLMs perform better on datasets seen during\ntraining, indicating that memorization leads to overfitting. At the same time,\nLLMs show non-trivial performance on novel datasets and are surprisingly robust\nto data transformations. We then investigate the in-context statistical\nlearning abilities of LLMs. While LLMs are significantly better than random at\nsolving statistical classification problems, the sample efficiency of few-shot\nlearning lags behind traditional statistical learning algorithms, especially as\nthe dimension of the problem increases. This suggests that much of the observed\nfew-shot performance on novel real-world datasets is due to the LLM's world\nknowledge. 
Overall, our results highlight the importance of testing whether an\nLLM has seen an evaluation dataset during pre-training. We release the\nhttps://github.com/interpretml/LLM-Tabular-Memorization-Checker Python package\nto test LLMs for memorization of tabular datasets.\n","authors":["Sebastian Bordt","Harsha Nori","Vanessa Rodrigues","Besmira Nushi","Rich Caruana"],"pdf_url":"https://arxiv.org/pdf/2404.06209v3.pdf","comment":"COLM camera ready, fix typo"},{"id":"http://arxiv.org/abs/2412.03188v1","updated":"2024-12-04T10:20:21Z","published":"2024-12-04T10:20:21Z","title":"Semi-decentralized Training of Spatio-Temporal Graph Neural Networks for\n Traffic Prediction","summary":" In smart mobility, large networks of geographically distributed sensors\nproduce vast amounts of high-frequency spatio-temporal data that must be\nprocessed in real time to avoid major disruptions. Traditional centralized\napproaches are increasingly unsuitable to this task, as they struggle to scale\nwith expanding sensor networks, and reliability issues in central components\ncan easily affect the whole deployment. To address these challenges, we explore\nand adapt semi-decentralized training techniques for Spatio-Temporal Graph\nNeural Networks (ST-GNNs) in smart mobility domain. We implement a simulation\nframework where sensors are grouped by proximity into multiple cloudlets, each\nhandling a subgraph of the traffic graph, fetching node features from other\ncloudlets to train its own local ST-GNN model, and exchanging model updates\nwith other cloudlets to ensure consistency, enhancing scalability and removing\nreliance on a centralized aggregator. We perform extensive comparative\nevaluation of four different ST-GNN training setups -- centralized, traditional\nFL, server-free FL, and Gossip Learning -- on large-scale traffic datasets, the\nMETR-LA and PeMS-BAY datasets, for short-, mid-, and long-term vehicle speed\npredictions. 
Experimental results show that semi-decentralized setups are\ncomparable to centralized approaches in performance metrics, while offering\nadvantages in terms of scalability and fault tolerance. In addition, we\nhighlight often overlooked issues in existing literature for distributed\nST-GNNs, such as the variation in model performance across different\ngeographical areas due to region-specific traffic patterns, and the significant\ncommunication overhead and computational costs that arise from the large\nreceptive field of GNNs, leading to substantial data transfers and increased\ncomputation of partial embeddings.\n","authors":["Ivan Kralj","Lodovico Giaretta","Gordan Ježić","Ivana Podnar Žarko","Šarūnas Girdzijauskas"],"pdf_url":"https://arxiv.org/pdf/2412.03188v1.pdf","comment":"8 pages, 4 figures, 3 tables, conference"},{"id":"http://arxiv.org/abs/2412.03179v1","updated":"2024-12-04T10:05:47Z","published":"2024-12-04T10:05:47Z","title":"Optimizing Dense Visual Predictions Through Multi-Task Coherence and\n Prioritization","summary":" Multi-Task Learning (MTL) involves the concurrent training of multiple tasks,\noffering notable advantages for dense prediction tasks in computer vision. MTL\nnot only reduces training and inference time as opposed to having multiple\nsingle-task models, but also enhances task accuracy through the interaction of\nmultiple tasks. However, existing methods face limitations. They often rely on\nsuboptimal cross-task interactions, resulting in task-specific predictions with\npoor geometric and predictive coherence. In addition, many approaches use\ninadequate loss weighting strategies, which do not address the inherent\nvariability in task evolution during training. To overcome these challenges, we\npropose an advanced MTL model specifically designed for dense vision tasks. Our\nmodel leverages state-of-the-art vision transformers with task-specific\ndecoders. 
To enhance cross-task coherence, we introduce a trace-back method\nthat improves both cross-task geometric and predictive features. Furthermore,\nwe present a novel dynamic task balancing approach that projects task losses\nonto a common scale and prioritizes more challenging tasks during training.\nExtensive experiments demonstrate the superiority of our method, establishing\nnew state-of-the-art performance across two benchmark datasets. The code is\navailable at:https://github.com/Klodivio355/MT-CP\n","authors":["Maxime Fontana","Michael Spratling","Miaojing Shi"],"pdf_url":"https://arxiv.org/pdf/2412.03179v1.pdf","comment":"Accepted by WACV 2025"},{"id":"http://arxiv.org/abs/2412.03178v1","updated":"2024-12-04T10:03:52Z","published":"2024-12-04T10:03:52Z","title":"Towards Understanding and Quantifying Uncertainty for Text-to-Image\n Generation","summary":" Uncertainty quantification in text-to-image (T2I) generative models is\ncrucial for understanding model behavior and improving output reliability. In\nthis paper, we are the first to quantify and evaluate the uncertainty of T2I\nmodels with respect to the prompt. Alongside adapting existing approaches\ndesigned to measure uncertainty in the image space, we also introduce\nPrompt-based UNCertainty Estimation for T2I models (PUNC), a novel method\nleveraging Large Vision-Language Models (LVLMs) to better address uncertainties\narising from the semantics of the prompt and generated images. PUNC utilizes a\nLVLM to caption a generated image, and then compares the caption with the\noriginal prompt in the more semantically meaningful text space. PUNC also\nenables the disentanglement of both aleatoric and epistemic uncertainties via\nprecision and recall, which image-space approaches are unable to do. Extensive\nexperiments demonstrate that PUNC outperforms state-of-the-art uncertainty\nestimation techniques across various settings. 
Uncertainty quantification in\ntext-to-image generation models can be used on various applications including\nbias detection, copyright protection, and OOD detection. We also introduce a\ncomprehensive dataset of text prompts and generation pairs to foster further\nresearch in uncertainty quantification for generative models. Our findings\nillustrate that PUNC not only achieves competitive performance but also enables\nnovel applications in evaluating and improving the trustworthiness of\ntext-to-image models.\n","authors":["Gianni Franchi","Dat Nguyen Trong","Nacim Belkhir","Guoxuan Xia","Andrea Pilzer"],"pdf_url":"https://arxiv.org/pdf/2412.03178v1.pdf","comment":"28 pages and 22 figures"},{"id":"http://arxiv.org/abs/2407.15017v4","updated":"2024-12-04T09:54:59Z","published":"2024-07-22T06:15:59Z","title":"Knowledge Mechanisms in Large Language Models: A Survey and Perspective","summary":" Understanding knowledge mechanisms in Large Language Models (LLMs) is crucial\nfor advancing towards trustworthy AGI. This paper reviews knowledge mechanism\nanalysis from a novel taxonomy including knowledge utilization and evolution.\nKnowledge utilization delves into the mechanism of memorization, comprehension\nand application, and creation. Knowledge evolution focuses on the dynamic\nprogression of knowledge within individual and group LLMs. Moreover, we discuss\nwhat knowledge LLMs have learned, the reasons for the fragility of parametric\nknowledge, and the potential dark knowledge (hypothesis) that will be\nchallenging to address. 
We hope this work can help understand knowledge in LLMs\nand provide insights for future research.\n","authors":["Mengru Wang","Yunzhi Yao","Ziwen Xu","Shuofei Qiao","Shumin Deng","Peng Wang","Xiang Chen","Jia-Chen Gu","Yong Jiang","Pengjun Xie","Fei Huang","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.15017v4.pdf","comment":"EMNLP 2024 Findings; 39 pages (v4)"},{"id":"http://arxiv.org/abs/2412.01289v2","updated":"2024-12-04T09:51:16Z","published":"2024-12-02T09:02:28Z","title":"Enhancing Perception Capabilities of Multimodal LLMs with Training-Free\n Fusion","summary":" Multimodal LLMs (MLLMs) equip language models with visual capabilities by\naligning vision encoders with language models. Existing methods to enhance the\nvisual perception of MLLMs often involve designing more powerful vision\nencoders, which requires exploring a vast design space and re-aligning each\npotential encoder with the language model, resulting in prohibitively high\ntraining costs. In this paper, we introduce VisionFuse, a novel integration\nframework that efficiently utilizes multiple vision encoders from off-the-shelf\nMLLMs to enhance visual perception without requiring additional training. Our\napproach is motivated by the observation that different MLLMs tend to focus on\ndistinct regions given the same query and image. Moreover, we find that the\nfeature distributions of vision encoders within an MLLM family, a group of\nMLLMs sharing the same pretrained LLM, are highly aligned. Building on these\ninsights, VisionFuse enriches the visual context by concatenating the tokens\ngenerated by the vision encoders of selected MLLMs within a family. By merging\nthe parameters of language models from these MLLMs, VisionFuse allows a single\nlanguage model to align with various vision encoders, significantly reducing\ndeployment overhead. 
We conduct comprehensive evaluations across multiple\nmultimodal benchmarks using various MLLM combinations, demonstrating\nsubstantial improvements in multimodal tasks. Notably, when integrating\nMiniGemini-8B and SLIME-8B, VisionFuse achieves an average performance increase\nof over 4%.\n","authors":["Zhuokun Chen","Jinwu Hu","Zeshuai Deng","Yufeng Wang","Bohan Zhuang","Mingkui Tan"],"pdf_url":"https://arxiv.org/pdf/2412.01289v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.18908v2","updated":"2024-12-04T09:47:37Z","published":"2024-02-29T07:08:18Z","title":"Facility Location Games with Scaling Effects","summary":" We take the classic facility location problem and consider a variation, in\nwhich each agent's individual cost function is equal to their distance from the\nfacility multiplied by a scaling factor which is determined by the facility\nplacement. In addition to the general class of continuous scaling functions, we\nalso provide results for piecewise linear scaling functions which can\neffectively approximate or model the scaling of many real world scenarios. We\nfocus on the objectives of total and maximum cost, describing the computation\nof the optimal solution. We then move to the approximate mechanism design\nsetting, observing that the agents' preferences may no longer be single-peaked.\nConsequently, we characterize the conditions on scaling functions which ensure\nthat agents have single-peaked preferences. 
Under these conditions, we find a\ncharacterization of continuous, strategyproof, and anonymous mechanisms, and\ncompute the total and maximum cost approximation ratios achievable by these\nmechanisms.\n","authors":["Yu He","Alexander Lam","Minming Li"],"pdf_url":"https://arxiv.org/pdf/2402.18908v2.pdf","comment":"This is an updated version of the paper which appeared at the 23rd\n International Conference on Autonomous Agents and Multi-Agent Systems\n (AAMAS-24)"},{"id":"http://arxiv.org/abs/2412.01064v2","updated":"2024-12-04T09:43:18Z","published":"2024-12-02T02:50:07Z","title":"FLOAT: Generative Motion Latent Flow Matching for Audio-driven Talking\n Portrait","summary":" With the rapid advancement of diffusion-based generative models, portrait\nimage animation has achieved remarkable results. However, it still faces\nchallenges in temporally consistent video generation and fast sampling due to\nits iterative sampling nature. This paper presents FLOAT, an audio-driven\ntalking portrait video generation method based on flow matching generative\nmodel. We shift the generative modeling from the pixel-based latent space to a\nlearned motion latent space, enabling efficient design of temporally consistent\nmotion. To achieve this, we introduce a transformer-based vector field\npredictor with a simple yet effective frame-wise conditioning mechanism.\nAdditionally, our method supports speech-driven emotion enhancement, enabling a\nnatural incorporation of expressive motions. 
Extensive experiments demonstrate\nthat our method outperforms state-of-the-art audio-driven talking portrait\nmethods in terms of visual quality, motion fidelity, and efficiency.\n","authors":["Taekyung Ki","Dongchan Min","Gyeongsu Chae"],"pdf_url":"https://arxiv.org/pdf/2412.01064v2.pdf","comment":"Project page: https://deepbrainai-research.github.io/float/"},{"id":"http://arxiv.org/abs/2412.03161v1","updated":"2024-12-04T09:38:58Z","published":"2024-12-04T09:38:58Z","title":"Physics-Informed Deep Inverse Operator Networks for Solving PDE Inverse\n Problems","summary":" Inverse problems involving partial differential equations (PDEs) can be seen\nas discovering a mapping from measurement data to unknown quantities, often\nframed within an operator learning approach. However, existing methods\ntypically rely on large amounts of labeled training data, which is impractical\nfor most real-world applications. Moreover, these supervised models may fail to\ncapture the underlying physical principles accurately. To address these\nlimitations, we propose a novel architecture called Physics-Informed Deep\nInverse Operator Networks (PI-DIONs), which can learn the solution operator of\nPDE-based inverse problems without labeled training data. We extend the\nstability estimates established in the inverse problem literature to the\noperator learning framework, thereby providing a robust theoretical foundation\nfor our method. These estimates guarantee that the proposed model, trained on a\nfinite sample and grid, generalizes effectively across the entire domain and\nfunction space. 
Extensive experiments are conducted to demonstrate that\nPI-DIONs can effectively and accurately learn the solution operators of the\ninverse problems without the need for labeled data.\n","authors":["Sung Woong Cho","Hwijae Son"],"pdf_url":"https://arxiv.org/pdf/2412.03161v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.00809v2","updated":"2024-12-04T09:26:47Z","published":"2024-10-23T16:16:15Z","title":"Adaptive Dense Reward: Understanding the Gap Between Action and Reward\n Space in Alignment","summary":" Reinforcement Learning from Human Feedback (RLHF) has proven highly effective\nin aligning Large Language Models (LLMs) with human preferences. However, the\noriginal RLHF typically optimizes under an overall reward, which can lead to a\nsuboptimal learning process. This limitation stems from RLHF's lack of\nawareness regarding which specific tokens should be reinforced or suppressed.\nMoreover, conflicts in supervision can arise, for instance, when a chosen\nresponse includes erroneous tokens, while a rejected response contains accurate\nelements. To rectify these shortcomings, increasing dense reward methods, such\nas step-wise and token-wise RLHF, have been proposed. However, these existing\nmethods are limited to specific tasks (like mathematics). In this paper, we\npropose the ``Adaptive Message-wise RLHF'' method, which robustly applies to\nvarious tasks. By defining pivot tokens as key indicators, our approach\nadaptively identifies essential information and converts sequence-level\nsupervision into fine-grained, subsequence-level supervision. This aligns the\ndensity of rewards and action spaces more closely with the information density\nof the input. Experiments demonstrate that our method can be integrated into\nvarious training methods, significantly mitigating hallucinations and\ncatastrophic forgetting problems, while outperforming other methods on multiple\nevaluation metrics. 
Our method improves the success rate on adversarial samples\nby 10\\% compared to the sample-wise approach, and achieves a 1.3\\% improvement\non evaluation benchmarks such as MMLU, GSM8K, HumanEval, etc.\n","authors":["Yanshi Li","Shaopan Xiong","Gengru Chen","Xiaoyang Li","Yijia Luo","Xingyao Zhang","Yanhui Huang","Xingyuan Bu","Yingshui Tan","Chun Yuan","Jiamang Wang","Wenbo Su","Bo Zheng"],"pdf_url":"https://arxiv.org/pdf/2411.00809v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03154v1","updated":"2024-12-04T09:24:33Z","published":"2024-12-04T09:24:33Z","title":"Testing Neural Network Verifiers: A Soundness Benchmark with Hidden\n Counterexamples","summary":" In recent years, many neural network (NN) verifiers have been developed to\nformally verify certain properties of neural networks such as robustness.\nAlthough many benchmarks have been constructed to evaluate the performance of\nNN verifiers, they typically lack a ground-truth for hard instances where no\ncurrent verifier can verify and no counterexample can be found, which makes it\ndifficult to check the soundness of a new verifier if it claims to verify hard\ninstances which no other verifier can do. We propose to develop a soundness\nbenchmark for NN verification. Our benchmark contains instances with\ndeliberately inserted counterexamples while we also try to hide the\ncounterexamples from regular adversarial attacks which can be used for finding\ncounterexamples. We design a training method to produce neural networks with\nsuch hidden counterexamples. Our benchmark aims to be used for testing the\nsoundness of NN verifiers and identifying falsely claimed verifiability when it\nis known that hidden counterexamples exist. We systematically construct our\nbenchmark and generate instances across diverse model architectures, activation\nfunctions, input sizes, and perturbation radii. 
We demonstrate that our\nbenchmark successfully identifies bugs in state-of-the-art NN verifiers, as\nwell as synthetic bugs, providing a crucial step toward enhancing the\nreliability of testing NN verifiers. Our code is available at\nhttps://github.com/MVP-Harry/SoundnessBench and our benchmark is available at\nhttps://huggingface.co/datasets/SoundnessBench/SoundnessBench.\n","authors":["Xingjian Zhou","Hongji Xu","Andy Xu","Zhouxing Shi","Cho-Jui Hsieh","Huan Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.03154v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2412.03152v1","updated":"2024-12-04T09:21:46Z","published":"2024-12-04T09:21:46Z","title":"A Measure of the System Dependence of Automated Metrics","summary":" Automated metrics for Machine Translation have made significant progress,\nwith the goal of replacing expensive and time-consuming human evaluations.\nThese metrics are typically assessed by their correlation with human judgments,\nwhich captures the monotonic relationship between human and metric scores.\nHowever, we argue that it is equally important to ensure that metrics treat all\nsystems fairly and consistently. In this paper, we introduce a method to\nevaluate this aspect.\n","authors":["Pius von Däniken","Jan Deriu","Mark Cieliebak"],"pdf_url":"https://arxiv.org/pdf/2412.03152v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03151v1","updated":"2024-12-04T09:18:54Z","published":"2024-12-04T09:18:54Z","title":"Large Language Models show both individual and collective creativity\n comparable to humans","summary":" Artificial intelligence has, so far, largely automated routine tasks, but\nwhat does it mean for the future of work if Large Language Models (LLMs) show\ncreativity comparable to humans? 
To measure the creativity of LLMs\nholistically, the current study uses 13 creative tasks spanning three domains.\nWe benchmark the LLMs against individual humans, and also take a novel approach\nby comparing them to the collective creativity of groups of humans. We find\nthat the best LLMs (Claude and GPT-4) rank in the 52nd percentile against\nhumans, and overall LLMs excel in divergent thinking and problem solving but\nlag in creative writing. When questioned 10 times, an LLM's collective\ncreativity is equivalent to 8-10 humans. When more responses are requested, two\nadditional responses of LLMs equal one extra human. Ultimately, LLMs, when\noptimally applied, may compete with a small group of humans in the future of\nwork.\n","authors":["Luning Sun","Yuzhuo Yuan","Yuan Yao","Yanyan Li","Hao Zhang","Xing Xie","Xiting Wang","Fang Luo","David Stillwell"],"pdf_url":"https://arxiv.org/pdf/2412.03151v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03148v1","updated":"2024-12-04T09:14:56Z","published":"2024-12-04T09:14:56Z","title":"Fine-Grained Behavior Simulation with Role-Playing Large Language Model\n on Social Media","summary":" Large language models (LLMs) have demonstrated impressive capabilities in\nrole-playing tasks. However, there is limited research on whether LLMs can\naccurately simulate user behavior in real-world scenarios, such as social\nmedia. This requires models to effectively analyze a user's history and\nsimulate their role. In this paper, we introduce \\textbf{FineRob}, a novel\nfine-grained behavior simulation dataset. We collect the complete behavioral\nhistory of 1,866 distinct users across three social media platforms. Each\nbehavior is decomposed into three fine-grained elements: object, type, and\ncontent, resulting in 78.6k QA records. Based on FineRob, we identify two\ndominant reasoning patterns in LLMs' behavior simulation processes and propose\nthe \\textbf{OM-CoT} fine-tuning method to enhance the capability. 
Through\ncomprehensive experiments, we conduct an in-depth analysis of key factors of\nbehavior simulation and also demonstrate the effectiveness of OM-CoT\napproach\\footnote{Code and dataset are available at\n\\url{https://github.com/linkseed18612254945/FineRob}}\n","authors":["Kun Li","Chenwei Dai","Wei Zhou","Songlin Hu"],"pdf_url":"https://arxiv.org/pdf/2412.03148v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03123v1","updated":"2024-12-04T08:43:12Z","published":"2024-12-04T08:43:12Z","title":"Robust Multi-bit Text Watermark with LLM-based Paraphrasers","summary":" We propose an imperceptible multi-bit text watermark embedded by paraphrasing\nwith LLMs. We fine-tune a pair of LLM paraphrasers that are designed to behave\ndifferently so that their paraphrasing difference reflected in the text\nsemantics can be identified by a trained decoder. To embed our multi-bit\nwatermark, we use two paraphrasers alternatively to encode the pre-defined\nbinary code at the sentence level. Then we use a text classifier as the decoder\nto decode each bit of the watermark. Through extensive experiments, we show\nthat our watermarks can achieve over 99.99\\% detection AUC with small (1.1B)\ntext paraphrasers while keeping the semantic information of the original\nsentence. More importantly, our pipeline is robust under word substitution and\nsentence paraphrasing perturbations and generalizes well to\nout-of-distributional data. We also show the stealthiness of our watermark with\nLLM-based evaluation. 
We open-source the code:\nhttps://github.com/xiaojunxu/multi-bit-text-watermark.\n","authors":["Xiaojun Xu","Jinghan Jia","Yuanshun Yao","Yang Liu","Hang Li"],"pdf_url":"https://arxiv.org/pdf/2412.03123v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.16730v3","updated":"2024-12-04T08:21:17Z","published":"2024-11-23T09:32:44Z","title":"\"Moralized\" Multi-Step Jailbreak Prompts: Black-Box Testing of\n Guardrails in Large Language Models for Verbal Attacks","summary":" As the application of large language models continues to expand in various\nfields, it poses higher challenges to the effectiveness of identifying harmful\ncontent generation and guardrail mechanisms. This research aims to evaluate the\nguardrail effectiveness of GPT-4o, Grok-2 Beta, Llama 3.1 (405B), Gemini 1.5,\nand Claude 3.5 Sonnet through black-box testing of seemingly ethical multi-step\njailbreak prompts. It conducts ethical attacks by designing an identical\nmulti-step prompts that simulates the scenario of \"corporate middle managers\ncompeting for promotions.\" The data results show that the guardrails of the\nabove-mentioned LLMs were bypassed and the content of verbal attacks was\ngenerated. Claude 3.5 Sonnet's resistance to multi-step jailbreak prompts is\nmore obvious. To ensure objectivity, the experimental process, black box test\ncode, and enhanced guardrail code are uploaded to the GitHub repository:\nhttps://github.com/brucewang123456789/GeniusTrail.git.\n","authors":["Libo Wang"],"pdf_url":"https://arxiv.org/pdf/2411.16730v3.pdf","comment":"This paper has been submitted to Nature Machine Intelligence and\n OpenReview preprints. 
It has 7 pages of text, 3 figures, and 3 tables"},{"id":"http://arxiv.org/abs/2412.03111v1","updated":"2024-12-04T08:20:03Z","published":"2024-12-04T08:20:03Z","title":"Experience-driven discovery of planning strategies","summary":" One explanation for how people can plan efficiently despite limited cognitive\nresources is that we possess a set of adaptive planning strategies and know\nwhen and how to use them. But how are these strategies acquired? While previous\nresearch has studied how individuals learn to choose among existing strategies,\nlittle is known about the process of forming new planning strategies. In this\nwork, we propose that new planning strategies are discovered through\nmetacognitive reinforcement learning. To test this, we designed a novel\nexperiment to investigate the discovery of new planning strategies. We then\npresent metacognitive reinforcement learning models and demonstrate their\ncapability for strategy discovery as well as show that they provide a better\nexplanation of human strategy discovery than alternative learning mechanisms.\nHowever, when fitted to human data, these models exhibit a slower discovery\nrate than humans, leaving room for improvement.\n","authors":["Ruiqi He","Falk Lieder"],"pdf_url":"https://arxiv.org/pdf/2412.03111v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.16436v3","updated":"2024-12-04T08:15:35Z","published":"2024-05-26T05:38:50Z","title":"Provably Mitigating Overoptimization in RLHF: Your SFT Loss is\n Implicitly an Adversarial Regularizer","summary":" Aligning generative models with human preference via RLHF typically suffers\nfrom overoptimization, where an imperfectly learned reward model can misguide\nthe generative model to output undesired responses. We investigate this problem\nin a principled manner by identifying the source of the misalignment as a form\nof distributional shift and uncertainty in learning human preferences. 
To\nmitigate overoptimization, we first propose a theoretical algorithm that\nchooses the best policy for an adversarially chosen reward model; one that\nsimultaneously minimizes the maximum likelihood estimation of the loss and a\nreward penalty term. Here, the reward penalty term is introduced to prevent the\npolicy from choosing actions with spurious high proxy rewards, resulting in\nprovable sample efficiency of the algorithm under a partial coverage style\ncondition. Moving from theory to practice, the proposed algorithm further\nenjoys an equivalent but surprisingly easy-to-implement reformulation. Using\nthe equivalence between reward models and the corresponding optimal policy, the\nalgorithm features a simple objective that combines: (i) a preference\noptimization loss that directly aligns the policy with human preference, and\n(ii) a supervised learning loss that explicitly imitates the policy with a\n(suitable) baseline distribution. In the context of aligning large language\nmodels (LLM), this objective fuses the direct preference optimization (DPO)\nloss with the supervised fine-tuning (SFT) loss to help mitigate the\noveroptimization towards undesired responses, for which we name the algorithm\nRegularized Preference Optimization (RPO). Experiments of aligning LLMs\ndemonstrate the improved performance of RPO compared with DPO baselines. Our\nwork sheds light on the interplay between preference optimization and SFT in\ntuning LLMs with both theoretical guarantees and empirical evidence.\n","authors":["Zhihan Liu","Miao Lu","Shenao Zhang","Boyi Liu","Hongyi Guo","Yingxiang Yang","Jose Blanchet","Zhaoran Wang"],"pdf_url":"https://arxiv.org/pdf/2405.16436v3.pdf","comment":"Accepted by The Thirty-Eighth Annual Conference on Neural Information\n Processing Systems. 
31 pages, 7 figures"},{"id":"http://arxiv.org/abs/2412.03107v1","updated":"2024-12-04T08:13:29Z","published":"2024-12-04T08:13:29Z","title":"CredID: Credible Multi-Bit Watermark for Large Language Models\n Identification","summary":" Large Language Models (LLMs) are widely used in complex natural language\nprocessing tasks but raise privacy and security concerns due to the lack of\nidentity recognition. This paper proposes a multi-party credible watermarking\nframework (CredID) involving a trusted third party (TTP) and multiple LLM\nvendors to address these issues. In the watermark embedding stage, vendors\nrequest a seed from the TTP to generate watermarked text without sending the\nuser's prompt. In the extraction stage, the TTP coordinates each vendor to\nextract and verify the watermark from the text. This provides a credible\nwatermarking scheme while preserving vendor privacy. Furthermore, current\nwatermarking algorithms struggle with text quality, information capacity, and\nrobustness, making it challenging to meet the diverse identification needs of\nLLMs. Thus, we propose a novel multi-bit watermarking algorithm and an\nopen-source toolkit to facilitate research. Experiments show our CredID\nenhances watermark credibility and efficiency without compromising text\nquality. Additionally, we successfully utilized this framework to achieve\nhighly accurate identification among multiple LLM vendors.\n","authors":["Haoyu Jiang","Xuhong Wang","Ping Yi","Shanzhe Lei","Yilun Lin"],"pdf_url":"https://arxiv.org/pdf/2412.03107v1.pdf","comment":"v1"},{"id":"http://arxiv.org/abs/2412.03104v1","updated":"2024-12-04T08:06:15Z","published":"2024-12-04T08:06:15Z","title":"ChatTS: Aligning Time Series with LLMs via Synthetic Data for Enhanced\n Understanding and Reasoning","summary":" Understanding time series is crucial for its application in real-world\nscenarios. 
Recently, large language models (LLMs) have been increasingly\napplied to time series tasks, leveraging their strong language capabilities to\nenhance various applications. However, research on multimodal LLMs (MLLMs) for\ntime series understanding and reasoning remains limited, primarily due to the\nscarcity of high-quality datasets that align time series with textual\ninformation. This paper introduces ChatTS, a novel MLLM designed for time\nseries analysis. ChatTS treats time series as a modality, similar to how vision\nMLLMs process images, enabling it to perform both understanding and reasoning\nwith time series. To address the scarcity of training data, we propose an\nattribute-based method for generating synthetic time series with detailed\nattribute descriptions. We further introduce Time Series Evol-Instruct, a novel\napproach that generates diverse time series Q&As, enhancing the model's\nreasoning capabilities. To the best of our knowledge, ChatTS is the first MLLM\nthat takes multivariate time series as input, which is fine-tuned exclusively\non synthetic datasets. 
We evaluate its performance using benchmark datasets\nwith real-world data, including six alignment tasks and four reasoning tasks.\nOur results show that ChatTS significantly outperforms existing vision-based\nMLLMs (e.g., GPT-4o) and text/agent-based LLMs, achieving a 46.0% improvement\nin alignment tasks and a 25.8% improvement in reasoning tasks.\n","authors":["Zhe Xie","Zeyan Li","Xiao He","Longlong Xu","Xidao Wen","Tieying Zhang","Jianjun Chen","Rui Shi","Dan Pei"],"pdf_url":"https://arxiv.org/pdf/2412.03104v1.pdf","comment":"14 pages, 14 figures"},{"id":"http://arxiv.org/abs/2410.14946v2","updated":"2024-12-04T07:58:40Z","published":"2024-10-19T02:32:09Z","title":"DEL-Ranking: Ranking-Correction Denoising Framework for Elucidating\n Molecular Affinities in DNA-Encoded Libraries","summary":" DNA-encoded library (DEL) screening has revolutionized the detection of\nprotein-ligand interactions through read counts, enabling rapid exploration of\nvast chemical spaces. However, noise in read counts, stemming from nonspecific\ninteractions, can mislead this exploration process. We present DEL-Ranking, a\nnovel distribution-correction denoising framework that addresses these\nchallenges. Our approach introduces two key innovations: (1) a novel ranking\nloss that rectifies relative magnitude relationships between read counts,\nenabling the learning of causal features determining activity levels, and (2)\nan iterative algorithm employing self-training and consistency loss to\nestablish model coherence between activity label and read count predictions.\nFurthermore, we contribute three new DEL screening datasets, the first to\ncomprehensively include multi-dimensional molecular representations,\nprotein-ligand enrichment values, and their activity labels. These datasets\nmitigate data scarcity issues in AI-driven DEL screening research. 
Rigorous\nevaluation on diverse DEL datasets demonstrates DEL-Ranking's superior\nperformance across multiple correlation metrics, with significant improvements\nin binding affinity prediction accuracy. Our model exhibits zero-shot\ngeneralization ability across different protein targets and successfully\nidentifies potential motifs determining compound binding affinity. This work\nadvances DEL screening analysis and provides valuable resources for future\nresearch in this area.\n","authors":["Hanqun Cao","Mutian He","Ning Ma","Chang-yu Hsieh","Chunbin Gu","Pheng-Ann Heng"],"pdf_url":"https://arxiv.org/pdf/2410.14946v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17843v2","updated":"2024-12-04T07:50:27Z","published":"2024-07-25T07:57:55Z","title":"DragText: Rethinking Text Embedding in Point-based Image Editing","summary":" Point-based image editing enables accurate and flexible control through\ncontent dragging. However, the role of text embedding during the editing\nprocess has not been thoroughly investigated. A significant aspect that remains\nunexplored is the interaction between text and image embeddings. During the\nprogressive editing in a diffusion model, the text embedding remains constant.\nAs the image embedding increasingly diverges from its initial state, the\ndiscrepancy between the image and text embeddings presents a significant\nchallenge. In this study, we found that the text prompt significantly\ninfluences the dragging process, particularly in maintaining content integrity\nand achieving the desired manipulation. Upon these insights, we propose\nDragText, which optimizes text embedding in conjunction with the dragging\nprocess to pair with the modified image embedding. Simultaneously, we\nregularize the text optimization process to preserve the integrity of the\noriginal text prompt. 
Our approach can be seamlessly integrated with existing\ndiffusion-based drag methods, enhancing performance with only a few lines of\ncode.\n","authors":["Gayoon Choi","Taejin Jeong","Sujung Hong","Seong Jae Hwang"],"pdf_url":"https://arxiv.org/pdf/2407.17843v2.pdf","comment":"Accepted at WACV 2025; Code is released at\n https://github.com/MICV-yonsei/DragText"},{"id":"http://arxiv.org/abs/2404.16331v2","updated":"2024-12-04T07:47:10Z","published":"2024-04-25T04:37:35Z","title":"IMWA: Iterative Model Weight Averaging Benefits Class-Imbalanced\n Learning Tasks","summary":" Model Weight Averaging (MWA) is a technique that seeks to enhance model's\nperformance by averaging the weights of multiple trained models. This paper\nfirst empirically finds that 1) the vanilla MWA can benefit the\nclass-imbalanced learning, and 2) performing model averaging in the early\nepochs of training yields a greater performance improvement than doing that in\nlater epochs. Inspired by these two observations, in this paper we propose a\nnovel MWA technique for class-imbalanced learning tasks named Iterative Model\nWeight Averaging (IMWA). Specifically, IMWA divides the entire training stage\ninto multiple episodes. Within each episode, multiple models are concurrently\ntrained from the same initialized model weight, and subsequently averaged into\na singular model. Then, the weight of this average model serves as a fresh\ninitialization for the ensuing episode, thus establishing an iterative learning\nparadigm. Compared to vanilla MWA, IMWA achieves higher performance\nimprovements with the same computational cost. Moreover, IMWA can further\nenhance the performance of those methods employing EMA strategy, demonstrating\nthat IMWA and EMA can complement each other. 
Extensive experiments on various\nclass-imbalanced learning tasks, i.e., class-imbalanced image classification,\nsemi-supervised class-imbalanced image classification and semi-supervised\nobject detection tasks showcase the effectiveness of our IMWA.\n","authors":["Zitong Huang","Ze Chen","Bowen Dong","Chaoqi Liang","Erjin Zhou","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2404.16331v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03092v1","updated":"2024-12-04T07:44:35Z","published":"2024-12-04T07:44:35Z","title":"Revolve: Optimizing AI Systems by Tracking Response Evolution in Textual\n Optimization","summary":" Recent advancements in large language models (LLMs) have significantly\nenhanced the ability of LLM-based systems to perform complex tasks through\nnatural language processing and tool interaction. However, optimizing these\nLLM-based systems for specific tasks remains challenging, often requiring\nmanual interventions like prompt engineering and hyperparameter tuning.\nExisting automatic optimization methods, such as textual feedback-based\ntechniques (e.g., TextGrad), tend to focus on immediate feedback, analogous to\nusing immediate derivatives in traditional numerical gradient descent. However,\nrelying solely on such feedback can be limited when the adjustments made in\nresponse to this feedback are either too small or fluctuate irregularly,\npotentially slowing down or even stalling the optimization process. To overcome\nthese challenges, more adaptive methods are needed, especially in situations\nwhere the system's response is evolving slowly or unpredictably. In this paper,\nwe introduce REVOLVE, an optimization method that tracks how \"R\"esponses\n\"EVOLVE\" across iterations in LLM systems. By focusing on the evolution of\nresponses over time, REVOLVE enables more stable and effective optimization by\nmaking thoughtful, progressive adjustments at each step. 
Experimental results\ndemonstrate that REVOLVE outperforms competitive baselines, achieving a 7.8%\nimprovement in prompt optimization, a 20.72% gain in solution refinement, and a\n29.17% increase in code optimization. Additionally, REVOLVE converges in fewer\niterations, resulting in significant computational savings. These advantages\nhighlight its adaptability and efficiency, positioning REVOLVE as a valuable\ntool for optimizing LLM-based systems and accelerating the development of\nnext-generation AI technologies. Code is available at:\nhttps://github.com/Peiyance/REVOLVE.\n","authors":["Peiyan Zhang","Haibo Jin","Leyang Hu","Xinnuo Li","Liying Kang","Man Luo","Yangqiu Song","Haohan Wang"],"pdf_url":"https://arxiv.org/pdf/2412.03092v1.pdf","comment":"20 pages, 2 figures"},{"id":"http://arxiv.org/abs/2410.11374v2","updated":"2024-12-04T07:35:20Z","published":"2024-10-15T08:12:54Z","title":"Preserve or Modify? Context-Aware Evaluation for Balancing Preservation\n and Modification in Text-Guided Image Editing","summary":" The development of vision-language and generative models has significantly\nadvanced text-guided image editing, which seeks the \\textit{preservation} of\ncore elements in the source image while implementing \\textit{modifications}\nbased on the target text. However, existing metrics have a\n\\textbf{context-blindness} problem, indiscriminately applying the same\nevaluation criteria on completely different pairs of source image and target\ntext, biasing towards either modification or preservation. Directional CLIP\nsimilarity, the only metric that considers both source image and target text,\nis also biased towards modification aspects and attends to irrelevant editing\nregions of the image. We propose \\texttt{AugCLIP}, a \\textbf{context-aware}\nmetric that adaptively coordinates preservation and modification aspects,\ndepending on the specific context of a given source image and target text. 
This\nis done by deriving the CLIP representation of an ideally edited image, that\npreserves the source image with necessary modifications to align with target\ntext. More specifically, using a multi-modal large language model,\n\\texttt{AugCLIP} augments the textual descriptions of the source and target,\nthen calculates a modification vector through a hyperplane that separates\nsource and target attributes in CLIP space. Extensive experiments on five\nbenchmark datasets, encompassing a diverse range of editing scenarios, show\nthat \\texttt{AugCLIP} aligns remarkably well with human evaluation standards,\noutperforming existing metrics. The code will be open-sourced for community\nuse.\n","authors":["Yoonjeon Kim","Soohyun Ryu","Yeonsung Jung","Hyunkoo Lee","Joowon Kim","June Yong Yang","Jaeryong Hwang","Eunho Yang"],"pdf_url":"https://arxiv.org/pdf/2410.11374v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2410.07170v2","updated":"2024-12-04T07:18:17Z","published":"2024-10-09T17:59:06Z","title":"One Initialization to Rule them All: Fine-tuning via Explained Variance\n Adaptation","summary":" Foundation models (FMs) are pre-trained on large-scale datasets and then\nfine-tuned on a downstream task for a specific application. The most successful\nand most commonly used fine-tuning method is to update the pre-trained weights\nvia a low-rank adaptation (LoRA). LoRA introduces new weight matrices that are\nusually initialized at random with a uniform rank distribution across the model\nweights. Recent works focus on different initialization schemes or the learning\nof adaptive ranks during fine-tuning. Both approaches have only been\ninvestigated in isolation, resulting in slow convergence or a uniform rank\ndistribution, in turn leading to suboptimal performance. We propose to improve\nLoRA by initializing the new weights in a data-driven manner by computing\nsingular value decomposition (SVD) on minibatches of activation vectors. 
Then,\nwe initialize the LoRA matrices with the obtained right-singular vectors and\nredistribute ranks among all weight matrices to provably store the maximum\namount of information of the downstream data in the newly introduced weights.\nIn this way, only what information to maintain or neglect during the\nfine-tuning process needs to be learned. We call our new method Explained\nVariance Adaptation (EVA). We apply EVA to a variety of fine-tuning tasks\nranging from language generation and understanding to image classification and\nreinforcement learning. EVA exhibits faster convergence than competitors and\nachieves the highest average score across a multitude of tasks per domain while\nreducing the number of trainable parameters through rank redistribution.\n","authors":["Fabian Paischer","Lukas Hauzenberger","Thomas Schmied","Benedikt Alkin","Marc Peter Deisenroth","Sepp Hochreiter"],"pdf_url":"https://arxiv.org/pdf/2410.07170v2.pdf","comment":"11 pages + references and appendix, code available at\n https://github.com/ml-jku/EVA"},{"id":"http://arxiv.org/abs/2410.08631v2","updated":"2024-12-04T06:58:26Z","published":"2024-10-11T08:53:58Z","title":"CryoFM: A Flow-based Foundation Model for Cryo-EM Densities","summary":" Cryo-electron microscopy (cryo-EM) is a powerful technique in structural\nbiology and drug discovery, enabling the study of biomolecules at high\nresolution. Significant advancements by structural biologists using cryo-EM\nhave led to the production of over 38,626 protein density maps at various\nresolutions1. However, cryo-EM data processing algorithms have yet to fully\nbenefit from our knowledge of biomolecular density maps, with only a few recent\nmodels being data-driven but limited to specific tasks. In this study, we\npresent CryoFM, a foundation model designed as a generative model, learning the\ndistribution of high-quality density maps and generalizing effectively to\ndownstream tasks. 
Built on flow matching, CryoFM is trained to accurately\ncapture the prior distribution of biomolecular density maps. Furthermore, we\nintroduce a flow posterior sampling method that leverages CRYOFM as a flexible\nprior for several downstream tasks in cryo-EM and cryo-electron tomography\n(cryo-ET) without the need for fine-tuning, achieving state-of-the-art\nperformance on most tasks and demonstrating its potential as a foundational\nmodel for broader applications in these fields.\n","authors":["Yi Zhou","Yilai Li","Jing Yuan","Quanquan Gu"],"pdf_url":"https://arxiv.org/pdf/2410.08631v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03076v1","updated":"2024-12-04T06:53:59Z","published":"2024-12-04T06:53:59Z","title":"Coordinated Multi-Armed Bandits for Improved Spatial Reuse in Wi-Fi","summary":" Multi-Access Point Coordination (MAPC) and Artificial Intelligence and\nMachine Learning (AI/ML) are expected to be key features in future Wi-Fi, such\nas the forthcoming IEEE 802.11bn (Wi-Fi 8) and beyond. In this paper, we\nexplore a coordinated solution based on online learning to drive the\noptimization of Spatial Reuse (SR), a method that allows multiple devices to\nperform simultaneous transmissions by controlling interference through Packet\nDetect (PD) adjustment and transmit power control. In particular, we focus on a\nMulti-Agent Multi-Armed Bandit (MA-MAB) setting, where multiple decision-making\nagents concurrently configure SR parameters from coexisting networks by\nleveraging the MAPC framework, and study various algorithms and reward-sharing\nmechanisms. 
We evaluate different MA-MAB implementations using Komondor, a\nwell-adopted Wi-Fi simulator, and demonstrate that AI-native SR enabled by\ncoordinated MABs can improve the network performance over current Wi-Fi\noperation: mean throughput increases by 15%, fairness is improved by increasing\nthe minimum throughput across the network by 210%, while the maximum access\ndelay is kept below 3 ms.\n","authors":["Francesc Wilhelmi","Boris Bellalta","Szymon Szott","Katarzyna Kosek-Szott","Sergio Barrachina-Muñoz"],"pdf_url":"https://arxiv.org/pdf/2412.03076v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03072v1","updated":"2024-12-04T06:49:21Z","published":"2024-12-04T06:49:21Z","title":"Preference-based opponent shaping in differentiable games","summary":" Strategy learning in game environments with multi-agent is a challenging\nproblem. Since each agent's reward is determined by the joint strategy, a\ngreedy learning strategy that aims to maximize its own reward may fall into a\nlocal optimum. Recent studies have proposed the opponent modeling and shaping\nmethods for game environments. These methods enhance the efficiency of strategy\nlearning by modeling the strategies and updating processes of other agents.\nHowever, these methods often rely on simple predictions of opponent strategy\nchanges. Due to the lack of modeling behavioral preferences such as cooperation\nand competition, they are usually applicable only to predefined scenarios and\nlack generalization capabilities. In this paper, we propose a novel\nPreference-based Opponent Shaping (PBOS) method to enhance the strategy\nlearning process by shaping agents' preferences towards cooperation. We\nintroduce the preference parameter, which is incorporated into the agent's loss\nfunction, thus allowing the agent to directly consider the opponent's loss\nfunction when updating the strategy. 
We update the preference parameters\nconcurrently with strategy learning to ensure that agents can adapt to any\ncooperative or competitive game environment. Through a series of experiments,\nwe verify the performance of PBOS algorithm in a variety of differentiable\ngames. The experimental results show that the PBOS algorithm can guide the\nagent to learn the appropriate preference parameters, so as to achieve better\nreward distribution in multiple game environments.\n","authors":["Xinyu Qiao","Yudong Hu","Congying Han","Weiyan Wu","Tiande Guo"],"pdf_url":"https://arxiv.org/pdf/2412.03072v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03069v1","updated":"2024-12-04T06:46:55Z","published":"2024-12-04T06:46:55Z","title":"TokenFlow: Unified Image Tokenizer for Multimodal Understanding and\n Generation","summary":" We present TokenFlow, a novel unified image tokenizer that bridges the\nlong-standing gap between multimodal understanding and generation. Prior\nresearch attempt to employ a single reconstruction-targeted Vector Quantization\n(VQ) encoder for unifying these two tasks. We observe that understanding and\ngeneration require fundamentally different granularities of visual information.\nThis leads to a critical trade-off, particularly compromising performance in\nmultimodal understanding tasks. TokenFlow addresses this challenge through an\ninnovative dual-codebook architecture that decouples semantic and pixel-level\nfeature learning while maintaining their alignment via a shared mapping\nmechanism. This design enables direct access to both high-level semantic\nrepresentations crucial for understanding tasks and fine-grained visual\nfeatures essential for generation through shared indices. 
Our extensive\nexperiments demonstrate TokenFlow's superiority across multiple dimensions.\nLeveraging TokenFlow, we demonstrate for the first time that discrete visual\ninput can surpass LLaVA-1.5 13B in understanding performance, achieving a 7.2\\%\naverage improvement. For image reconstruction, we achieve a strong FID score of\n0.63 at 384*384 resolution. Moreover, TokenFlow establishes state-of-the-art\nperformance in autoregressive image generation with a GenEval score of 0.55 at\n256*256 resolution, achieving comparable results to SDXL.\n","authors":["Liao Qu","Huichao Zhang","Yiheng Liu","Xu Wang","Yi Jiang","Yiming Gao","Hu Ye","Daniel K. Du","Zehuan Yuan","Xinglong Wu"],"pdf_url":"https://arxiv.org/pdf/2412.03069v1.pdf","comment":"https://byteflow-ai.github.io/TokenFlow/"},{"id":"http://arxiv.org/abs/2412.03068v1","updated":"2024-12-04T06:42:55Z","published":"2024-12-04T06:42:55Z","title":"UTSD: Unified Time Series Diffusion Model","summary":" Transformer-based architectures have achieved unprecedented success in time\nseries analysis. However, facing the challenge of across-domain modeling,\nexisting studies utilize statistical prior as prompt engineering fails under\nthe huge distribution shift among various domains. In this paper, a Unified\nTime Series Diffusion (UTSD) model is established for the first time to model\nthe multi-domain probability distribution, utilizing the powerful probability\ndistribution modeling ability of Diffusion. Unlike the autoregressive models\nthat capture the conditional probabilities of the prediction horizon to the\nhistorical sequence, we use a diffusion denoising process to model the mixture\ndistribution of the cross-domain data and generate the prediction sequence for\nthe target domain directly utilizing conditional sampling. 
The proposed UTSD\ncontains three pivotal designs: (1) The condition network captures the\nmulti-scale fluctuation patterns from the observation sequence, which are\nutilized as context representations to guide the denoising network to generate\nthe prediction sequence; (2) Adapter-based fine-tuning strategy, the\nmulti-domain universal representation learned in the pretraining stage is\nutilized for downstream tasks in target domains; (3) The diffusion and\ndenoising process on the actual sequence space, combined with the improved\nclassifier free guidance as the conditional generation strategy, greatly\nimproves the stability and accuracy of the downstream task. We conduct\nextensive experiments on mainstream benchmarks, and the pre-trained UTSD\noutperforms existing foundation models on all data domains, exhibiting superior\nzero-shot generalization ability. After training from scratch, UTSD achieves\ncomparable performance against domain-specific proprietary models. The\nempirical results validate the potential of UTSD as a time series foundational\nmodel.\n","authors":["Xiangkai Ma","Xiaobin Hong","Wenzhong Li","Sanglu Lu"],"pdf_url":"https://arxiv.org/pdf/2412.03068v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.00721v2","updated":"2024-12-04T06:23:40Z","published":"2024-12-01T08:07:01Z","title":"A Comparative Study of LLM-based ASR and Whisper in Low Resource and\n Code Switching Scenario","summary":" Large Language Models (LLMs) have showcased exceptional performance across\ndiverse NLP tasks, and their integration with speech encoder is rapidly\nemerging as a dominant trend in the Automatic Speech Recognition (ASR) field.\nPrevious works mainly concentrated on leveraging LLMs for speech recognition in\nEnglish and Chinese. However, their potential for addressing speech recognition\nchallenges in low resource settings remains underexplored. 
Hence, in this work,\nwe aim to explore the capability of LLMs in low resource ASR and\nMandarin-English code switching ASR. We also evaluate and compare the\nrecognition performance of LLM-based ASR systems against Whisper model.\nExtensive experiments demonstrate that LLM-based ASR yields a relative gain of\n12.8\\% over the Whisper model in low resource ASR while Whisper performs better\nin Mandarin-English code switching ASR. We hope that this study could shed\nlight on ASR for low resource scenarios.\n","authors":["Zheshu Song","Ziyang Ma","Yifan Yang","Jianheng Zhuo","Xie Chen"],"pdf_url":"https://arxiv.org/pdf/2412.00721v2.pdf","comment":"This work hasn't been finished yet"},{"id":"http://arxiv.org/abs/2412.03056v1","updated":"2024-12-04T06:20:51Z","published":"2024-12-04T06:20:51Z","title":"Point-GN: A Non-Parametric Network Using Gaussian Positional Encoding\n for Point Cloud Classification","summary":" This paper introduces Point-GN, a novel non-parametric network for efficient\nand accurate 3D point cloud classification. Unlike conventional deep learning\nmodels that rely on a large number of trainable parameters, Point-GN leverages\nnon-learnable components-specifically, Farthest Point Sampling (FPS), k-Nearest\nNeighbors (k-NN), and Gaussian Positional Encoding (GPE)-to extract both local\nand global geometric features. This design eliminates the need for additional\ntraining while maintaining high performance, making Point-GN particularly\nsuited for real-time, resource-constrained applications. We evaluate Point-GN\non two benchmark datasets, ModelNet40 and ScanObjectNN, achieving\nclassification accuracies of 85.29% and 85.89%, respectively, while\nsignificantly reducing computational complexity. Point-GN outperforms existing\nnon-parametric methods and matches the performance of fully trained models, all\nwith zero learnable parameters. 
Our results demonstrate that Point-GN is a\npromising solution for 3D point cloud classification in practical, real-time\nenvironments.\n","authors":["Marzieh Mohammadi","Amir Salarpour"],"pdf_url":"https://arxiv.org/pdf/2412.03056v1.pdf","comment":"This paper has been accepted for presentation at the IEEE Winter\n Conference on Applications of Computer Vision (WACV) 2025"},{"id":"http://arxiv.org/abs/2412.03051v1","updated":"2024-12-04T06:11:09Z","published":"2024-12-04T06:11:09Z","title":"Less is More: A Stealthy and Efficient Adversarial Attack Method for\n DRL-based Autonomous Driving Policies","summary":" Despite significant advancements in deep reinforcement learning (DRL)-based\nautonomous driving policies, these policies still exhibit vulnerability to\nadversarial attacks. This vulnerability poses a formidable challenge to the\npractical deployment of these policies in autonomous driving. Designing\neffective adversarial attacks is an indispensable prerequisite for enhancing\nthe robustness of these policies. In view of this, we present a novel stealthy\nand efficient adversarial attack method for DRL-based autonomous driving\npolicies. Specifically, we introduce a DRL-based adversary designed to trigger\nsafety violations (e.g., collisions) by injecting adversarial samples at\ncritical moments. We model the attack as a mixed-integer optimization problem\nand formulate it as a Markov decision process. Then, we train the adversary to\nlearn the optimal policy for attacking at critical moments without domain\nknowledge. Furthermore, we introduce attack-related information and a\ntrajectory clipping method to enhance the learning capability of the adversary.\nFinally, we validate our method in an unprotected left-turn scenario across\ndifferent traffic densities. 
The experimental results show that our method\nachieves more than 90% collision rate within three attacks in most cases.\nFurthermore, our method achieves more than 130% improvement in attack\nefficiency compared to the unlimited attack method.\n","authors":["Junchao Fan","Xuyang Lei","Xiaolin Chang","Jelena Mišić","Vojislav B. Mišić"],"pdf_url":"https://arxiv.org/pdf/2412.03051v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.15862v2","updated":"2024-12-04T05:52:03Z","published":"2024-11-24T14:38:59Z","title":"LLMs Do Not Think Step-by-step In Implicit Reasoning","summary":" It has been well-known that Chain-of-Thought can remarkably enhance LLMs'\nperformance on complex tasks. However, because it also introduces slower\ninference speeds and higher computational costs, many researches have attempted\nto use implicit CoT, which does not need LLMs to explicitly generate the\nintermediate steps. But there is still gap between their efficacy and typical\nexplicit CoT methods. This leaves us a doubt that, does implicit CoT really\nequal to explicit CoT? Therefore, in this study, we address this question\nthrough experiments. We probe the information of intermediate steps from the\nmodel's hidden states when it is performing implicit CoT. The results\nsurprisingly indicate that LLMs hardly think about intermediate steps,\nsuggesting they may just rely on experience rather than strict step-by-step\nreasoning. 
Moreover, we find LLMs' implicit reasoning capabilities are\nsusceptible and unstable, reaffirming the necessity of explicit CoT to\neffectively support complex tasks.\n","authors":["Yijiong Yu"],"pdf_url":"https://arxiv.org/pdf/2411.15862v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03039v1","updated":"2024-12-04T05:23:46Z","published":"2024-12-04T05:23:46Z","title":"MRNet: Multifaceted Resilient Networks for Medical Image-to-Image\n Translation","summary":" We propose a Multifaceted Resilient Network(MRNet), a novel architecture\ndeveloped for medical image-to-image translation that outperforms\nstate-of-the-art methods in MRI-to-CT and MRI-to-MRI conversion. MRNet\nleverages the Segment Anything Model (SAM) to exploit frequency-based features\nto build a powerful method for advanced medical image transformation. The\narchitecture extracts comprehensive multiscale features from diverse datasets\nusing a powerful SAM image encoder and performs resolution-aware feature fusion\nthat consistently integrates U-Net encoder outputs with SAM-derived features.\nThis fusion optimizes the traditional U-Net skip connection while leveraging\ntransformer-based contextual analysis. The translation is complemented by an\ninnovative dual-mask configuration incorporating dynamic attention patterns and\na specialized loss function designed to address regional mapping mismatches,\npreserving both the gross anatomy and tissue details. 
Extensive validation\nstudies have shown that MRNet outperforms state-of-the-art architectures,\nparticularly in maintaining anatomical fidelity and minimizing translation\nartifacts.\n","authors":["Hyojeong Lee","Youngwan Jo","Inpyo Hong","Sanghyun Park"],"pdf_url":"https://arxiv.org/pdf/2412.03039v1.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2412.03038v1","updated":"2024-12-04T05:19:34Z","published":"2024-12-04T05:19:34Z","title":"MILLION: A General Multi-Objective Framework with Controllable Risk for\n Portfolio Management","summary":" Portfolio management is an important yet challenging task in AI for FinTech,\nwhich aims to allocate investors' budgets among different assets to balance the\nrisk and return of an investment. In this study, we propose a general\nMulti-objectIve framework with controLLable rIsk for pOrtfolio maNagement\n(MILLION), which consists of two main phases, i.e., return-related maximization\nand risk control. Specifically, in the return-related maximization phase, we\nintroduce two auxiliary objectives, i.e., return rate prediction, and return\nrate ranking, combined with portfolio optimization to remit the overfitting\nproblem and improve the generalization of the trained model to future markets.\nSubsequently, in the risk control phase, we propose two methods, i.e.,\nportfolio interpolation and portfolio improvement, to achieve fine-grained risk\ncontrol and fast risk adaption to a user-specified risk level. For the\nportfolio interpolation method, we theoretically prove that the risk can be\nperfectly controlled if the to-be-set risk level is in a proper interval. 
In\naddition, we also show that the return rate of the adjusted portfolio after\nportfolio interpolation is no less than that of the min-variance optimization,\nas long as the model in the reward maximization phase is effective.\nFurthermore, the portfolio improvement method can achieve greater return rates\nwhile keeping the same risk level compared to portfolio interpolation.\nExtensive experiments are conducted on three real-world datasets. The results\ndemonstrate the effectiveness and efficiency of the proposed framework.\n","authors":["Liwei Deng","Tianfu Wang","Yan Zhao","Kai Zheng"],"pdf_url":"https://arxiv.org/pdf/2412.03038v1.pdf","comment":"accepted by VLDB 2025"},{"id":"http://arxiv.org/abs/2412.02632v2","updated":"2024-12-04T05:02:45Z","published":"2024-12-03T18:01:45Z","title":"Scaling Image Tokenizers with Grouped Spherical Quantization","summary":" Vision tokenizers have gained a lot of attraction due to their scalability\nand compactness; previous works depend on old-school GAN-based hyperparameters,\nbiased comparisons, and a lack of comprehensive analysis of the scaling\nbehaviours. To tackle those issues, we introduce Grouped Spherical Quantization\n(GSQ), featuring spherical codebook initialization and lookup regularization to\nconstrain codebook latent to a spherical surface. Our empirical analysis of\nimage tokenizer training strategies demonstrates that GSQ-GAN achieves superior\nreconstruction quality over state-of-the-art methods with fewer training\niterations, providing a solid foundation for scaling studies. Building on this,\nwe systematically examine the scaling behaviours of GSQ, specifically in latent\ndimensionality, codebook size, and compression ratios, and their impact on\nmodel performance. Our findings reveal distinct behaviours at high and low\nspatial compression levels, underscoring challenges in representing\nhigh-dimensional latent spaces. 
We show that GSQ can restructure\nhigh-dimensional latent into compact, low-dimensional spaces, thus enabling\nefficient scaling with improved quality. As a result, GSQ-GAN achieves a 16x\ndown-sampling with a reconstruction FID (rFID) of 0.50.\n","authors":["Jiangtao Wang","Zhen Qin","Yifan Zhang","Vincent Tao Hu","Björn Ommer","Rania Briq","Stefan Kesselheim"],"pdf_url":"https://arxiv.org/pdf/2412.02632v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03028v1","updated":"2024-12-04T04:45:36Z","published":"2024-12-04T04:45:36Z","title":"Specification Generation for Neural Networks in Systems","summary":" Specifications - precise mathematical representations of correct\ndomain-specific behaviors - are crucial to guarantee the trustworthiness of\ncomputer systems. With the increasing development of neural networks as\ncomputer system components, specifications gain more importance as they can be\nused to regulate the behaviors of these black-box models. Traditionally,\nspecifications are designed by domain experts based on their intuition of\ncorrect behavior. However, this is labor-intensive and hence not a scalable\napproach as computer system applications diversify. We hypothesize that the\ntraditional (aka reference) algorithms that neural networks replace for higher\nperformance can act as effective proxies for correct behaviors of the models,\nwhen available. This is because they have been used and tested for long enough\nto encode several aspects of the trustworthy/correct behaviors in the\nunderlying domain. Driven by our hypothesis, we develop a novel automated\nframework, SpecTRA to generate specifications for neural networks using\nreferences. We formulate specification generation as an optimization problem\nand solve it with observations of reference behaviors. SpecTRA clusters similar\nobservations into compact specifications. 
We present specifications generated\nby SpecTRA for neural networks in adaptive bit rate and congestion control\nalgorithms. Our specifications show evidence of being correct and matching\nintuition. Moreover, we use our specifications to show several unknown\nvulnerabilities of the SOTA models for computer systems.\n","authors":["Isha Chaudhary","Shuyi Lin","Cheng Tan","Gagandeep Singh"],"pdf_url":"https://arxiv.org/pdf/2412.03028v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.01490v3","updated":"2024-12-04T04:44:33Z","published":"2024-12-02T13:41:38Z","title":"Intelligent Spark Agents: A Modular LangGraph Framework for Scalable,\n Visualized, and Enhanced Big Data Machine Learning Workflows","summary":" This paper introduces a visual process modeling tool for AI and machine\nlearning in big data contexts, utilizing the LangGraph framework to construct\nintelligent Spark agents. The tool represents key machine learning stages -\ndata preprocessing, feature engineering, model training, and evaluation - as\nmodular components. Analysts can visually design workflows, which are then\nautomatically translated into optimized Spark code for execution. This approach\nsimplifies the complexity of Apache Spark, reduces the learning curve\nassociated with Scala, and enhances code reusability. The paper discusses the\ntheoretical foundations, key technologies, and evaluates the effectiveness of\nthe proposed solution.\n","authors":["Jialin Wang","Zhihua Duan"],"pdf_url":"https://arxiv.org/pdf/2412.01490v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19600v2","updated":"2024-12-04T04:41:49Z","published":"2024-05-30T01:30:34Z","title":"Rethinking Spectral Augmentation for Contrast-based Graph\n Self-Supervised Learning","summary":" The recent surge in contrast-based graph self-supervised learning has\nprominently featured an intensified exploration of spectral cues. 
Spectral\naugmentation, which involves modifying a graph's spectral properties such as\neigenvalues or eigenvectors, is widely believed to enhance model performance.\nHowever, an intriguing paradox emerges, as methods grounded in seemingly\nconflicting assumptions regarding the spectral domain demonstrate notable\nenhancements in learning performance. Through extensive empirical studies, we\nfind that simple edge perturbations - random edge dropping for node-level and\nrandom edge adding for graph-level self-supervised learning - consistently\nyield comparable or superior performance while being significantly more\ncomputationally efficient. This suggests that the computational overhead of\nsophisticated spectral augmentations may not justify their practical benefits.\nOur theoretical analysis of the InfoNCE loss bounds for shallow GNNs further\nsupports this observation. The proposed insights represent a significant leap\nforward in the field, potentially refining the understanding and implementation\nof graph self-supervised learning.\n","authors":["Xiangru Jian","Xinjian Zhao","Wei Pang","Chaolong Ying","Yimu Wang","Yaoyao Xu","Tianshu Yu"],"pdf_url":"https://arxiv.org/pdf/2405.19600v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.08688v2","updated":"2024-12-04T04:28:41Z","published":"2024-10-11T10:21:42Z","title":"Chain-of-Restoration: Multi-Task Image Restoration Models are Zero-Shot\n Step-by-Step Universal Image Restorers","summary":" Despite previous image restoration (IR) methods have often concentrated on\nisolated degradations, recent research has increasingly focused on addressing\ncomposite degradations involving a complex combination of multiple isolated\ndegradations. However, current IR methods for composite degradations require\nbuilding training data that contain an exponential number of possible\ndegradation combinations, which brings in a significant burden. To alleviate\nthis issue, this paper proposes a new task setting, i.e. 
Universal Image\nRestoration (UIR). Specifically, UIR doesn't require training on all the\ndegradation combinations but only on a set of degradation bases and then\nremoving any degradation that these bases can potentially compose in a\nzero-shot manner. Inspired by the Chain-of-Thought that prompts large language\nmodels (LLMs) to address problems step-by-step, we propose Chain-of-Restoration\n(CoR) mechanism, which instructs models to remove unknown composite\ndegradations step-by-step. By integrating a simple Degradation Discriminator\ninto pre-trained multi-task models, CoR facilitates the process where models\nremove one degradation basis per step, continuing this process until the image\nis fully restored from the unknown composite degradation. Extensive experiments\nshow that CoR can significantly improve model performance in removing composite\ndegradations, achieving comparable or better results than those\nstate-of-the-art (SoTA) methods trained on all degradations.\n","authors":["Jin Cao","Deyu Meng","Xiangyong Cao"],"pdf_url":"https://arxiv.org/pdf/2410.08688v2.pdf","comment":"code: https://github.com/toummHus/Chain-of-Restoration"},{"id":"http://arxiv.org/abs/2412.03021v1","updated":"2024-12-04T04:24:15Z","published":"2024-12-04T04:24:15Z","title":"PEMF-VVTO: Point-Enhanced Video Virtual Try-on via Mask-free Paradigm","summary":" Video Virtual Try-on aims to fluently transfer the garment image to a\nsemantically aligned try-on area in the source person video. Previous methods\nleveraged the inpainting mask to remove the original garment in the source\nvideo, thus achieving accurate garment transfer on simple model videos.\nHowever, when these methods are applied to realistic video data with more\ncomplex scene changes and posture movements, the overly large and incoherent\nagnostic masks will destroy the essential spatial-temporal information of the\noriginal video, thereby inhibiting the fidelity and coherence of the try-on\nvideo. 
To alleviate this problem, %avoid the inherent deficiencies of\nmask-based try-on paradigm, we propose a novel point-enhanced mask-free video\nvirtual try-on framework (PEMF-VVTO). Specifically, we first leverage the\npre-trained mask-based try-on model to construct large-scale paired training\ndata (pseudo-person samples). Training on these mask-free data enables our\nmodel to perceive the original spatial-temporal information while realizing\naccurate garment transfer. Then, based on the pre-acquired sparse frame-cloth\nand frame-frame point alignments, we design the point-enhanced spatial\nattention (PSA) and point-enhanced temporal attention (PTA) to further improve\nthe try-on accuracy and video coherence of the mask-free model. Concretely, PSA\nexplicitly guides the garment transfer to desirable locations through the\nsparse semantic alignments of video frames and cloth. PTA exploits the temporal\nattention on sparse point correspondences to enhance the smoothness of\ngenerated videos. Extensive qualitative and quantitative experiments clearly\nillustrate that our PEMF-VVTO can generate more natural and coherent try-on\nvideos than existing state-of-the-art methods.\n","authors":["Tianyu Chang","Xiaohao Chen. Zhichao Wei","Xuanpu Zhang","Qing-Guo Chen","Weihua Luo","Xun Yang"],"pdf_url":"https://arxiv.org/pdf/2412.03021v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12838v2","updated":"2024-12-04T04:18:32Z","published":"2024-08-23T04:56:36Z","title":"Exploring Machine Learning Models for Lung Cancer Level Classification:\n A comparative ML Approach","summary":" This paper explores machine learning (ML) models for classifying lung cancer\nlevels to improve diagnostic accuracy and prognosis. Through parameter tuning\nand rigorous evaluation, we assess various ML algorithms. Techniques like\nminimum child weight and learning rate monitoring were used to reduce\noverfitting and optimize performance. 
Our findings highlight the robust\nperformance of Deep Neural Network (DNN) models across all phases. Ensemble\nmethods, including voting and bagging, also showed promise in enhancing\npredictive accuracy and robustness. However, Support Vector Machine (SVM)\nmodels with the Sigmoid kernel faced challenges, indicating a need for further\nrefinement. Overall, our study provides insights into ML-based lung cancer\nclassification, emphasizing the importance of parameter tuning to optimize\nmodel performance and improve diagnostic accuracy in oncological care.\n","authors":["Mohsen Asghari Ilani","Saba Moftakhar Tehran","Ashkan Kavei","Hamed Alizadegan"],"pdf_url":"https://arxiv.org/pdf/2408.12838v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03011v1","updated":"2024-12-04T04:02:17Z","published":"2024-12-04T04:02:17Z","title":"Human Multi-View Synthesis from a Single-View Model:Transferred Body and\n Face Representations","summary":" Generating multi-view human images from a single view is a complex and\nsignificant challenge. Although recent advancements in multi-view object\ngeneration have shown impressive results with diffusion models, novel view\nsynthesis for humans remains constrained by the limited availability of 3D\nhuman datasets. Consequently, many existing models struggle to produce\nrealistic human body shapes or capture fine-grained facial details accurately.\nTo address these issues, we propose an innovative framework that leverages\ntransferred body and facial representations for multi-view human synthesis.\nSpecifically, we use a single-view model pretrained on a large-scale human\ndataset to develop a multi-view body representation, aiming to extend the 2D\nknowledge of the single-view model to a multi-view diffusion model.\nAdditionally, to enhance the model's detail restoration capability, we\nintegrate transferred multimodal facial features into our trained human\ndiffusion model. 
Experimental evaluations on benchmark datasets demonstrate\nthat our approach outperforms the current state-of-the-art methods, achieving\nsuperior performance in multi-view human synthesis.\n","authors":["Yu Feng","Shunsi Zhang","Jian Shu","Hanfeng Zhao","Guoliang Pang","Chi Zhang","Hao Wang"],"pdf_url":"https://arxiv.org/pdf/2412.03011v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.00672v4","updated":"2024-12-04T03:55:35Z","published":"2024-02-01T15:33:17Z","title":"Exploring Homogeneous and Heterogeneous Consistent Label Associations\n for Unsupervised Visible-Infrared Person ReID","summary":" Unsupervised visible-infrared person re-identification (USL-VI-ReID)\nendeavors to retrieve pedestrian images of the same identity from different\nmodalities without annotations. While prior work focuses on establishing\ncross-modality pseudo-label associations to bridge the modality-gap, they\nignore maintaining the instance-level homogeneous and heterogeneous consistency\nbetween the feature space and the pseudo-label space, resulting in coarse\nassociations. In response, we introduce a Modality-Unified Label Transfer\n(MULT) module that simultaneously accounts for both homogeneous and\nheterogeneous fine-grained instance-level structures, yielding high-quality\ncross-modality label associations. It models both homogeneous and heterogeneous\naffinities, leveraging them to quantify the inconsistency between the\npseudo-label space and the feature space, subsequently minimizing it. The\nproposed MULT ensures that the generated pseudo-labels maintain alignment\nacross modalities while upholding structural consistency within intra-modality.\nAdditionally, a straightforward plug-and-play Online Cross-memory Label\nRefinement (OCLR) module is proposed to further mitigate the side effects of\nnoisy pseudo-labels while simultaneously aligning different modalities, coupled\nwith an Alternative Modality-Invariant Representation Learning (AMIRL)\nframework. 
Experiments demonstrate that our proposed method outperforms\nexisting state-of-the-art USL-VI-ReID methods, highlighting the superiority of\nour MULT in comparison to other cross-modality association methods. Code is\navailable at https://github.com/FranklinLingfeng/code_for_MULT.\n","authors":["Lingfeng He","De Cheng","Nannan Wang","Xinbo Gao"],"pdf_url":"https://arxiv.org/pdf/2402.00672v4.pdf","comment":"Accepted by IJCV2024"},{"id":"http://arxiv.org/abs/2408.16200v3","updated":"2024-12-04T03:15:44Z","published":"2024-08-29T01:42:38Z","title":"PolarBEVDet: Exploring Polar Representation for Multi-View 3D Object\n Detection in Bird's-Eye-View","summary":" Recently, LSS-based multi-view 3D object detection provides an economical and\ndeployment-friendly solution for autonomous driving. However, all the existing\nLSS-based methods transform multi-view image features into a Cartesian\nBird's-Eye-View(BEV) representation, which does not take into account the\nnon-uniform image information distribution and hardly exploits the view\nsymmetry. In this paper, in order to adapt the image information distribution\nand preserve the view symmetry by regular convolution, we propose to employ the\npolar BEV representation to substitute the Cartesian BEV representation. To\nachieve this, we elaborately tailor three modules: a polar view transformer to\ngenerate the polar BEV representation, a polar temporal fusion module for\nfusing historical polar BEV features and a polar detection head to predict the\npolar-parameterized representation of the object. In addition, we design a 2D\nauxiliary detection head and a spatial attention enhancement module to improve\nthe quality of feature extraction in perspective view and BEV, respectively.\nFinally, we integrate the above improvements into a novel multi-view 3D object\ndetector, PolarBEVDet. Experiments on nuScenes show that PolarBEVDet achieves\nthe superior performance. 
The code is available at\nhttps://github.com/Yzichen/PolarBEVDet.git.(This work has been submitted to the\nIEEE for possible publication. Copyright may be transferred without notice,\nafter which this version may no longer be accessible)\n","authors":["Zichen Yu","Quanli Liu","Wei Wang","Liyong Zhang","Xiaoguang Zhao"],"pdf_url":"https://arxiv.org/pdf/2408.16200v3.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2404.08027v2","updated":"2024-12-04T02:57:03Z","published":"2024-04-11T15:58:12Z","title":"SurvMamba: State Space Model with Multi-grained Multi-modal Interaction\n for Survival Prediction","summary":" Multi-modal learning that combines pathological images with genomic data has\nsignificantly enhanced the accuracy of survival prediction. Nevertheless,\nexisting methods have not fully utilized the inherent hierarchical structure\nwithin both whole slide images (WSIs) and transcriptomic data, from which\nbetter intra-modal representations and inter-modal integration could be\nderived. Moreover, many existing studies attempt to improve multi-modal\nrepresentations through attention mechanisms, which inevitably lead to high\ncomplexity when processing high-dimensional WSIs and transcriptomic data.\nRecently, a structured state space model named Mamba emerged as a promising\napproach for its superior performance in modeling long sequences with low\ncomplexity. In this study, we propose Mamba with multi-grained multi-modal\ninteraction (SurvMamba) for survival prediction. SurvMamba is implemented with\na Hierarchical Interaction Mamba (HIM) module that facilitates efficient\nintra-modal interactions at different granularities, thereby capturing more\ndetailed local features as well as rich global representations. In addition, an\nInteraction Fusion Mamba (IFM) module is used for cascaded inter-modal\ninteractive fusion, yielding more comprehensive features for survival\nprediction. 
Comprehensive evaluations on five TCGA datasets demonstrate that\nSurvMamba outperforms other existing methods in terms of performance and\ncomputational cost.\n","authors":["Ying Chen","Jiajing Xie","Yuxiang Lin","Yuhang Song","Wenxian Yang","Rongshan Yu"],"pdf_url":"https://arxiv.org/pdf/2404.08027v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02980v1","updated":"2024-12-04T02:47:45Z","published":"2024-12-04T02:47:45Z","title":"Surveying the Effects of Quality, Diversity, and Complexity in Synthetic\n Data From Large Language Models","summary":" Synthetic data generation with Large Language Models is a promising paradigm\nfor augmenting natural data over a nearly infinite range of tasks. Given this\nvariety, direct comparisons among synthetic data generation algorithms are\nscarce, making it difficult to understand where improvement comes from and what\nbottlenecks exist. We propose to evaluate algorithms via the makeup of\nsynthetic data generated by each algorithm in terms of data quality, diversity,\nand complexity. We choose these three characteristics for their significance in\nopen-ended processes and the impact each has on the capabilities of downstream\nmodels. We find quality to be essential for in-distribution model\ngeneralization, diversity to be essential for out-of-distribution\ngeneralization, and complexity to be beneficial for both. Further, we emphasize\nthe existence of Quality-Diversity trade-offs in training data and the\ndownstream effects on model performance. We then examine the effect of various\ncomponents in the synthetic data pipeline on each data characteristic. This\nexamination allows us to taxonomize and compare synthetic data generation\nalgorithms through the components they utilize and the resulting effects on\ndata QDC composition. This analysis extends into a discussion on the importance\nof balancing QDC in synthetic data for efficient reinforcement learning and\nself-improvement algorithms. 
Analogous to the QD trade-offs in training data,\noften there exist trade-offs between model output quality and output diversity\nwhich impact the composition of synthetic data. We observe that many models are\ncurrently evaluated and optimized only for output quality, thereby limiting\noutput diversity and the potential for self-improvement. We argue that\nbalancing these trade-offs is essential to the development of future\nself-improvement algorithms and highlight a number of works making progress in\nthis direction.\n","authors":["Alex Havrilla","Andrew Dai","Laura O'Mahony","Koen Oostermeijer","Vera Zisler","Alon Albalak","Fabrizio Milo","Sharath Chandra Raparthy","Kanishk Gandhi","Baber Abbasi","Duy Phung","Maia Iyer","Dakota Mahan","Chase Blagden","Srishti Gureja","Mohammed Hamdy","Wen-Ding Li","Giovanni Paolini","Pawan Sasanka Ammanamanchi","Elliot Meyerson"],"pdf_url":"https://arxiv.org/pdf/2412.02980v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08188v3","updated":"2024-12-04T02:44:21Z","published":"2024-08-15T14:46:13Z","title":"Nl2Hltl2Plan: Scaling Up Natural Language Understanding for Multi-Robots\n Through Hierarchical Temporal Logic Task Representation","summary":" To enable non-experts to specify long-horizon, multi-robot collaborative\ntasks, language models are increasingly used to translate natural language\ncommands into formal specifications. However, because translation can occur in\nmultiple ways, such translations may lack accuracy or lead to inefficient\nmulti-robot planning. Our key insight is that concise hierarchical\nspecifications can simplify planning while remaining straightforward to derive\nfrom human instructions. We propose Nl2Hltl2Plan, a framework that translates\nnatural language commands into hierarchical Linear Temporal Logic (LTL) and\nsolves the corresponding planning problem. The translation involves two steps\nleveraging Large Language Models (LLMs). 
First, an LLM transforms instructions\ninto a Hierarchical Task Tree, capturing logical and temporal relations. Next,\na fine-tuned LLM converts sub-tasks into flat LTL formulas, which are\naggregated into hierarchical specifications, with the lowest level\ncorresponding to ordered robot actions. These specifications are then used with\noff-the-shelf planners. Our Nl2Hltl2Plan demonstrates the potential of LLMs in\nhierarchical reasoning for multi-robot task planning. Evaluations in simulation\nand real-world experiments with human participants show that Nl2Hltl2Plan\noutperforms existing methods, handling more complex instructions while\nachieving higher success rates and lower costs in task allocation and planning.\nAdditional details are available at https://nl2hltl2plan.github.io .\n","authors":["Shaojun Xu","Xusheng Luo","Yutong Huang","Letian Leng","Ruixuan Liu","Changliu Liu"],"pdf_url":"https://arxiv.org/pdf/2408.08188v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02975v1","updated":"2024-12-04T02:37:31Z","published":"2024-12-04T02:37:31Z","title":"Theoretical limitations of multi-layer Transformer","summary":" Transformers, especially the decoder-only variants, are the backbone of most\nmodern large language models; yet we do not have much understanding of their\nexpressive power except for the simple $1$-layer case.\n Due to the difficulty of analyzing multi-layer models, all previous work\nrelies on unproven complexity conjectures to show limitations for multi-layer\nTransformers. In this work, we prove the first $\\textit{unconditional}$ lower\nbound against multi-layer decoder-only transformers. 
For any constant $L$, we\nprove that any $L$-layer decoder-only transformer needs a polynomial model\ndimension ($n^{\\Omega(1)}$) to perform sequential composition of $L$ functions\nover an input of $n$ tokens.\n As a consequence, our results give: (1) the first depth-width trade-off for\nmulti-layer transformers, exhibiting that the $L$-step composition task is\nexponentially harder for $L$-layer models compared to $(L+1)$-layer ones; (2)\nan unconditional separation between encoder and decoder, exhibiting a hard task\nfor decoders that can be solved by an exponentially shallower and smaller\nencoder; (3) a provable advantage of chain-of-thought, exhibiting a task that\nbecomes exponentially easier with chain-of-thought.\n On the technical side, we propose the multi-party $\\textit{autoregressive}$\n$\\textit{communication}$ $\\textit{model}$ that captures the computation of a\ndecoder-only Transformer. We also introduce a new proof technique that finds a\ncertain $\\textit{indistinguishable}$ $\\textit{decomposition}$ of all possible\ninputs iteratively for proving lower bounds in this model. We believe our new\ncommunication model and proof technique will be helpful to further understand\nthe computational power of transformers.\n","authors":["Lijie Chen","Binghui Peng","Hongxun Wu"],"pdf_url":"https://arxiv.org/pdf/2412.02975v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.03937v3","updated":"2024-12-04T02:25:18Z","published":"2023-07-08T09:10:43Z","title":"Inductive Meta-path Learning for Schema-complex Heterogeneous\n Information Networks","summary":" Heterogeneous Information Networks (HINs) are information networks with\nmultiple types of nodes and edges. 
The concept of meta-path, i.e., a sequence\nof entity types and relation types connecting two entities, is proposed to\nprovide the meta-level explainable semantics for various HIN tasks.\nTraditionally, meta-paths are primarily used for schema-simple HINs, e.g.,\nbibliographic networks with only a few entity types, where meta-paths are often\nenumerated with domain knowledge. However, the adoption of meta-paths for\nschema-complex HINs, such as knowledge bases (KBs) with hundreds of entity and\nrelation types, has been limited due to the computational complexity associated\nwith meta-path enumeration. Additionally, effectively assessing meta-paths\nrequires enumerating relevant path instances, which adds further complexity to\nthe meta-path learning process. To address these challenges, we propose\nSchemaWalk, an inductive meta-path learning framework for schema-complex HINs.\nWe represent meta-paths with schema-level representations to support the\nlearning of the scores of meta-paths for varying relations, mitigating the need\nof exhaustive path instance enumeration for each relation. Further, we design a\nreinforcement-learning based path-finding agent, which directly navigates the\nnetwork schema (i.e., schema graph) to learn policies for establishing\nmeta-paths with high coverage and confidence for multiple relations. 
Extensive\nexperiments on real data sets demonstrate the effectiveness of our proposed\nparadigm.\n","authors":["Shixuan Liu","Changjun Fan","Kewei Cheng","Yunfei Wang","Peng Cui","Yizhou Sun","Zhong Liu"],"pdf_url":"https://arxiv.org/pdf/2307.03937v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.01955v2","updated":"2024-12-04T02:25:04Z","published":"2024-12-02T20:31:27Z","title":"The use of large language models to enhance cancer clinical trial\n educational materials","summary":" Cancer clinical trials often face challenges in recruitment and engagement\ndue to a lack of participant-facing informational and educational resources.\nThis study investigated the potential of Large Language Models (LLMs),\nspecifically GPT4, in generating patient-friendly educational content from\nclinical trial informed consent forms. Using data from ClinicalTrials.gov, we\nemployed zero-shot learning for creating trial summaries and one-shot learning\nfor developing multiple-choice questions, evaluating their effectiveness\nthrough patient surveys and crowdsourced annotation. Results showed that\nGPT4-generated summaries were both readable and comprehensive, and may improve\npatients' understanding and interest in clinical trials. The multiple-choice\nquestions demonstrated high accuracy and agreement with crowdsourced\nannotators. For both resource types, hallucinations were identified that\nrequire ongoing human oversight. 
The findings demonstrate the potential of LLMs\n\"out-of-the-box\" to support the generation of clinical trial education\nmaterials with minimal trial-specific engineering, but implementation with a\nhuman-in-the-loop is still needed to avoid misinformation risks.\n","authors":["Mingye Gao","Aman Varshney","Shan Chen","Vikram Goddla","Jack Gallifant","Patrick Doyle","Claire Novack","Maeve Dillon-Martin","Teresia Perkins","Xinrong Correia","Erik Duhaime","Howard Isenstein","Elad Sharon","Lisa Soleymani Lehmann","David Kozono","Brian Anthony","Dmitriy Dligach","Danielle S. Bitterman"],"pdf_url":"https://arxiv.org/pdf/2412.01955v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.02167v3","updated":"2024-12-04T02:08:13Z","published":"2024-03-04T16:13:39Z","title":"EMOVOME: A Dataset for Emotion Recognition in Spontaneous Real-Life\n Speech","summary":" Spontaneous datasets for Speech Emotion Recognition (SER) are scarce and\nfrequently derived from laboratory environments or staged scenarios, such as TV\nshows, limiting their application in real-world contexts. We developed and\npublicly released the Emotional Voice Messages (EMOVOME) dataset, including 999\nvoice messages from real conversations of 100 Spanish speakers on a messaging\napp, labeled in continuous and discrete emotions by expert and non-expert\nannotators. We evaluated speaker-independent SER models using acoustic features\nas baseline and transformer-based models. We compared the results with\nreference datasets including acted and elicited speech, and analyzed the\ninfluence of annotators and gender fairness. The pre-trained\nUniSpeech-SAT-Large model achieved the highest results, 61.64% and 55.57%\nUnweighted Accuracy (UA) for 3-class valence and arousal prediction\nrespectively on EMOVOME, a 10% improvement over baseline models. For the\nemotion categories, 42.58% UA was obtained. EMOVOME performed lower than the\nacted RAVDESS dataset. 
The elicited IEMOCAP dataset also outperformed EMOVOME\nin predicting emotion categories, while similar results were obtained in\nvalence and arousal. EMOVOME outcomes varied with annotator labels, showing\nbetter results and fairness when combining expert and non-expert annotations.\nThis study highlights the gap between controlled and real-life scenarios,\nsupporting further advancements in recognizing genuine emotions.\n","authors":["Lucía Gómez-Zaragozá","Rocío del Amor","María José Castro-Bleda","Valery Naranjo","Mariano Alcañiz Raya","Javier Marín-Morales"],"pdf_url":"https://arxiv.org/pdf/2403.02167v3.pdf","comment":"This article is a merged version of the description of the EMOVOME\n database in arXiv:2402.17496v1 and the speech emotion recognition models in\n arXiv:2403.02167v1. This work has been submitted to the IEEE for possible\n publication"},{"id":"http://arxiv.org/abs/2412.02957v1","updated":"2024-12-04T02:05:55Z","published":"2024-12-04T02:05:55Z","title":"3D Interaction Geometric Pre-training for Molecular Relational Learning","summary":" Molecular Relational Learning (MRL) is a rapidly growing field that focuses\non understanding the interaction dynamics between molecules, which is crucial\nfor applications ranging from catalyst engineering to drug discovery. Despite\nrecent progress, earlier MRL approaches are limited to using only the 2D\ntopological structure of molecules, as obtaining the 3D interaction geometry\nremains prohibitively expensive. This paper introduces a novel 3D geometric\npre-training strategy for MRL (3DMRL) that incorporates a 3D virtual\ninteraction environment, overcoming the limitations of costly traditional\nquantum mechanical calculation methods. 
With the constructed 3D virtual\ninteraction environment, 3DMRL trains 2D MRL model to learn the overall 3D\ngeometric information of molecular interaction through contrastive learning.\nMoreover, fine-grained interaction between molecules is learned through force\nprediction loss, which is crucial in understanding the wide range of molecular\ninteraction processes. Extensive experiments on various tasks using real-world\ndatasets, including out-of-distribution and extrapolation scenarios,\ndemonstrate the effectiveness of 3DMRL, showing up to a 24.93\\% improvement in\nperformance across 40 tasks.\n","authors":["Namkyeong Lee","Yunhak Oh","Heewoong Noh","Gyoung S. Na","Minkai Xu","Hanchen Wang","Tianfan Fu","Chanyoung Park"],"pdf_url":"https://arxiv.org/pdf/2412.02957v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.18822v2","updated":"2024-12-04T01:56:07Z","published":"2024-11-27T23:51:53Z","title":"RelCon: Relative Contrastive Learning for a Motion Foundation Model for\n Wearable Data","summary":" We present RelCon, a novel self-supervised *Rel*ative *Con*trastive learning\napproach that uses a learnable distance measure in combination with a softened\ncontrastive loss for training an motion foundation model from wearable sensors.\nThe learnable distance measure captures motif similarity and domain-specific\nsemantic information such as rotation invariance. The learned distance provides\na measurement of semantic similarity between a pair of accelerometer\ntime-series segments, which is used to measure the distance between an anchor\nand various other sampled candidate segments. The self-supervised model is\ntrained on 1 billion segments from 87,376 participants from a large wearables\ndataset. The model achieves strong performance across multiple downstream\ntasks, encompassing both classification and regression. 
To our knowledge, we\nare the first to show the generalizability of a self-supervised learning model\nwith motion data from wearables across distinct evaluation tasks.\n","authors":["Maxwell A. Xu","Jaya Narain","Gregory Darnell","Haraldur Hallgrimsson","Hyewon Jeong","Darren Forde","Richard Fineman","Karthik J. Raghuram","James M. Rehg","Shirley Ren"],"pdf_url":"https://arxiv.org/pdf/2411.18822v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13546v2","updated":"2024-12-04T01:45:42Z","published":"2024-08-24T10:35:10Z","title":"Synesthesia of Machines (SoM)-Enhanced ISAC Precoding for Vehicular\n Networks with Double Dynamics","summary":" Integrated sensing and communication (ISAC) technology is vital for vehicular\nnetworks, yet the time-varying communication channels and rapid movement of\ntargets present significant challenges for real-time precoding design.\nTraditional optimization-based methods are computationally complex and depend\non perfect prior information, which is often unavailable in double-dynamic\nscenarios. In this paper, we propose a synesthesia of machine (SoM)-enhanced\nprecoding paradigm that leverages modalities such as positioning and channel\ninformation to adapt to these dynamics. Utilizing a deep reinforcement learning\n(DRL) framework, our approach pushes ISAC performance boundaries. We also\nintroduce a parameter-shared actor-critic architecture to accelerate training\nin complex state and action spaces. 
Extensive experiments validate the\nsuperiority of our method over existing approaches.\n","authors":["Zonghui Yang","Shijian Gao","Xiang Cheng","Liuqing Yang"],"pdf_url":"https://arxiv.org/pdf/2408.13546v2.pdf","comment":"Submitted to IEEE for possible publication"},{"id":"http://arxiv.org/abs/2310.11178v3","updated":"2024-12-04T01:35:26Z","published":"2023-10-17T11:53:32Z","title":"FocDepthFormer: Transformer with latent LSTM for Depth Estimation from\n Focal Stack","summary":" Most existing methods for depth estimation from a focal stack of images\nemploy convolutional neural networks (CNNs) using 2D or 3D convolutions over a\nfixed set of images. However, their effectiveness is constrained by the local\nproperties of CNN kernels, which restricts them to process only focal stacks of\nfixed number of images during both training and inference. This limitation\nhampers their ability to generalize to stacks of arbitrary lengths. To overcome\nthese limitations, we present a novel Transformer-based network,\nFocDepthFormer, which integrates a Transformer with an LSTM module and a CNN\ndecoder. The Transformer's self-attention mechanism allows for the learning of\nmore informative spatial features by implicitly performing non-local\ncross-referencing. The LSTM module is designed to integrate representations\nacross image stacks of varying lengths. Additionally, we employ multi-scale\nconvolutional kernels in an early-stage encoder to capture low-level features\nat different degrees of focus/defocus. By incorporating the LSTM,\nFocDepthFormer can be pre-trained on large-scale monocular RGB depth estimation\ndatasets, improving visual pattern learning and reducing reliance on\ndifficult-to-obtain focal stack data. Extensive experiments on diverse focal\nstack benchmark datasets demonstrate that our model outperforms\nstate-of-the-art approaches across multiple evaluation metrics.\n","authors":["Xueyang Kang","Fengze Han","Abdur R. 
Fayjie","Patrick Vandewalle","Kourosh Khoshelham","Dong Gong"],"pdf_url":"https://arxiv.org/pdf/2310.11178v3.pdf","comment":"30 pages, 20 figures, Conference paper"},{"id":"http://arxiv.org/abs/2412.02946v1","updated":"2024-12-04T01:23:57Z","published":"2024-12-04T01:23:57Z","title":"Who Brings the Frisbee: Probing Hidden Hallucination Factors in Large\n Vision-Language Model via Causality Analysis","summary":" Recent advancements in large vision-language models (LVLM) have significantly\nenhanced their ability to comprehend visual inputs alongside natural language.\nHowever, a major challenge in their real-world application is hallucination,\nwhere LVLMs generate non-existent visual elements, eroding user trust. The\nunderlying mechanism driving this multimodal hallucination is poorly\nunderstood. Minimal research has illuminated whether contexts such as sky,\ntree, or grass field involve the LVLM in hallucinating a frisbee. We\nhypothesize that hidden factors, such as objects, contexts, and semantic\nforeground-background structures, induce hallucination. This study proposes a\nnovel causal approach: a hallucination probing system to identify these hidden\nfactors. By analyzing the causality between images, text prompts, and network\nsaliency, we systematically explore interventions to block these factors. Our\nexperimental findings show that a straightforward technique based on our\nanalysis can significantly reduce hallucinations. 
Additionally, our analyses\nindicate the potential to edit network internals to minimize hallucinated\noutputs.\n","authors":["Po-Hsuan Huang","Jeng-Lin Li","Chin-Po Chen","Ming-Ching Chang","Wei-Chao Chen"],"pdf_url":"https://arxiv.org/pdf/2412.02946v1.pdf","comment":"Accepted by WACV2025"},{"id":"http://arxiv.org/abs/2412.02942v1","updated":"2024-12-04T01:20:43Z","published":"2024-12-04T01:20:43Z","title":"STDCformer: A Transformer-Based Model with a Spatial-Temporal Causal\n De-Confounding Strategy for Crowd Flow Prediction","summary":" Existing works typically treat spatial-temporal prediction as the task of\nlearning a function $F$ to transform historical observations to future\nobservations. We further decompose this cross-time transformation into three\nprocesses: (1) Encoding ($E$): learning the intrinsic representation of\nobservations, (2) Cross-Time Mapping ($M$): transforming past representations\ninto future representations, and (3) Decoding ($D$): reconstructing future\nobservations from the future representations. From this perspective,\nspatial-temporal prediction can be viewed as learning $F = E \\cdot M \\cdot D$,\nwhich includes learning the space transformations $\\left\\{{E},{D}\\right\\}$\nbetween the observation space and the hidden representation space, as well as\nthe spatial-temporal mapping $M$ from future states to past states within the\nrepresentation space. This leads to two key questions: \\textbf{Q1: What kind of\nrepresentation space allows for mapping the past to the future? Q2: How to\nachieve map the past to the future within the representation space?} To address\nQ1, we propose a Spatial-Temporal Backdoor Adjustment strategy, which learns a\nSpatial-Temporal De-Confounded (STDC) representation space and estimates the\nde-confounding causal effect of historical data on future data. This causal\nrelationship we captured serves as the foundation for subsequent\nspatial-temporal mapping. 
To address Q2, we design a Spatial-Temporal Embedding\n(STE) that fuses the information of temporal and spatial confounders, capturing\nthe intrinsic spatial-temporal characteristics of the representations.\nAdditionally, we introduce a Cross-Time Attention mechanism, which queries the\nattention between the future and the past to guide spatial-temporal mapping.\n","authors":["Silu He","Peng Shen","Pingzhen Xu","Qinyao Luo","Haifeng Li"],"pdf_url":"https://arxiv.org/pdf/2412.02942v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.12841v2","updated":"2024-12-04T01:20:16Z","published":"2024-08-23T05:15:24Z","title":"COVID-19 Probability Prediction Using Machine Learning: An Infectious\n Approach","summary":" The ongoing COVID-19 pandemic continues to pose significant challenges to\nglobal public health, despite the widespread availability of vaccines. Early\ndetection of the disease remains paramount in curbing its transmission and\nmitigating its impact on public health systems. In response, this study delves\ninto the application of advanced machine learning (ML) techniques for\npredicting COVID-19 infection probability. We conducted a rigorous\ninvestigation into the efficacy of various ML models, including XGBoost, LGBM,\nAdaBoost, Logistic Regression, Decision Tree, RandomForest, CatBoost, KNN, and\nDeep Neural Networks (DNN). Leveraging a dataset comprising 4000 samples, with\n3200 allocated for training and 800 for testing, our experiment offers\ncomprehensive insights into the performance of these models in COVID-19\nprediction. Our findings reveal that Deep Neural Networks (DNN) emerge as the\ntop-performing model, exhibiting superior accuracy and recall metrics. With an\nimpressive accuracy rate of 89%, DNN demonstrates remarkable potential in early\nCOVID-19 detection. 
This underscores the efficacy of deep learning approaches\nin leveraging complex data patterns to identify COVID-19 infections accurately.\nThis study underscores the critical role of machine learning, particularly deep\nlearning methodologies, in augmenting early detection efforts amidst the\nongoing pandemic. The success of DNN in accurately predicting COVID-19\ninfection probability highlights the importance of continued research and\ndevelopment in leveraging advanced technologies to combat infectious diseases.\n","authors":["Mohsen Asghari Ilani","Saba Moftakhar Tehran","Ashkan Kavei","Arian Radmehr"],"pdf_url":"https://arxiv.org/pdf/2408.12841v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02940v1","updated":"2024-12-04T01:13:44Z","published":"2024-12-04T01:13:44Z","title":"SAVER: A Toolbox for Sampling-Based, Probabilistic Verification of\n Neural Networks","summary":" We present a neural network verification toolbox to 1) assess the probability\nof satisfaction of a constraint, and 2) synthesize a set expansion factor to\nachieve the probability of satisfaction. Specifically, the tool box establishes\nwith a user-specified level of confidence whether the output of the neural\nnetwork for a given input distribution is likely to be contained within a given\nset. Should the tool determine that the given set cannot satisfy the likelihood\nconstraint, the tool also implements an approach outlined in this paper to\nalter the constraint set to ensure that the user-defined satisfaction\nprobability is achieved. The toolbox is comprised of sampling-based approaches\nwhich exploit the properties of signed distance function to define set\ncontainment.\n","authors":["Vignesh Sivaramakrishnan","Krishna C. 
Kalagarla","Rosalyn Devonport","Joshua Pilipovsky","Panagiotis Tsiotras","Meeko Oishi"],"pdf_url":"https://arxiv.org/pdf/2412.02940v1.pdf","comment":"7 pages, 8 figures, submitted to the 28th ACM International\n Conference on Hybrid Systems: Computation and Control"},{"id":"http://arxiv.org/abs/2412.02931v1","updated":"2024-12-04T00:53:55Z","published":"2024-12-04T00:53:55Z","title":"Inverse Delayed Reinforcement Learning","summary":" Inverse Reinforcement Learning (IRL) has demonstrated effectiveness in a\nvariety of imitation tasks. In this paper, we introduce an IRL framework\ndesigned to extract rewarding features from expert trajectories affected by\ndelayed disturbances. Instead of relying on direct observations, our approach\nemploys an efficient off-policy adversarial training framework to derive expert\nfeatures and recover optimal policies from augmented delayed observations.\nEmpirical evaluations in the MuJoCo environment under diverse delay settings\nvalidate the effectiveness of our method. Furthermore, we provide a theoretical\nanalysis showing that recovering expert policies from augmented delayed\nobservations outperforms using direct delayed observations.\n","authors":["Simon Sinong Zhan","Qingyuan Wu","Zhian Ruan","Frank Yang","Philip Wang","Yixuan Wang","Ruochen Jiao","Chao Huang","Qi Zhu"],"pdf_url":"https://arxiv.org/pdf/2412.02931v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02929v1","updated":"2024-12-04T00:42:15Z","published":"2024-12-04T00:42:15Z","title":"Panoptic Diffusion Models: co-generation of images and segmentation maps","summary":" Recently, diffusion models have demonstrated impressive capabilities in\ntext-guided and image-conditioned image generation. However, existing diffusion\nmodels cannot simultaneously generate a segmentation map of objects and a\ncorresponding image from the prompt. 
Previous attempts either generate\nsegmentation maps based on the images or provide maps as input conditions to\ncontrol image generation, limiting their functionality to given inputs.\nIncorporating an inherent understanding of the scene layouts can improve the\ncreativity and realism of diffusion models. To address this limitation, we\npresent Panoptic Diffusion Model (PDM), the first model designed to generate\nboth images and panoptic segmentation maps concurrently. PDM bridges the gap\nbetween image and text by constructing segmentation layouts that provide\ndetailed, built-in guidance throughout the generation process. This ensures the\ninclusion of categories mentioned in text prompts and enriches the diversity of\nsegments within the background. We demonstrate the effectiveness of PDM across\ntwo architectures: a unified diffusion transformer and a two-stream transformer\nwith a pretrained backbone. To facilitate co-generation with fewer sampling\nsteps, we incorporate a fast diffusion solver into PDM. Additionally, when\nground-truth maps are available, PDM can function as a text-guided\nimage-to-image generation model. Finally, we propose a novel metric for\nevaluating the quality of generated maps and show that PDM achieves\nstate-of-the-art results in image generation with implicit scene control.\n","authors":["Yinghan Long","Kaushik Roy"],"pdf_url":"https://arxiv.org/pdf/2412.02929v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19094v5","updated":"2024-12-04T00:40:25Z","published":"2024-07-26T21:18:57Z","title":"Wonderful Team: Zero-Shot Physical Task Planning with Visual LLMs","summary":" We introduce Wonderful Team, a multi-agent Vision Large Language Model (VLLM)\nframework for executing high level robotic planning in a zero-shot regime. 
In\nour context, zero-shot high-level planning means that for a novel environment,\nwe provide a VLLM with an image of the robot's surroundings and a task\ndescription, and the VLLM outputs the sequence of actions necessary for the\nrobot to complete the task. Unlike previous methods for high-level visual\nplanning for robotic manipulation, our method uses VLLMs for the entire\nplanning process, enabling a more tightly integrated loop between perception,\ncontrol, and planning. As a result, Wonderful Team's performance on a\nreal-world semantic and physical planning tasks often exceeds methods that rely\non separate vision systems. For example, we see an average 40% success-rate\nimprovement on VimaBench over prior methods such as NLaP, an average 30%\nimprovement over Trajectory Generators on tasks from the Trajectory Generator\npaper including drawing and wiping a plate, and an average 70% improvement over\nTrajectory Generators on a new set of semantic reasoning tasks including\nenvironment re-arrangement with implicit linguistic constraints. We hope these\nresults highlight the rapid improvements of VLLMs in the past year, and\nmotivate the community to consider VLLMs as an option for some high-level\nrobotic planning problems in the future.\n","authors":["Zidan Wang","Rui Shen","Bradly Stadie"],"pdf_url":"https://arxiv.org/pdf/2407.19094v5.pdf","comment":"aka Wonderful Team"},{"id":"http://arxiv.org/abs/2406.11143v2","updated":"2024-12-04T00:18:41Z","published":"2024-06-17T02:11:59Z","title":"Scorecards for Synthetic Medical Data Evaluation and Reporting","summary":" Although interest in synthetic medical data (SMD) for training and testing AI\nmethods is growing, the absence of a standardized framework to evaluate its\nquality and applicability hinders its wider adoption. 
Here, we outline an\nevaluation framework designed to meet the unique requirements of medical\napplications, and introduce SMD Card, which can serve as comprehensive reports\nthat accompany artificially generated datasets. This card provides a\ntransparent and standardized framework for evaluating and reporting the quality\nof synthetic data, which can benefit SMD developers, users, and regulators,\nparticularly for AI models using SMD in regulatory submissions.\n","authors":["Ghada Zamzmi","Adarsh Subbaswamy","Elena Sizikova","Edward Margerrison","Jana Delfino","Aldo Badano"],"pdf_url":"https://arxiv.org/pdf/2406.11143v2.pdf","comment":"7 pages, 2 figures"},{"id":"http://arxiv.org/abs/2412.02919v1","updated":"2024-12-04T00:10:47Z","published":"2024-12-04T00:10:47Z","title":"Higher Order Transformers: Efficient Attention Mechanism for Tensor\n Structured Data","summary":" Transformers are now ubiquitous for sequence modeling tasks, but their\nextension to multi-dimensional data remains a challenge due to the quadratic\ncost of the attention mechanism. In this paper, we propose Higher-Order\nTransformers (HOT), a novel architecture designed to efficiently process data\nwith more than two axes, i.e. higher-order tensors. To address the\ncomputational challenges associated with high-order tensor attention, we\nintroduce a novel Kronecker factorized attention mechanism that reduces the\nattention cost to quadratic in each axis' dimension, rather than quadratic in\nthe total size of the input tensor. To further enhance efficiency, HOT\nleverages kernelized attention, reducing the complexity to linear. This\nstrategy maintains the model's expressiveness while enabling scalable attention\ncomputation. We validate the effectiveness of HOT on two high-dimensional\ntasks, including multivariate time series forecasting, and 3D medical image\nclassification. 
Experimental results demonstrate that HOT achieves competitive\nperformance while significantly improving computational efficiency, showcasing\nits potential for tackling a wide range of complex, multi-dimensional data.\n","authors":["Soroush Omranpour","Guillaume Rabusseau","Reihaneh Rabbany"],"pdf_url":"https://arxiv.org/pdf/2412.02919v1.pdf","comment":null}],"Genomics":[{"id":"http://arxiv.org/abs/2412.03027v1","updated":"2024-12-04T04:45:05Z","published":"2024-12-04T04:45:05Z","title":"Timestamp calibration for time-series single cell RNA-seq expression\n data","summary":" Timestamp automatic annotation (TAA) is a crucial procedure for analyzing\ntime-series ScRNA-seq data, as they unveil dynamic biological developments and\ncell regeneration process. However, current TAA methods heavily rely on manual\ntimestamps, often overlooking their reliability. This oversight can\nsignificantly degrade the performance of timestamp automatic annotation due to\nnoisy timestamps. Nevertheless, the current approach for addressing this issue\ntends to select less critical cleaned samples for timestamp calibration. To\ntackle this challenge, we have developed a novel timestamp calibration model\ncalled ScPace for handling noisy labeled time-series ScRNA-seq data. This\napproach incorporates a latent variable indicator within a base classifier\ninstead of probability sampling to detect noisy samples effectively. To\nvalidate our proposed method, we conducted experiments on both simulated and\nreal time-series ScRNA-seq datasets. Cross-validation experiments with\ndifferent artificial mislabeling rates demonstrate that ScPace outperforms\nprevious approaches. Furthermore, after calibrating the timestamps of the\noriginal time-series ScRNA-seq data using our method, we performed supervised\npseudotime analysis, revealing that ScPace enhances its performance\nsignificantly. 
These findings suggest that ScPace is an effective tool for\ntimestamp calibration by enabling reclassification and deletion of detected\nnoisy labeled samples while maintaining robustness across diverse ranges of\ntime-series ScRNA-seq datasets. The source code is available at\nhttps://github.com/OPUS-Lightphenexx/ScPace.\n","authors":["Xiran Chen","Sha Lin","Xiaofeng Chen","Weikai Li","Yifei Li"],"pdf_url":"https://arxiv.org/pdf/2412.03027v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03005v1","updated":"2024-12-04T03:50:26Z","published":"2024-12-04T03:50:26Z","title":"gghic: A Versatile R Package for Exploring and Visualizing 3D Genome\n Organization","summary":" Motivation: The three-dimensional (3D) organization of the genome plays a\ncritical role in regulating gene expression and maintaining cellular\nhomeostasis. Disruptions in this spatial organization can result in abnormal\nchromatin interactions, contributing to the development of various diseases\nincluding cancer. Advances in chromosome conformation capture technologies,\nsuch as Hi-C, have enabled researchers to study genome architecture at high\nresolution. However, the efficient visualization and interpretation of these\ncomplex datasets remain a major challenge, particularly when integrating\ngenomic annotations and inter-chromosomal interactions.\n Results: We present gghic, an R package that extends the ggplot2 framework to\nenable intuitive and customizable visualization of genomic interaction data.\ngghic introduces novel layers for generating triangular heatmaps of chromatin\ninteractions and annotating them with features such as chromatin loops,\ntopologically associated domains (TADs), gene/transcript models, and data\ntracks (e.g., ChIP-seq signals). 
The package supports data from multiple\nchromosomes, facilitating the exploration of inter-chromosomal interactions.\nBuilt to integrate seamlessly with the R/Bioconductor ecosystem, gghic is\ncompatible with widely used genomic data formats, including HiCExperiment and\nGInteractions objects. We demonstrate the utility of gghic by replicating a\npublished figure showing a translocation event in T-cell acute lymphoblastic\nleukemia (T-ALL), highlighting its ability to integrate genomic annotations and\ngenerate publication-quality figures.\n Availability and implementation: The R package can be accessed at\nhttps://github.com/jasonwong-lab/gghic and is distributed under the GNU General\nPublic License version 3.0.\n","authors":["Minghao Jiang","Duohui Jing","Jason W. H. Wong"],"pdf_url":"https://arxiv.org/pdf/2412.03005v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03614v1","updated":"2024-12-04T14:07:11Z","published":"2024-12-04T14:07:11Z","title":"Deep Learning in Single-Cell and Spatial Transcriptomics Data Analysis:\n Advances and Challenges from a Data Science Perspective","summary":" The development of single-cell and spatial transcriptomics has revolutionized\nour capacity to investigate cellular properties, functions, and interactions in\nboth cellular and spatial contexts. However, the analysis of single-cell and\nspatial omics data remains challenging. First, single-cell sequencing data are\nhigh-dimensional and sparse, often contaminated by noise and uncertainty,\nobscuring the underlying biological signals. Second, these data often encompass\nmultiple modalities, including gene expression, epigenetic modifications, and\nspatial locations. Integrating these diverse data modalities is crucial for\nenhancing prediction accuracy and biological interpretability. Third, while the\nscale of single-cell sequencing has expanded to millions of cells, high-quality\nannotated datasets are still limited. 
Fourth, the complex correlations of\nbiological tissues make it difficult to accurately reconstruct cellular states\nand spatial contexts. Traditional feature engineering-based analysis methods\nstruggle to deal with the various challenges presented by intricate biological\nnetworks. Deep learning has emerged as a powerful tool capable of handling\nhigh-dimensional complex data and automatically identifying meaningful\npatterns, offering significant promise in addressing these challenges. This\nreview systematically analyzes these challenges and discusses related deep\nlearning approaches. Moreover, we have curated 21 datasets from 9 benchmarks,\nencompassing 58 computational methods, and evaluated their performance on the\nrespective modeling tasks. Finally, we highlight three areas for future\ndevelopment from a technical, dataset, and application perspective. This work\nwill serve as a valuable resource for understanding how deep learning can be\neffectively utilized in single-cell and spatial transcriptomics analyses, while\ninspiring novel approaches to address emerging challenges.\n","authors":["Shuang Ge","Shuqing Sun","Huan Xu","Qiang Cheng","Zhixiang Ren"],"pdf_url":"https://arxiv.org/pdf/2412.03614v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2406.09400v2","updated":"2024-12-04T18:59:56Z","published":"2024-06-13T17:59:29Z","title":"Yo'LLaVA: Your Personalized Language and Vision Assistant","summary":" Large Multimodal Models (LMMs) have shown remarkable capabilities across a\nvariety of tasks (e.g., image captioning, visual question answering). While\nbroad, their knowledge remains generic (e.g., recognizing a dog), and they are\nunable to handle personalized subjects (e.g., recognizing a user's pet dog).\nHuman reasoning, in contrast, typically operates within the context of specific\nsubjects in our surroundings. 
For example, one might ask, \"What should I buy\nfor my dog's birthday?\"; as opposed to a generic inquiry about \"What should I\nbuy for a dog's birthday?\". Similarly, when looking at a friend's image, the\ninterest lies in seeing their activities (e.g., \"my friend is holding a cat\"),\nrather than merely observing generic human actions (e.g., \"a man is holding a\ncat\"). In this paper, we introduce the novel task of personalizing LMMs, so\nthat they can have conversations about a specific subject. We propose Yo'LLaVA,\nwhich learns to embed a personalized subject into a set of latent tokens given\na handful of example images of the subject. Our qualitative and quantitative\nanalyses reveal that Yo'LLaVA can learn the concept more efficiently using\nfewer tokens and more effectively encode the visual attributes compared to\nstrong prompting baselines (e.g., LLaVA).\n","authors":["Thao Nguyen","Haotian Liu","Yuheng Li","Mu Cai","Utkarsh Ojha","Yong Jae Lee"],"pdf_url":"https://arxiv.org/pdf/2406.09400v2.pdf","comment":"NeurIPS 2024; Project page: https://thaoshibe.github.io/YoLLaVA"},{"id":"http://arxiv.org/abs/2412.03572v1","updated":"2024-12-04T18:59:45Z","published":"2024-12-04T18:59:45Z","title":"Navigation World Models","summary":" Navigation is a fundamental skill of agents with visual-motor capabilities.\nWe introduce a Navigation World Model (NWM), a controllable video generation\nmodel that predicts future visual observations based on past observations and\nnavigation actions. To capture complex environment dynamics, NWM employs a\nConditional Diffusion Transformer (CDiT), trained on a diverse collection of\negocentric videos of both human and robotic agents, and scaled up to 1 billion\nparameters. In familiar environments, NWM can plan navigation trajectories by\nsimulating them and evaluating whether they achieve the desired goal. Unlike\nsupervised navigation policies with fixed behavior, NWM can dynamically\nincorporate constraints during planning. 
Experiments demonstrate its\neffectiveness in planning trajectories from scratch or by ranking trajectories\nsampled from an external policy. Furthermore, NWM leverages its learned visual\npriors to imagine trajectories in unfamiliar environments from a single input\nimage, making it a flexible and powerful tool for next-generation navigation\nsystems.\n","authors":["Amir Bar","Gaoyue Zhou","Danny Tran","Trevor Darrell","Yann LeCun"],"pdf_url":"https://arxiv.org/pdf/2412.03572v1.pdf","comment":"project page: https://www.amirbar.net/nwm/"},{"id":"http://arxiv.org/abs/2412.03556v1","updated":"2024-12-04T18:51:32Z","published":"2024-12-04T18:51:32Z","title":"Best-of-N Jailbreaking","summary":" We introduce Best-of-N (BoN) Jailbreaking, a simple black-box algorithm that\njailbreaks frontier AI systems across modalities. BoN Jailbreaking works by\nrepeatedly sampling variations of a prompt with a combination of augmentations\n- such as random shuffling or capitalization for textual prompts - until a\nharmful response is elicited. We find that BoN Jailbreaking achieves high\nattack success rates (ASRs) on closed-source language models, such as 89% on\nGPT-4o and 78% on Claude 3.5 Sonnet when sampling 10,000 augmented prompts.\nFurther, it is similarly effective at circumventing state-of-the-art\nopen-source defenses like circuit breakers. BoN also seamlessly extends to\nother modalities: it jailbreaks vision language models (VLMs) such as GPT-4o\nand audio language models (ALMs) like Gemini 1.5 Pro, using modality-specific\naugmentations. BoN reliably improves when we sample more augmented prompts.\nAcross all modalities, ASR, as a function of the number of samples (N),\nempirically follows power-law-like behavior for many orders of magnitude. BoN\nJailbreaking can also be composed with other black-box algorithms for even more\neffective attacks - combining BoN with an optimized prefix attack achieves up\nto a 35% increase in ASR. 
Overall, our work indicates that, despite their\ncapability, language models are sensitive to seemingly innocuous changes to\ninputs, which attackers can exploit across modalities.\n","authors":["John Hughes","Sara Price","Aengus Lynch","Rylan Schaeffer","Fazl Barez","Sanmi Koyejo","Henry Sleight","Erik Jones","Ethan Perez","Mrinank Sharma"],"pdf_url":"https://arxiv.org/pdf/2412.03556v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.15957v2","updated":"2024-12-04T18:48:43Z","published":"2024-02-25T02:36:03Z","title":"DynaMITE-RL: A Dynamic Model for Improved Temporal Meta-Reinforcement\n Learning","summary":" We introduce DynaMITE-RL, a meta-reinforcement learning (meta-RL) approach to\napproximate inference in environments where the latent state evolves at varying\nrates. We model episode sessions - parts of the episode where the latent state\nis fixed - and propose three key modifications to existing meta-RL methods:\nconsistency of latent information within sessions, session masking, and prior\nlatent conditioning. We demonstrate the importance of these modifications in\nvarious domains, ranging from discrete Gridworld environments to\ncontinuous-control and simulated robot assistive tasks, demonstrating that\nDynaMITE-RL significantly outperforms state-of-the-art baselines in sample\nefficiency and inference returns.\n","authors":["Anthony Liang","Guy Tennenholtz","Chih-wei Hsu","Yinlam Chow","Erdem Bıyık","Craig Boutilier"],"pdf_url":"https://arxiv.org/pdf/2402.15957v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10182v3","updated":"2024-12-04T18:48:28Z","published":"2024-03-15T10:38:48Z","title":"Fast and reliable uncertainty quantification with neural network\n ensembles for industrial image classification","summary":" Image classification with neural networks (NNs) is widely used in industrial\nprocesses, situations where the model likely encounters unknown objects during\ndeployment, i.e., out-of-distribution (OOD) data. 
Worryingly, NNs tend to make\nconfident yet incorrect predictions when confronted with OOD data. To increase\nthe models' reliability, they should quantify the uncertainty in their own\npredictions, communicating when the output should (not) be trusted. Deep\nensembles, composed of multiple independent NNs, have been shown to perform\nstrongly but are computationally expensive. Recent research has proposed more\nefficient NN ensembles, namely the snapshot, batch, and multi-input\nmulti-output ensemble. This study investigates the predictive and uncertainty\nperformance of efficient NN ensembles in the context of image classification\nfor industrial processes. It is the first to provide a comprehensive comparison\nand it proposes a novel Diversity Quality metric to quantify the ensembles'\nperformance on the in-distribution and OOD sets in one single metric. The\nresults highlight the batch ensemble as a cost-effective and competitive\nalternative to the deep ensemble. It matches the deep ensemble in both\nuncertainty and accuracy while exhibiting considerable savings in training\ntime, test time, and memory storage.\n","authors":["Arthur Thuy","Dries F. Benoit"],"pdf_url":"https://arxiv.org/pdf/2403.10182v3.pdf","comment":"Submitted to Annals of Operations Research"},{"id":"http://arxiv.org/abs/2412.03548v1","updated":"2024-12-04T18:45:35Z","published":"2024-12-04T18:45:35Z","title":"Perception Tokens Enhance Visual Reasoning in Multimodal Language Models","summary":" Multimodal language models (MLMs) still face challenges in fundamental visual\nperception tasks where specialized models excel. Tasks requiring reasoning\nabout 3D structures benefit from depth estimation, and reasoning about 2D\nobject instances benefits from object detection. Yet, MLMs can not produce\nintermediate depth or boxes to reason over. 
Finetuning MLMs on relevant data\ndoesn't generalize well and outsourcing computation to specialized vision tools\nis too compute-intensive and memory-inefficient. To address this, we introduce\nPerception Tokens, intrinsic image representations designed to assist reasoning\ntasks where language is insufficient. Perception tokens act as auxiliary\nreasoning tokens, akin to chain-of-thought prompts in language models. For\nexample, in a depth-related task, an MLM augmented with perception tokens can\nreason by generating a depth map as tokens, enabling it to solve the problem\neffectively. We propose AURORA, a training method that augments MLMs with\nperception tokens for improved reasoning over visual inputs. AURORA leverages a\nVQVAE to transform intermediate image representations, such as depth maps into\na tokenized format and bounding box tokens, which is then used in a multi-task\ntraining framework. AURORA achieves notable improvements across counting\nbenchmarks: +10.8% on BLINK, +11.3% on CVBench, and +8.3% on SEED-Bench,\noutperforming finetuning approaches in generalization across datasets. It also\nimproves on relative depth: over +6% on BLINK. With perception tokens, AURORA\nexpands the scope of MLMs beyond language-based reasoning, paving the way for\nmore effective visual reasoning capabilities.\n","authors":["Mahtab Bigverdi","Zelun Luo","Cheng-Yu Hsieh","Ethan Shen","Dongping Chen","Linda G. Shapiro","Ranjay Krishna"],"pdf_url":"https://arxiv.org/pdf/2412.03548v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.19379v2","updated":"2024-12-04T18:40:24Z","published":"2024-11-28T21:10:20Z","title":"Marconi: Prefix Caching for the Era of Hybrid LLMs","summary":" Hybrid models that combine the language modeling capabilities of Attention\nlayers with the efficiency of Recurrent layers (e.g., State Space Models) have\ngained traction in practically supporting long contexts in Large Language Model\nserving. 
Yet, the unique properties of these models complicate the usage of\ncomplementary efficiency optimizations such as prefix caching that skip\nredundant computations across requests. Most notably, their use of in-place\nstate updates for recurrent layers precludes rolling back cache entries for\npartial sequence overlaps, and instead mandates only exact-match cache hits;\nthe effect is a deluge of (large) cache entries per sequence, most of which\nyield minimal reuse opportunities. We present Marconi, the first system that\nsupports efficient prefix caching with Hybrid LLMs. Key to Marconi are its\nnovel admission and eviction policies that more judiciously assess potential\ncache entries based not only on recency, but also on (1) forecasts of their\nreuse likelihood across a taxonomy of different hit scenarios, and (2) the\ncompute savings that hits deliver relative to memory footprints. Across diverse\nworkloads and Hybrid models, Marconi achieves up to 34.4$\\times$ higher token\nhit rates (71.1% or 617 ms lower TTFT) compared to state-of-the-art prefix\ncaching systems.\n","authors":["Rui Pan","Zhuang Wang","Zhen Jia","Can Karakus","Luca Zancato","Tri Dao","Yida Wang","Ravi Netravali"],"pdf_url":"https://arxiv.org/pdf/2411.19379v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03539v1","updated":"2024-12-04T18:36:09Z","published":"2024-12-04T18:36:09Z","title":"NODE-AdvGAN: Improving the transferability and perceptual similarity of\n adversarial examples by dynamic-system-driven adversarial generative model","summary":" Understanding adversarial examples is crucial for improving the model's\nrobustness, as they introduce imperceptible perturbations that deceive models.\nEffective adversarial examples, therefore, offer the potential to train more\nrobust models by removing their singularities. 
We propose NODE-AdvGAN, a novel\napproach that treats adversarial generation as a continuous process and employs\na Neural Ordinary Differential Equation (NODE) for simulating the dynamics of\nthe generator. By mimicking the iterative nature of traditional gradient-based\nmethods, NODE-AdvGAN generates smoother and more precise perturbations that\npreserve high perceptual similarity when added to benign images. We also\npropose a new training strategy, NODE-AdvGAN-T, which enhances transferability\nin black-box attacks by effectively tuning noise parameters during training.\nExperiments demonstrate that NODE-AdvGAN and NODE-AdvGAN-T generate more\neffective adversarial examples that achieve higher attack success rates while\npreserving better perceptual quality than traditional GAN-based methods.\n","authors":["Xinheng Xie","Yue Wu","Cuiyu He"],"pdf_url":"https://arxiv.org/pdf/2412.03539v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03537v1","updated":"2024-12-04T18:32:42Z","published":"2024-12-04T18:32:42Z","title":"Evaluating Gender Bias Transfer between Pre-trained and Prompt-Adapted\n Language Models","summary":" Large language models (LLMs) are increasingly being adapted to achieve\ntask-specificity for deployment in real-world decision systems. Several\nprevious works have investigated the bias transfer hypothesis (BTH) by studying\nthe effect of the fine-tuning adaptation strategy on model fairness to find\nthat fairness in pre-trained masked language models have limited effect on the\nfairness of models when adapted using fine-tuning. 
In this work, we expand the\nstudy of BTH to causal models under prompt adaptations, as prompting is an\naccessible, and compute-efficient way to deploy models in real-world systems.\nIn contrast to previous works, we establish that intrinsic biases in\npre-trained Mistral, Falcon and Llama models are strongly correlated (rho >=\n0.94) with biases when the same models are zero- and few-shot prompted, using a\npronoun co-reference resolution task. Further, we find that bias transfer\nremains strongly correlated even when LLMs are specifically prompted to exhibit\nfair or biased behavior (rho >= 0.92), and few-shot length and stereotypical\ncomposition are varied (rho >= 0.97). Our findings highlight the importance of\nensuring fairness in pre-trained LLMs, especially when they are later used to\nperform downstream tasks via prompt adaptation.\n","authors":["Natalie Mackraz","Nivedha Sivakumar","Samira Khorshidi","Krishna Patel","Barry-John Theobald","Luca Zappella","Nicholas Apostoloff"],"pdf_url":"https://arxiv.org/pdf/2412.03537v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03531v1","updated":"2024-12-04T18:26:13Z","published":"2024-12-04T18:26:13Z","title":"A Review on Scientific Knowledge Extraction using Large Language Models\n in Biomedical Sciences","summary":" The rapid advancement of large language models (LLMs) has opened new\nboundaries in the extraction and synthesis of medical knowledge, particularly\nwithin evidence synthesis. This paper reviews the state-of-the-art applications\nof LLMs in the biomedical domain, exploring their effectiveness in automating\ncomplex tasks such as evidence synthesis and data extraction from a biomedical\ncorpus of documents. While LLMs demonstrate remarkable potential, significant\nchallenges remain, including issues related to hallucinations, contextual\nunderstanding, and the ability to generalize across diverse medical tasks. 
We\nhighlight critical gaps in the current research literature, particularly the\nneed for unified benchmarks to standardize evaluations and ensure reliability\nin real-world applications. In addition, we propose directions for future\nresearch, emphasizing the integration of state-of-the-art techniques such as\nretrieval-augmented generation (RAG) to enhance LLM performance in evidence\nsynthesis. By addressing these challenges and utilizing the strengths of LLMs,\nwe aim to improve access to medical literature and facilitate meaningful\ndiscoveries in healthcare.\n","authors":["Gabriel Lino Garcia","João Renato Ribeiro Manesco","Pedro Henrique Paiola","Lucas Miranda","Maria Paola de Salvo","João Paulo Papa"],"pdf_url":"https://arxiv.org/pdf/2412.03531v1.pdf","comment":"9 pages, 1 table, 1 figure, conference paper"},{"id":"http://arxiv.org/abs/2403.12712v3","updated":"2024-12-04T18:18:47Z","published":"2024-03-19T13:19:41Z","title":"Instance-Warp: Saliency Guided Image Warping for Unsupervised Domain\n Adaptation","summary":" Driving is challenging in conditions like night, rain, and snow. Lack of good\nlabeled datasets has hampered progress in scene understanding under such\nconditions. Unsupervised Domain Adaptation (UDA) using large labeled clear-day\ndatasets is a promising research direction in such cases. However, many UDA\nmethods are trained with dominant scene backgrounds (e.g., roads, sky,\nsidewalks) that appear dramatically different across domains. As a result, they\nstruggle to learn effective features of smaller and often sparse foreground\nobjects (e.g., people, vehicles, signs).\n In this work, we improve UDA training by applying in-place image warping to\nfocus on salient objects. We design instance-level saliency guidance to\nadaptively oversample object regions and undersample background areas, which\nreduces adverse effects from background context and enhances backbone feature\nlearning. 
Our approach improves adaptation across geographies, lighting, and\nweather conditions, and is agnostic to the task (segmentation, detection),\ndomain adaptation algorithm, saliency guidance, and underlying model\narchitecture. Result highlights include +6.1 mAP50 for BDD100K Clear\n$\\rightarrow$ DENSE Foggy, +3.7 mAP50 for BDD100K Day $\\rightarrow$ Night, +3.0\nmAP50 for BDD100K Clear $\\rightarrow$ Rainy, and +6.3 mIoU for Cityscapes\n$\\rightarrow$ ACDC. Besides, Our method adds minimal training memory and no\nadditional inference latency. Code is available at\nhttps://github.com/ShenZheng2000/Instance-Warp\n","authors":["Shen Zheng","Anurag Ghosh","Srinivasa G. Narasimhan"],"pdf_url":"https://arxiv.org/pdf/2403.12712v3.pdf","comment":"WACV 2025 Accepted Paper"},{"id":"http://arxiv.org/abs/2412.03527v1","updated":"2024-12-04T18:15:41Z","published":"2024-12-04T18:15:41Z","title":"FANAL -- Financial Activity News Alerting Language Modeling Framework","summary":" In the rapidly evolving financial sector, the accurate and timely\ninterpretation of market news is essential for stakeholders needing to navigate\nunpredictable events. This paper introduces FANAL (Financial Activity News\nAlerting Language Modeling Framework), a specialized BERT-based framework\nengineered for real-time financial event detection and analysis, categorizing\nnews into twelve distinct financial categories. FANAL leverages silver-labeled\ndata processed through XGBoost and employs advanced fine-tuning techniques,\nalongside ORBERT (Odds Ratio BERT), a novel variant of BERT fine-tuned with\nORPO (Odds Ratio Preference Optimization) for superior class-wise probability\ncalibration and alignment with financial event relevance. We evaluate FANAL's\nperformance against leading large language models, including GPT-4o, Llama-3.1\n8B, and Phi-3, demonstrating its superior accuracy and cost efficiency. 
This\nframework sets a new standard for financial intelligence and responsiveness,\nsignificantly outstripping existing models in both performance and\naffordability.\n","authors":["Urjitkumar Patel","Fang-Chun Yeh","Chinmay Gondhalekar","Hari Nalluri"],"pdf_url":"https://arxiv.org/pdf/2412.03527v1.pdf","comment":"Accepted for the IEEE International Workshop on Large Language Models\n for Finance, 2024. This is a preprint version"},{"id":"http://arxiv.org/abs/2407.08152v2","updated":"2024-12-04T17:56:57Z","published":"2024-07-11T03:10:27Z","title":"Privacy-Preserving Data Deduplication for Enhancing Federated Learning\n of Language Models (Extended Version)","summary":" Deduplication is a vital preprocessing step that enhances machine learning\nmodel performance and saves training time and energy. However, enhancing\nfederated learning through deduplication poses challenges, especially regarding\nscalability and potential privacy violations if deduplication involves sharing\nall clients' data. In this paper, we address the problem of deduplication in a\nfederated setup by introducing a pioneering protocol, Efficient\nPrivacy-Preserving Multi-Party Deduplication (EP-MPD). It efficiently removes\nduplicates from multiple clients' datasets without compromising data privacy.\nEP-MPD is constructed in a modular fashion, utilizing two novel variants of the\nPrivate Set Intersection protocol. Our extensive experiments demonstrate the\nsignificant benefits of deduplication in federated learning of large language\nmodels. For instance, we observe up to 19.62\\% improvement in perplexity and up\nto 27.95\\% reduction in running time while varying the duplication level\nbetween 10\\% and 30\\%. 
EP-MPD effectively balances privacy and performance in\nfederated learning, making it a valuable solution for large-scale applications.\n","authors":["Aydin Abadi","Vishnu Asutosh Dasu","Sumanta Sarkar"],"pdf_url":"https://arxiv.org/pdf/2407.08152v2.pdf","comment":"Accepted at the Network and Distributed Systems Security (NDSS)\n Symposium, 2025"},{"id":"http://arxiv.org/abs/2412.03513v1","updated":"2024-12-04T17:56:49Z","published":"2024-12-04T17:56:49Z","title":"KKLIP: Knowledge Distillation Exploiting K-means Clustering for\n Language-Image Pre-Training","summary":" Recently, CLIP has emerged as a valuable model for aligning image and text\ninformation in multi-modal scenarios. However, researchers have observed\nlimitations in the ability of CLIP's text and image encoders to extract\ndetailed knowledge from caption-image pairs. In response, this paper introduces\nKKLIP, a novel approach designed to enhance the quality of CLIP by\nincorporating a new knowledge distillation (KD) method derived from Llama 2.\nOur method comprises three objectives: Text Embedding Distillation, Concept\nLearning, and Contrastive Learning. Firstly, Text Embedding Distillation\ninvolves training the KKLIP text encoder to emulate the teacher model, Llama 2.\nSecondly, Concept Learning assigns a soft concept label to each caption-image\npair through offline k-means clustering of text information from Llama 2,\nallowing KKLIP to learn from these soft concept labels. Finally, Contrastive\nLearning harmonizes text and image embeddings. 
Our experimental results\ndemonstrate that KKLIP enhances the quality of both text and image encoders.\n","authors":["Kuei-Chun Kao"],"pdf_url":"https://arxiv.org/pdf/2412.03513v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03506v1","updated":"2024-12-04T17:48:38Z","published":"2024-12-04T17:48:38Z","title":"Self-test loss functions for learning weak-form operators and gradient\n flows","summary":" The construction of loss functions presents a major challenge in data-driven\nmodeling involving weak-form operators in PDEs and gradient flows, particularly\ndue to the need to select test functions appropriately. We address this\nchallenge by introducing self-test loss functions, which employ test functions\nthat depend on the unknown parameters, specifically for cases where the\noperator depends linearly on the unknowns. The proposed self-test loss function\nconserves energy for gradient flows and coincides with the expected\nlog-likelihood ratio for stochastic differential equations. Importantly, it is\nquadratic, facilitating theoretical analysis of identifiability and\nwell-posedness of the inverse problem, while also leading to efficient\nparametric or nonparametric regression algorithms. It is computationally\nsimple, requiring only low-order derivatives or even being entirely\nderivative-free, and numerical experiments demonstrate its robustness against\nnoisy and discrete data.\n","authors":["Yuan Gao","Quanjun Lang","Fei Lu"],"pdf_url":"https://arxiv.org/pdf/2412.03506v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11376v2","updated":"2024-12-04T17:45:14Z","published":"2024-09-17T17:23:44Z","title":"Towards Time Series Reasoning with LLMs","summary":" Multi-modal large language models (MLLMs) have enabled numerous advances in\nunderstanding and reasoning in domains like vision, but we have not yet seen\nthis broad success for time-series. 
Although prior works on time-series MLLMs\nhave shown promising performance in time-series forecasting, very few works\nshow how an LLM could be used for time-series reasoning in natural language. We\npropose a novel multi-modal time-series LLM approach that learns generalizable\ninformation across various domains with powerful zero-shot performance. First,\nwe train a lightweight time-series encoder on top of an LLM to directly extract\ntime-series information. Then, we fine-tune our model with chain-of-thought\naugmented time-series tasks to encourage the model to generate reasoning paths.\nWe show that our model learns a latent representation that reflects specific\ntime-series features (e.g. slope, frequency), as well as outperforming GPT-4o\non a set of zero-shot reasoning tasks on a variety of domains.\n","authors":["Winnie Chow","Lauren Gardiner","Haraldur T. Hallgrímsson","Maxwell A. Xu","Shirley You Ren"],"pdf_url":"https://arxiv.org/pdf/2409.11376v2.pdf","comment":"Oral Presentation at 2024 NeurIPS Workshop on Time Series in the Age\n of Large Models"},{"id":"http://arxiv.org/abs/2412.03498v1","updated":"2024-12-04T17:39:55Z","published":"2024-12-04T17:39:55Z","title":"A Bidirectional Siamese Recurrent Neural Network for Accurate Gait\n Recognition Using Body Landmarks","summary":" Gait recognition is a significant biometric technique for person\nidentification, particularly in scenarios where other physiological biometrics\nare impractical or ineffective. In this paper, we address the challenges\nassociated with gait recognition and present a novel approach to improve its\naccuracy and reliability. The proposed method leverages advanced techniques,\nincluding sequential gait landmarks obtained through the Mediapipe pose\nestimation model, Procrustes analysis for alignment, and a Siamese\nbiGRU-dualStack Neural Network architecture for capturing temporal\ndependencies. 
Extensive experiments were conducted on large-scale cross-view\ndatasets to demonstrate the effectiveness of the approach, achieving high\nrecognition accuracy compared to other models. The model demonstrated\naccuracies of 95.7%, 94.44%, 87.71%, and 86.6% on CASIA-B, SZU RGB-D, OU-MVLP,\nand Gait3D datasets respectively. The results highlight the potential\napplications of the proposed method in various practical domains, indicating\nits significant contribution to the field of gait recognition.\n","authors":["Proma Hossain Progga","Md. Jobayer Rahman","Swapnil Biswas","Md. Shakil Ahmed","Arif Reza Anwary","Swakkhar Shatabda"],"pdf_url":"https://arxiv.org/pdf/2412.03498v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03497v1","updated":"2024-12-04T17:39:01Z","published":"2024-12-04T17:39:01Z","title":"Soft Checksums to Flag Untrustworthy Machine Learning Surrogate\n Predictions and Application to Atomic Physics Simulations","summary":" Trained neural networks (NN) are attractive as surrogate models to replace\ncostly calculations in physical simulations, but are often unknowingly applied\nto states not adequately represented in the training dataset. We present the\nnovel technique of soft checksums for scientific machine learning, a\ngeneral-purpose method to differentiate between trustworthy predictions with\nsmall errors on in-distribution (ID) data points, and untrustworthy predictions\nwith large errors on out-of-distribution (OOD) data points. By adding a check\nnode to the existing output layer, we train the model to learn the chosen\nchecksum function encoded within the NN predictions and show that violations of\nthis function correlate with high prediction errors. As the checksum function\ndepends only on the NN predictions, we can calculate the checksum error for any\nprediction with a single forward pass, incurring negligible time and memory\ncosts. 
Additionally, we find that incorporating the checksum function into the\nloss function and exposing the NN to OOD data points during the training\nprocess improves separation between ID and OOD predictions. By applying soft\nchecksums to a physically complex and high-dimensional non-local thermodynamic\nequilibrium atomic physics dataset, we show that a well-chosen threshold\nchecksum error can effectively separate ID and OOD predictions.\n","authors":["Casey Lauer","Robert C. Blake","Jonathan B. Freund"],"pdf_url":"https://arxiv.org/pdf/2412.03497v1.pdf","comment":"8 pages, 3 figures"},{"id":"http://arxiv.org/abs/2205.11359v3","updated":"2024-12-04T17:37:38Z","published":"2022-05-23T14:45:34Z","title":"Towards Size-Independent Generalization Bounds for Deep Operator Nets","summary":" In recent times machine learning methods have made significant advances in\nbecoming a useful tool for analyzing physical systems. A particularly active\narea in this theme has been \"physics-informed machine learning\" which focuses\non using neural nets for numerically solving differential equations. In this\nwork, we aim to advance the theory of measuring out-of-sample error while\ntraining DeepONets - which is among the most versatile ways to solve P.D.E\nsystems in one-shot. Firstly, for a class of DeepONets, we prove a bound on\ntheir Rademacher complexity which does not explicitly scale with the width of\nthe nets involved. Secondly, we use this to show how the Huber loss can be\nchosen so that for these DeepONet classes generalization error bounds can be\nobtained that have no explicit dependence on the size of the nets. 
The\neffective capacity measure for DeepONets that we thus derive is also shown to\ncorrelate with the behavior of generalization error in experiments.\n","authors":["Pulkit Gopalani","Sayar Karmakar","Dibyakanti Kumar","Anirbit Mukherjee"],"pdf_url":"https://arxiv.org/pdf/2205.11359v3.pdf","comment":"33 pages, 7 figures; Published in TMLR, December 2024"},{"id":"http://arxiv.org/abs/2412.03496v1","updated":"2024-12-04T17:36:47Z","published":"2024-12-04T17:36:47Z","title":"TRENDy: Temporal Regression of Effective Non-linear Dynamics","summary":" Spatiotemporal dynamics pervade the natural sciences, from the morphogen\ndynamics underlying patterning in animal pigmentation to the protein waves\ncontrolling cell division. A central challenge lies in understanding how\ncontrollable parameters induce qualitative changes in system behavior called\nbifurcations. This endeavor is made particularly difficult in realistic\nsettings where governing partial differential equations (PDEs) are unknown and\ndata is limited and noisy. To address this challenge, we propose TRENDy\n(Temporal Regression of Effective Nonlinear Dynamics), an equation-free\napproach to learning low-dimensional, predictive models of spatiotemporal\ndynamics. Following classical work in spatial coarse-graining, TRENDy first\nmaps input data to a low-dimensional space of effective dynamics via a cascade\nof multiscale filtering operations. Our key insight is the recognition that\nthese effective dynamics can be fit by a neural ordinary differential equation\n(NODE) having the same parameter space as the input PDE. The preceding\nfiltering operations strongly regularize the phase space of the NODE, making\nTRENDy significantly more robust to noise compared to existing methods. We\ntrain TRENDy to predict the effective dynamics of synthetic and real data\nrepresenting dynamics from across the physical and life sciences. 
We then\ndemonstrate how our framework can automatically locate both Turing and Hopf\nbifurcations in unseen regions of parameter space. We finally apply our method\nto the analysis of spatial patterning of the ocellated lizard through\ndevelopment. We found that TRENDy's effective state not only accurately\npredicts spatial changes over time but also identifies distinct pattern\nfeatures unique to different anatomical regions, highlighting the potential\ninfluence of surface geometry on reaction-diffusion mechanisms and their role\nin driving spatially varying pattern dynamics.\n","authors":["Matthew Ricci","Guy Pelc","Zoe Piran","Noa Moriel","Mor Nitzan"],"pdf_url":"https://arxiv.org/pdf/2412.03496v1.pdf","comment":"10 pages, 14 appendix pages, 5 figures, 7 appendix figures"},{"id":"http://arxiv.org/abs/2412.03491v1","updated":"2024-12-04T17:29:10Z","published":"2024-12-04T17:29:10Z","title":"Beyond algorithm hyperparameters: on preprocessing hyperparameters and\n associated pitfalls in machine learning applications","summary":" Adequately generating and evaluating prediction models based on supervised\nmachine learning (ML) is often challenging, especially for less experienced\nusers in applied research areas. Special attention is required in settings\nwhere the model generation process involves hyperparameter tuning, i.e.\ndata-driven optimization of different types of hyperparameters to improve the\npredictive performance of the resulting model. Discussions about tuning\ntypically focus on the hyperparameters of the ML algorithm (e.g., the minimum\nnumber of observations in each terminal node for a tree-based algorithm). In\nthis context, it is often neglected that hyperparameters also exist for the\npreprocessing steps that are applied to the data before it is provided to the\nalgorithm (e.g., how to handle missing feature values in the data). 
As a\nconsequence, users experimenting with different preprocessing options to\nimprove model performance may be unaware that this constitutes a form of\nhyperparameter tuning - albeit informal and unsystematic - and thus may fail to\nreport or account for this optimization. To illuminate this issue, this paper\nreviews and empirically illustrates different procedures for generating and\nevaluating prediction models, explicitly addressing the different ways\nalgorithm and preprocessing hyperparameters are typically handled by applied ML\nusers. By highlighting potential pitfalls, especially those that may lead to\nexaggerated performance claims, this review aims to further improve the quality\nof predictive modeling in ML applications.\n","authors":["Christina Sauer","Anne-Laure Boulesteix","Luzia Hanßum","Farina Hodiamont","Claudia Bausewein","Theresa Ullmann"],"pdf_url":"https://arxiv.org/pdf/2412.03491v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03487v1","updated":"2024-12-04T17:24:35Z","published":"2024-12-04T17:24:35Z","title":"Flow Matching with General Discrete Paths: A Kinetic-Optimal Perspective","summary":" The design space of discrete-space diffusion or flow generative models are\nsignificantly less well-understood than their continuous-space counterparts,\nwith many works focusing only on a simple masked construction. In this work, we\naim to take a holistic approach to the construction of discrete generative\nmodels based on continuous-time Markov chains, and for the first time, allow\nthe use of arbitrary discrete probability paths, or colloquially, corruption\nprocesses. Through the lens of optimizing the symmetric kinetic energy, we\npropose velocity formulas that can be applied to any given probability path,\ncompletely decoupling the probability and velocity, and giving the user the\nfreedom to specify any desirable probability path based on expert knowledge\nspecific to the data domain. 
Furthermore, we find that a special construction\nof mixture probability paths optimizes the symmetric kinetic energy for the\ndiscrete case. We empirically validate the usefulness of this new design space\nacross multiple modalities: text generation, inorganic material generation, and\nimage generation. We find that we can outperform the mask construction even in\ntext with kinetic-optimal mixture paths, while we can make use of\ndomain-specific constructions of the probability path over the visual domain.\n","authors":["Neta Shaul","Itai Gat","Marton Havasi","Daniel Severo","Anuroop Sriram","Peter Holderrieth","Brian Karrer","Yaron Lipman","Ricky T. Q. Chen"],"pdf_url":"https://arxiv.org/pdf/2412.03487v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03486v1","updated":"2024-12-04T17:23:35Z","published":"2024-12-04T17:23:35Z","title":"Tight PAC-Bayesian Risk Certificates for Contrastive Learning","summary":" Contrastive representation learning is a modern paradigm for learning\nrepresentations of unlabeled data via augmentations -- precisely, contrastive\nmodels learn to embed semantically similar pairs of samples (positive pairs)\ncloser than independently drawn samples (negative samples). In spite of its\nempirical success and widespread use in foundation models, statistical theory\nfor contrastive learning remains less explored. Recent works have developed\ngeneralization error bounds for contrastive losses, but the resulting risk\ncertificates are either vacuous (certificates based on Rademacher complexity or\n$f$-divergence) or require strong assumptions about samples that are\nunreasonable in practice. The present paper develops non-vacuous PAC-Bayesian\nrisk certificates for contrastive representation learning, considering the\npractical considerations of the popular SimCLR framework. 
Notably, we take into\naccount that SimCLR reuses positive pairs of augmented data as negative samples\nfor other data, thereby inducing strong dependence and making classical PAC or\nPAC-Bayesian bounds inapplicable. We further refine existing bounds on the\ndownstream classification loss by incorporating SimCLR-specific factors,\nincluding data augmentation and temperature scaling, and derive risk\ncertificates for the contrastive zero-one risk. The resulting bounds for\ncontrastive loss and downstream prediction are much tighter than those of\nprevious risk certificates, as demonstrated by experiments on CIFAR-10.\n","authors":["Anna van Elst","Debarghya Ghoshdastidar"],"pdf_url":"https://arxiv.org/pdf/2412.03486v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03483v1","updated":"2024-12-04T17:20:01Z","published":"2024-12-04T17:20:01Z","title":"Convolutional Neural Networks and Mixture of Experts for Intrusion\n Detection in 5G Networks and beyond","summary":" The advent of 6G/NextG networks comes along with a series of benefits,\nincluding extreme capacity, reliability, and efficiency. However, these\nnetworks may become vulnerable to new security threats. Therefore, 6G/NextG\nnetworks must be equipped with advanced Artificial Intelligence algorithms, in\norder to evade these attacks. Existing studies on the intrusion detection task\nrely on the train of shallow machine learning classifiers, including Logistic\nRegression, Decision Trees, and so on, yielding suboptimal performance. Others\nare based on deep neural networks consisting of static components, which are\nnot conditional on the input. This limits their representation power and\nefficiency. To resolve these issues, we present the first study integrating\nMixture of Experts (MoE) for identifying malicious traffic. 
Specifically, we\nuse network traffic data and convert the 1D array of features into a 2D matrix.\nNext, we pass this matrix through convolutional neural network (CNN) layers\nfollowed by batch normalization and max pooling layers. After obtaining the\nrepresentation vector via the CNN layers, a sparsely gated MoE layer is used.\nThis layer consists of a set of experts (dense layers) and a router, where the\nrouter assigns weights to the output of each expert. Sparsity is achieved by\nchoosing the most relevant experts of the total ones. Finally, we perform a\nseries of ablation experiments to prove the effectiveness of our proposed\nmodel. Experiments are conducted on the 5G-NIDD dataset, a network intrusion\ndetection dataset generated from a real 5G test network. Results show that our\nintroduced approach reaches weighted F1-score up to 99.95% achieving comparable\nperformance to existing approaches. Findings also show that our proposed model\nachieves multiple advantages over state-of-the-art approaches.\n","authors":["Loukas Ilias","George Doukas","Vangelis Lamprou","Christos Ntanos","Dimitris Askounis"],"pdf_url":"https://arxiv.org/pdf/2412.03483v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.04919v2","updated":"2024-12-04T17:18:05Z","published":"2024-05-08T09:41:25Z","title":"Fast Computation of Leave-One-Out Cross-Validation for $k$-NN Regression","summary":" We describe a fast computation method for leave-one-out cross-validation\n(LOOCV) for $k$-nearest neighbours ($k$-NN) regression. We show that, under a\ntie-breaking condition for nearest neighbours, the LOOCV estimate of the mean\nsquare error for $k$-NN regression is identical to the mean square error of\n$(k+1)$-NN regression evaluated on the training data, multiplied by the scaling\nfactor $(k+1)^2/k^2$. 
Therefore, to compute the LOOCV score, one only needs to\nfit $(k+1)$-NN regression only once, and does not need to repeat\ntraining-validation of $k$-NN regression for the number of training data.\nNumerical experiments confirm the validity of the fast computation method.\n","authors":["Motonobu Kanagawa"],"pdf_url":"https://arxiv.org/pdf/2405.04919v2.pdf","comment":"To appear in Transactions of Machine Learning Research (TMLR)"},{"id":"http://arxiv.org/abs/2411.11976v2","updated":"2024-12-04T17:13:22Z","published":"2024-11-18T19:06:01Z","title":"Coverage-Constrained Human-AI Cooperation with Multiple Experts","summary":" Human-AI cooperative classification (HAI-CC) approaches aim to develop hybrid\nintelligent systems that enhance decision-making in various high-stakes\nreal-world scenarios by leveraging both human expertise and AI capabilities.\nCurrent HAI-CC methods primarily focus on learning-to-defer (L2D), where\ndecisions are deferred to human experts, and learning-to-complement (L2C),\nwhere AI and human experts make predictions cooperatively. However, a notable\nresearch gap remains in effectively exploring both L2D and L2C under diverse\nexpert knowledge to improve decision-making, particularly when constrained by\nthe cooperation cost required to achieve a target probability for AI-only\nselection (i.e., coverage). In this paper, we address this research gap by\nproposing the Coverage-constrained Learning to Defer and Complement with\nSpecific Experts (CL2DC) method. CL2DC makes final decisions through either AI\nprediction alone or by deferring to or complementing a specific expert,\ndepending on the input data. Furthermore, we propose a coverage-constrained\noptimisation to control the cooperation cost, ensuring it approximates a target\nprobability for AI-only selection. This approach enables an effective\nassessment of system performance within a specified budget. 
Also, CL2DC is\ndesigned to address scenarios where training sets contain multiple noisy-label\nannotations without any clean-label references. Comprehensive evaluations on\nboth synthetic and real-world datasets demonstrate that CL2DC achieves superior\nperformance compared to state-of-the-art HAI-CC methods.\n","authors":["Zheng Zhang","Cuong Nguyen","Kevin Wells","Thanh-Toan Do","David Rosewarne","Gustavo Carneiro"],"pdf_url":"https://arxiv.org/pdf/2411.11976v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.08511v5","updated":"2024-12-04T17:10:06Z","published":"2024-10-11T04:23:56Z","title":"Distributionally robust self-supervised learning for tabular data","summary":" Machine learning (ML) models trained using Empirical Risk Minimization (ERM)\noften exhibit systematic errors on specific subpopulations of tabular data,\nknown as error slices. Learning robust representation in presence of error\nslices is challenging, especially in self-supervised settings during the\nfeature reconstruction phase, due to high cardinality features and the\ncomplexity of constructing error sets. Traditional robust representation\nlearning methods are largely focused on improving worst group performance in\nsupervised setting in computer vision, leaving a gap in approaches tailored for\ntabular data. We address this gap by developing a framework to learn robust\nrepresentation in tabular data during self-supervised pre-training. Our\napproach utilizes an encoder-decoder model trained with Masked Language\nModeling (MLM) loss to learn robust latent representations. This paper applies\nthe Just Train Twice (JTT) and Deep Feature Reweighting (DFR) methods during\nthe pre-training phase for tabular data. These methods fine-tune the ERM\npre-trained model by up-weighting error-prone samples or creating balanced\ndatasets for specific categorical features. 
This results in specialized models\nfor each feature, which are then used in an ensemble approach to enhance\ndownstream classification performance. This methodology improves robustness\nacross slices, thus enhancing overall generalization performance. Extensive\nexperiments across various datasets demonstrate the efficacy of our approach.\nThe code is available:\n\\url{https://github.com/amazon-science/distributionally-robust-self-supervised-learning-for-tabular-data}.\n","authors":["Shantanu Ghosh","Tiankang Xie","Mikhail Kuznetsov"],"pdf_url":"https://arxiv.org/pdf/2410.08511v5.pdf","comment":"TRL Workshop@NeurIPS2024"},{"id":"http://arxiv.org/abs/2410.13928v2","updated":"2024-12-04T17:03:13Z","published":"2024-10-17T17:56:01Z","title":"Automatically Interpreting Millions of Features in Large Language Models","summary":" While the activations of neurons in deep neural networks usually do not have\na simple human-understandable interpretation, sparse autoencoders (SAEs) can be\nused to transform these activations into a higher-dimensional latent space\nwhich may be more easily interpretable. However, these SAEs can have millions\nof distinct latent features, making it infeasible for humans to manually\ninterpret each one. In this work, we build an open-source automated pipeline to\ngenerate and evaluate natural language explanations for SAE features using\nLLMs. We test our framework on SAEs of varying sizes, activation functions, and\nlosses, trained on two different open-weight LLMs. We introduce five new\ntechniques to score the quality of explanations that are cheaper to run than\nthe previous state of the art. One of these techniques, intervention scoring,\nevaluates the interpretability of the effects of intervening on a feature,\nwhich we find explains features that are not recalled by existing methods. 
We\npropose guidelines for generating better explanations that remain valid for a\nbroader set of activating contexts, and discuss pitfalls with existing scoring\ntechniques. We use our explanations to measure the semantic similarity of\nindependently trained SAEs, and find that SAEs trained on nearby layers of the\nresidual stream are highly similar. Our large-scale analysis confirms that SAE\nlatents are indeed much more interpretable than neurons, even when neurons are\nsparsified using top-$k$ postprocessing. Our code is available at\nhttps://github.com/EleutherAI/sae-auto-interp, and our explanations are\navailable at\nhttps://huggingface.co/datasets/EleutherAI/auto_interp_explanations.\n","authors":["Gonçalo Paulo","Alex Mallen","Caden Juang","Nora Belrose"],"pdf_url":"https://arxiv.org/pdf/2410.13928v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.08026v2","updated":"2024-12-04T16:59:38Z","published":"2024-10-10T15:23:21Z","title":"Generalization Bounds and Model Complexity for Kolmogorov-Arnold\n Networks","summary":" Kolmogorov-Arnold Network (KAN) is a network structure recently proposed by\nLiu et al. (2024) that offers improved interpretability and a more parsimonious\ndesign in many science-oriented tasks compared to multi-layer perceptrons. This\nwork provides a rigorous theoretical analysis of KAN by establishing\ngeneralization bounds for KAN equipped with activation functions that are\neither represented by linear combinations of basis functions or lying in a\nlow-rank Reproducing Kernel Hilbert Space (RKHS). In the first case, the\ngeneralization bound accommodates various choices of basis functions in forming\nthe activation functions in each layer of KAN and is adapted to different\noperator norms at each layer. 
For a particular choice of operator norms, the\nbound scales with the $l_1$ norm of the coefficient matrices and the Lipschitz\nconstants for the activation functions, and it has no dependence on\ncombinatorial parameters (e.g., number of nodes) outside of logarithmic\nfactors. Moreover, our result does not require the boundedness assumption on\nthe loss function and, hence, is applicable to a general class of\nregression-type loss functions. In the low-rank case, the generalization bound\nscales polynomially with the underlying ranks as well as the Lipschitz\nconstants of the activation functions in each layer. These bounds are\nempirically investigated for KANs trained with stochastic gradient descent on\nsimulated and real data sets. The numerical results demonstrate the practical\nrelevance of these bounds.\n","authors":["Xianyang Zhang","Huijuan Zhou"],"pdf_url":"https://arxiv.org/pdf/2410.08026v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03471v1","updated":"2024-12-04T16:59:37Z","published":"2024-12-04T16:59:37Z","title":"Cluster Specific Representation Learning","summary":" Representation learning aims to extract meaningful lower-dimensional\nembeddings from data, known as representations. Despite its widespread\napplication, there is no established definition of a ``good'' representation.\nTypically, the representation quality is evaluated based on its performance in\ndownstream tasks such as clustering, de-noising, etc. However, this\ntask-specific approach has a limitation where a representation that performs\nwell for one task may not necessarily be effective for another. This highlights\nthe need for a more agnostic formulation, which is the focus of our work. We\npropose a downstream-agnostic formulation: when inherent clusters exist in the\ndata, the representations should be specific to each cluster. Under this idea,\nwe develop a meta-algorithm that jointly learns cluster-specific\nrepresentations and cluster assignments. 
As our approach is easy to integrate\nwith any representation learning framework, we demonstrate its effectiveness in\nvarious setups, including Autoencoders, Variational Autoencoders, Contrastive\nlearning models, and Restricted Boltzmann Machines. We qualitatively compare\nour cluster-specific embeddings to standard embeddings and downstream tasks\nsuch as de-noising and clustering. While our method slightly increases runtime\nand parameters compared to the standard model, the experiments clearly show\nthat it extracts the inherent cluster structures in the data, resulting in\nimproved performance in relevant applications.\n","authors":["Mahalakshmi Sabanayagam","Omar Al-Dabooni","Pascal Esser"],"pdf_url":"https://arxiv.org/pdf/2412.03471v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03465v1","updated":"2024-12-04T16:54:58Z","published":"2024-12-04T16:54:58Z","title":"YT-30M: A multi-lingual multi-category dataset of YouTube comments","summary":" This paper introduces two large-scale multilingual comment datasets, YT-30M\n(and YT-100K) from YouTube. The analysis in this paper is performed on a\nsmaller sample (YT-100K) of YT-30M. Both the datasets: YT-30M (full) and\nYT-100K (randomly selected 100K sample from YT-30M) are publicly released for\nfurther research. YT-30M (YT-100K) contains 32236173 (108694) comments posted\nby YouTube channel that belong to YouTube categories. 
Each comment is\nassociated with a video ID, comment ID, commentor name, commentor channel ID,\ncomment text, upvotes, original channel ID and category of the YouTube channel\n(e.g., 'News & Politics', 'Science & Technology', etc.).\n","authors":["Hridoy Sankar Dutta"],"pdf_url":"https://arxiv.org/pdf/2412.03465v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03464v1","updated":"2024-12-04T16:52:44Z","published":"2024-12-04T16:52:44Z","title":"Validity and efficiency of the conformal CUSUM procedure","summary":" In this paper we study the validity and efficiency of a conformal version of\nthe CUSUM procedure for change detection both experimentally and theoretically.\n","authors":["Vladimir Vovk","Ilia Nouretdinov","Alex Gammerman"],"pdf_url":"https://arxiv.org/pdf/2412.03464v1.pdf","comment":"19 pages, 7 figures"},{"id":"http://arxiv.org/abs/2412.03442v1","updated":"2024-12-04T16:30:35Z","published":"2024-12-04T16:30:35Z","title":"State Frequency Estimation for Anomaly Detection","summary":" Many works have studied the efficacy of state machines for detecting\nanomalies within NetFlows. These works typically learn a model from unlabeled\ndata and compute anomaly scores for arbitrary traces based on their likelihood\nof occurrence or how well they fit within the model. However, these methods do\nnot dynamically adapt their scores based on the traces seen at test time. This\nbecomes a problem when an adversary produces seemingly common traces in their\nattack, causing the model to miss the detection by assigning low anomaly\nscores. We propose SEQUENT, a new approach that uses the state visit frequency\nto adapt its scoring for anomaly detection dynamically. SEQUENT subsequently\nuses the scores to generate root causes for anomalies. These allow the grouping\nof alarms and simplify the analysis of anomalies. 
Our evaluation of SEQUENT on\nthree NetFlow datasets indicates that our approach outperforms existing\nmethods, demonstrating its effectiveness in detecting anomalies.\n","authors":["Clinton Cao","Agathe Blaise","Annibale Panichella","Sicco Verwer"],"pdf_url":"https://arxiv.org/pdf/2412.03442v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2412.03441v1","updated":"2024-12-04T16:30:03Z","published":"2024-12-04T16:30:03Z","title":"PBP: Post-training Backdoor Purification for Malware Classifiers","summary":" In recent years, the rise of machine learning (ML) in cybersecurity has\nbrought new challenges, including the increasing threat of backdoor poisoning\nattacks on ML malware classifiers. For instance, adversaries could inject\nmalicious samples into public malware repositories, contaminating the training\ndata and potentially misclassifying malware by the ML model. Current\ncountermeasures predominantly focus on detecting poisoned samples by leveraging\ndisagreements within the outputs of a diverse set of ensemble models on\ntraining data points. However, these methods are not suitable for scenarios\nwhere Machine Learning-as-a-Service (MLaaS) is used or when users aim to remove\nbackdoors from a model after it has been trained. Addressing this scenario, we\nintroduce PBP, a post-training defense for malware classifiers that mitigates\nvarious types of backdoor embeddings without assuming any specific backdoor\nembedding mechanism. Our method exploits the influence of backdoor attacks on\nthe activation distribution of neural networks, independent of the\ntrigger-embedding method. In the presence of a backdoor attack, the activation\ndistribution of each layer is distorted into a mixture of distributions. By\nregulating the statistics of the batch normalization layers, we can guide a\nbackdoored model to perform similarly to a clean one. 
Our method demonstrates\nsubstantial advantages over several state-of-the-art methods, as evidenced by\nexperiments on two datasets, two types of backdoor methods, and various attack\nconfigurations. Notably, our approach requires only a small portion of the\ntraining data -- only 1\\% -- to purify the backdoor and reduce the attack\nsuccess rate from 100\\% to almost 0\\%, a 100-fold improvement over the baseline\nmethods. Our code is available at\n\\url{https://github.com/judydnguyen/pbp-backdoor-purification-official}.\n","authors":["Dung Thuy Nguyen","Ngoc N. Tran","Taylor T. Johnson","Kevin Leach"],"pdf_url":"https://arxiv.org/pdf/2412.03441v1.pdf","comment":"Accepted at NDSS 2025"},{"id":"http://arxiv.org/abs/2412.03430v1","updated":"2024-12-04T16:19:47Z","published":"2024-12-04T16:19:47Z","title":"SINGER: Vivid Audio-driven Singing Video Generation with Multi-scale\n Spectral Diffusion Model","summary":" Recent advancements in generative models have significantly enhanced talking\nface video generation, yet singing video generation remains underexplored. The\ndifferences between human talking and singing limit the performance of existing\ntalking face video generation models when applied to singing. The fundamental\ndifferences between talking and singing-specifically in audio characteristics\nand behavioral expressions-limit the effectiveness of existing models. We\nobserve that the differences between singing and talking audios manifest in\nterms of frequency and amplitude. To address this, we have designed a\nmulti-scale spectral module to help the model learn singing patterns in the\nspectral domain. Additionally, we develop a spectral-filtering module that aids\nthe model in learning the human behaviors associated with singing audio. These\ntwo modules are integrated into the diffusion model to enhance singing video\ngeneration performance, resulting in our proposed model, SINGER. 
Furthermore,\nthe lack of high-quality real-world singing face videos has hindered the\ndevelopment of the singing video generation community. To address this gap, we\nhave collected an in-the-wild audio-visual singing dataset to facilitate\nresearch in this area. Our experiments demonstrate that SINGER is capable of\ngenerating vivid singing videos and outperforms state-of-the-art methods in\nboth objective and subjective evaluations.\n","authors":["Yan Li","Ziya Zhou","Zhiqiang Wang","Wei Xue","Wenhan Luo","Yike Guo"],"pdf_url":"https://arxiv.org/pdf/2412.03430v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03427v1","updated":"2024-12-04T16:17:09Z","published":"2024-12-04T16:17:09Z","title":"Assessing Foundation Models' Transferability to Physiological Signals in\n Precision Medicine","summary":" The success of precision medicine requires computational models that can\neffectively process and interpret diverse physiological signals across\nheterogeneous patient populations. While foundation models have demonstrated\nremarkable transfer capabilities across various domains, their effectiveness in\nhandling individual-specific physiological signals - crucial for precision\nmedicine - remains largely unexplored. This work introduces a systematic\npipeline for rapidly and efficiently evaluating foundation models' transfer\ncapabilities in medical contexts. Our pipeline employs a three-stage approach.\nFirst, it leverages physiological simulation software to generate diverse,\nclinically relevant scenarios, particularly focusing on data-scarce medical\nconditions. This simulation-based approach enables both targeted capability\nassessment and subsequent model fine-tuning. Second, the pipeline projects\nthese simulated signals through the foundation model to obtain embeddings,\nwhich are then evaluated using linear methods. 
This evaluation quantifies the\nmodel's ability to capture three critical aspects: physiological feature\nindependence, temporal dynamics preservation, and medical scenario\ndifferentiation. Finally, the pipeline validates these representations through\nspecific downstream medical tasks. Initial testing of our pipeline on the\nMoirai time series foundation model revealed significant limitations in\nphysiological signal processing, including feature entanglement, temporal\ndynamics distortion, and reduced scenario discrimination. These findings\nsuggest that current foundation models may require substantial architectural\nmodifications or targeted fine-tuning before deployment in clinical settings.\n","authors":["Matthias Christenson","Cove Geary","Brian Locke","Pranav Koirala","Warren Woodrich Pettine"],"pdf_url":"https://arxiv.org/pdf/2412.03427v1.pdf","comment":"Presented at the precision medicine workshop at the AI in Medicine\n conference (2024) in Salt Lake City"},{"id":"http://arxiv.org/abs/2406.06671v2","updated":"2024-12-04T16:04:07Z","published":"2024-06-10T18:00:00Z","title":"Controlling Counterfactual Harm in Decision Support Systems Based on\n Prediction Sets","summary":" Decision support systems based on prediction sets help humans solve\nmulticlass classification tasks by narrowing down the set of potential label\nvalues to a subset of them, namely a prediction set, and asking them to always\npredict label values from the prediction sets. While this type of systems have\nbeen proven to be effective at improving the average accuracy of the\npredictions made by humans, by restricting human agency, they may cause\nharm$\\unicode{x2014}$a human who has succeeded at predicting the ground-truth\nlabel of an instance on their own may have failed had they used these systems.\nIn this paper, our goal is to control how frequently a decision support system\nbased on prediction sets may cause harm, by design. 
To this end, we start by\ncharacterizing the above notion of harm using the theoretical framework of\nstructural causal models. Then, we show that, under a natural, albeit\nunverifiable, monotonicity assumption, we can estimate how frequently a system\nmay cause harm using only predictions made by humans on their own. Further, we\nalso show that, under a weaker monotonicity assumption, which can be verified\nexperimentally, we can bound how frequently a system may cause harm again using\nonly predictions made by humans on their own. Building upon these assumptions,\nwe introduce a computational framework to design decision support systems based\non prediction sets that are guaranteed to cause harm less frequently than a\nuser-specified value using conformal risk control. We validate our framework\nusing real human predictions from two different human subject studies and show\nthat, in decision support systems based on prediction sets, there is a\ntrade-off between accuracy and counterfactual harm.\n","authors":["Eleni Straitouri","Suhas Thejaswi","Manuel Gomez Rodriguez"],"pdf_url":"https://arxiv.org/pdf/2406.06671v2.pdf","comment":"Accepted at the ICML 2024 Workshop on Humans, Algorithmic\n Decision-Making and Society and published at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2402.17826v3","updated":"2024-12-04T16:03:04Z","published":"2024-02-27T19:00:01Z","title":"Prediction-Powered Ranking of Large Language Models","summary":" Large language models are often ranked according to their level of alignment\nwith human preferences -- a model is better than other models if its outputs\nare more frequently preferred by humans. One of the popular ways to elicit\nhuman preferences utilizes pairwise comparisons between the outputs provided by\ndifferent models to the same inputs. 
However, since gathering pairwise\ncomparisons by humans is costly and time-consuming, it has become a common\npractice to gather pairwise comparisons by a strong large language model -- a\nmodel strongly aligned with human preferences. Surprisingly, practitioners\ncannot currently measure the uncertainty that any mismatch between human and\nmodel preferences may introduce in the constructed rankings. In this work, we\ndevelop a statistical framework to bridge this gap. Given a (small) set of\npairwise comparisons by humans and a large set of pairwise comparisons by a\nmodel, our framework provides a rank-set -- a set of possible ranking positions\n-- for each of the models under comparison. Moreover, it guarantees that, with\na probability greater than or equal to a user-specified value, the rank-sets\ncover the true ranking consistent with the distribution of human pairwise\npreferences asymptotically. Using pairwise comparisons made by humans in the\nLMSYS Chatbot Arena platform and pairwise comparisons made by three strong\nlarge language models, we empirically demonstrate the effectivity of our\nframework and show that the rank-sets constructed using only pairwise\ncomparisons by the strong large language models are often inconsistent with\n(the distribution of) human pairwise preferences.\n","authors":["Ivi Chatzi","Eleni Straitouri","Suhas Thejaswi","Manuel Gomez Rodriguez"],"pdf_url":"https://arxiv.org/pdf/2402.17826v3.pdf","comment":"Published at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2412.03417v1","updated":"2024-12-04T15:53:45Z","published":"2024-12-04T15:53:45Z","title":"Learning Semantic Association Rules from Internet of Things Data","summary":" Association Rule Mining (ARM) is the task of discovering commonalities in\ndata in the form of logical implications. ARM is used in the Internet of Things\n(IoT) for different tasks including monitoring and decision-making. 
However,\nexisting methods give limited consideration to IoT-specific requirements such\nas heterogeneity and volume. Furthermore, they do not utilize important static\ndomain-specific description data about IoT systems, which is increasingly\nrepresented as knowledge graphs. In this paper, we propose a novel ARM pipeline\nfor IoT data that utilizes both dynamic sensor data and static IoT system\nmetadata. Furthermore, we propose an Autoencoder-based Neurosymbolic ARM method\n(Aerial) as part of the pipeline to address the high volume of IoT data and\nreduce the total number of rules that are resource-intensive to process. Aerial\nlearns a neural representation of a given data and extracts association rules\nfrom this representation by exploiting the reconstruction (decoding) mechanism\nof an autoencoder. Extensive evaluations on 3 IoT datasets from 2 domains show\nthat ARM on both static and dynamic IoT data results in more generically\napplicable rules while Aerial can learn a more concise set of high-quality\nassociation rules than the state-of-the-art with full coverage over the\ndatasets.\n","authors":["Erkan Karabulut","Paul Groth","Victoria Degeler"],"pdf_url":"https://arxiv.org/pdf/2412.03417v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03752v2","updated":"2024-12-04T15:53:19Z","published":"2024-11-06T08:27:49Z","title":"Deferred Poisoning: Making the Model More Vulnerable via Hessian\n Singularization","summary":" Recent studies have shown that deep learning models are very vulnerable to\npoisoning attacks. Many defense methods have been proposed to address this\nissue. However, traditional poisoning attacks are not as threatening as\ncommonly believed. This is because they often cause differences in how the\nmodel performs on the training set compared to the validation set. Such\ninconsistency can alert defenders that their data has been poisoned, allowing\nthem to take the necessary defensive actions. 
In this paper, we introduce a\nmore threatening type of poisoning attack called the Deferred Poisoning Attack.\nThis new attack allows the model to function normally during the training and\nvalidation phases but makes it very sensitive to evasion attacks or even\nnatural noise. We achieve this by ensuring the poisoned model's loss function\nhas a similar value as a normally trained model at each input sample but with a\nlarge local curvature. A similar model loss ensures that there is no obvious\ninconsistency between the training and validation accuracy, demonstrating high\nstealthiness. On the other hand, the large curvature implies that a small\nperturbation may cause a significant increase in model loss, leading to\nsubstantial performance degradation, which reflects a worse robustness. We\nfulfill this purpose by making the model have singular Hessian information at\nthe optimal point via our proposed Singularization Regularization term. We have\nconducted both theoretical and empirical analyses of the proposed method and\nvalidated its effectiveness through experiments on image classification tasks.\nFurthermore, we have confirmed the hazards of this form of poisoning attack\nunder more general scenarios using natural noise, offering a new perspective\nfor research in the field of security.\n","authors":["Yuhao He","Jinyu Tian","Xianwei Zheng","Li Dong","Yuanman Li","Jiantao Zhou"],"pdf_url":"https://arxiv.org/pdf/2411.03752v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03405v1","updated":"2024-12-04T15:36:20Z","published":"2024-12-04T15:36:20Z","title":"Deep Operator BSDE: a Numerical Scheme to Approximate the Solution\n Operators","summary":" Motivated by dynamic risk measures and conditional $g$-expectations, in this\nwork we propose a numerical method to approximate the solution operator given\nby a Backward Stochastic Differential Equation (BSDE). 
The main ingredients for\nthis are the Wiener chaos decomposition and the classical Euler scheme for\nBSDEs. We show convergence of this scheme under very mild assumptions, and\nprovide a rate of convergence in more restrictive cases. We then implement it\nusing neural networks, and we present several numerical examples where we can\ncheck the accuracy of the method.\n","authors":["Giulia Di Nunno","Pere Díaz Lozano"],"pdf_url":"https://arxiv.org/pdf/2412.03405v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.09695v3","updated":"2024-12-04T15:35:48Z","published":"2024-10-13T02:10:26Z","title":"Can In-context Learning Really Generalize to Out-of-distribution Tasks?","summary":" In this work, we explore the mechanism of in-context learning (ICL) on\nout-of-distribution (OOD) tasks that were not encountered during training. To\nachieve this, we conduct synthetic experiments where the objective is to learn\nOOD mathematical functions through ICL using a GPT-2 model. We reveal that\nTransformers may struggle to learn OOD task functions through ICL.\nSpecifically, ICL performance resembles implementing a function within the\npretraining hypothesis space and optimizing it with gradient descent based on\nthe in-context examples. Additionally, we investigate ICL's well-documented\nability to learn unseen abstract labels in context. We demonstrate that such\nability only manifests in the scenarios without distributional shifts and,\ntherefore, may not serve as evidence of new-task-learning ability. Furthermore,\nwe assess ICL's performance on OOD tasks when the model is pretrained on\nmultiple tasks. Both empirical and theoretical analyses demonstrate the\nexistence of the \\textbf{low-test-error preference} of ICL, where it tends to\nimplement the pretraining function that yields low test error in the testing\ncontext. We validate this through numerical experiments. 
This new theoretical\nresult, combined with our empirical findings, elucidates the mechanism of ICL\nin addressing OOD tasks.\n","authors":["Qixun Wang","Yifei Wang","Yisen Wang","Xianghua Ying"],"pdf_url":"https://arxiv.org/pdf/2410.09695v3.pdf","comment":"Preprint, under review"},{"id":"http://arxiv.org/abs/2305.05518v2","updated":"2024-12-04T15:32:32Z","published":"2023-05-09T15:16:50Z","title":"Minimal Learning Machine for Multi-Label Learning","summary":" Distance-based supervised method, the minimal learning machine, constructs a\npredictive model from data by learning a mapping between input and output\ndistance matrices. In this paper, we propose new methods and evaluate how their\ncore component, the distance mapping, can be adapted to multi-label learning.\nThe proposed approach is based on combining the distance mapping with an\ninverse distance weighting. Although the proposal is one of the simplest\nmethods in the multi-label learning literature, it achieves state-of-the-art\nperformance for small to moderate-sized multi-label learning problems. In\naddition to its simplicity, the proposed method is fully deterministic: Its\nhyper-parameter can be selected via ranking loss-based statistic which has a\nclosed form, thus avoiding conventional cross-validation-based hyper-parameter\ntuning. In addition, due to its simple linear distance mapping-based\nconstruction, we demonstrate that the proposed method can assess the\nuncertainty of the predictions for multi-label classification, which is a\nvaluable capability for data-centric machine learning pipelines.\n","authors":["Joonas Hämäläinen","Antoine Hubermont","Amauri Souza","César L. C. Mattos","João P. P. 
Gomes","Tommi Kärkkäinen"],"pdf_url":"https://arxiv.org/pdf/2305.05518v2.pdf","comment":"Submitted, 29 pages"},{"id":"http://arxiv.org/abs/2412.03393v1","updated":"2024-12-04T15:22:54Z","published":"2024-12-04T15:22:54Z","title":"Can neural operators always be continuously discretized?","summary":" We consider the problem of discretization of neural operators between Hilbert\nspaces in a general framework including skip connections. We focus on bijective\nneural operators through the lens of diffeomorphisms in infinite dimensions.\nFramed using category theory, we give a no-go theorem that shows that\ndiffeomorphisms between Hilbert spaces or Hilbert manifolds may not admit any\ncontinuous approximations by diffeomorphisms on finite-dimensional spaces, even\nif the approximations are nonlinear. The natural way out is the introduction of\nstrongly monotone diffeomorphisms and layerwise strongly monotone neural\noperators which have continuous approximations by strongly monotone\ndiffeomorphisms on finite-dimensional spaces. For these, one can guarantee\ndiscretization invariance, while ensuring that finite-dimensional\napproximations converge not only as sequences of functions, but that their\nrepresentations converge in a suitable sense as well. Finally, we show that\nbilipschitz neural operators may always be written in the form of an\nalternating composition of strongly monotone neural operators, plus a simple\nisometry. Thus we realize a rigorous platform for discretization of a\ngeneralization of a neural operator. We also show that neural operators of this\ntype may be approximated through the composition of finite-rank residual neural\noperators, where each block is strongly monotone, and may be inverted locally\nvia iteration. We conclude by providing a quantitative approximation result for\nthe discretization of general bilipschitz neural operators.\n","authors":["Takashi Furuya","Michael Puthawala","Maarten V. 
de Hoop","Matti Lassas"],"pdf_url":"https://arxiv.org/pdf/2412.03393v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19732v4","updated":"2024-12-04T15:20:35Z","published":"2024-05-30T06:24:14Z","title":"LLM as a Complementary Optimizer to Gradient Descent: A Case Study in\n Prompt Tuning","summary":" Mastering a skill generally relies on both hands-on experience from doers and\ninsightful, high-level guidance by mentors. Will this strategy also work well\nfor solving complex non-convex optimization problems? Here, a common\ngradient-based optimizer acts like a disciplined doer, making locally optimal\nupdates at each step. Large Language Models (LLMs) can also search for better\nsolutions by inferring from natural language instructions, akin to a high-level\nmentor. In this paper, we show that these two participators are complementary\nto each other and can effectively collaborate as a combined optimization\nframework. The collaborative optimization is achieved by alternating between\nthe gradient-based and LLM-based optimizers. We instruct LLMs to generate\npossibly improved solutions by taking parameter trajectories recorded during\nthe previous stage of gradient-based optimization into account. Inferred\nresults of LLMs are used as restarting points for the next stage of gradient\noptimization. We verify the effectiveness of this optimization framework on\nprompt tuning. By leveraging both the locally rigorous gradient-based optimizer\nand the high-level deductive LLM-based optimizer, the combined optimization\nmethod consistently yields improvements over competitive baselines on a variety\nof tasks. Our results demonstrate the synergistic effect of conventional\ngradient-based optimization and the inference ability of LLMs. 
The code is\nreleased at https://github.com/guozix/LLM-catalyst.\n","authors":["Zixian Guo","Ming Liu","Zhilong Ji","Jinfeng Bai","Yiwen Guo","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2405.19732v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03391v1","updated":"2024-12-04T15:20:12Z","published":"2024-12-04T15:20:12Z","title":"Risk-aware Classification via Uncertainty Quantification","summary":" Autonomous and semi-autonomous systems are using deep learning models to\nimprove decision-making. However, deep classifiers can be overly confident in\ntheir incorrect predictions, a major issue especially in safety-critical\ndomains. The present study introduces three foundational desiderata for\ndeveloping real-world risk-aware classification systems. Expanding upon the\npreviously proposed Evidential Deep Learning (EDL), we demonstrate the unity\nbetween these principles and EDL's operational attributes. We then augment EDL\nempowering autonomous agents to exercise discretion during structured\ndecision-making when uncertainty and risks are inherent. We rigorously examine\nempirical scenarios to substantiate these theoretical innovations. In contrast\nto existing risk-aware classifiers, our proposed methodologies consistently\nexhibit superior performance, underscoring their transformative potential in\nrisk-conscious classification strategies.\n","authors":["Murat Sensoy","Lance M. 
Kaplan","Simon Julier","Maryam Saleki","Federico Cerutti"],"pdf_url":"https://arxiv.org/pdf/2412.03391v1.pdf","comment":"Accepted for publication in Expert Systems with Applications"},{"id":"http://arxiv.org/abs/2412.03385v1","updated":"2024-12-04T15:12:00Z","published":"2024-12-04T15:12:00Z","title":"Reactive Orchestration for Hierarchical Federated Learning Under a\n Communication Cost Budget","summary":" Deploying a Hierarchical Federated Learning (HFL) pipeline across the\ncomputing continuum (CC) requires careful organization of participants into a\nhierarchical structure with intermediate aggregation nodes between FL clients\nand the global FL server. This is challenging to achieve due to (i) cost\nconstraints, (ii) varying data distributions, and (iii) the volatile operating\nenvironment of the CC. In response to these challenges, we present a framework\nfor the adaptive orchestration of HFL pipelines, designed to be reactive to\nclient churn and infrastructure-level events, while balancing communication\ncost and ML model accuracy. Our mechanisms identify and react to events that\ncause HFL reconfiguration actions at runtime, building on multi-level\nmonitoring information (model accuracy, resource availability, resource cost).\nMoreover, our framework introduces a generic methodology for estimating\nreconfiguration costs to continuously re-evaluate the quality of adaptation\nactions, while being extensible to optimize for various HFL performance\ncriteria. 
By extending the Kubernetes ecosystem, our framework demonstrates the\nability to react promptly and effectively to changes in the operating\nenvironment, making the best of the available communication cost budget and\neffectively balancing costs and ML performance at runtime.\n","authors":["Ivan Čilić","Anna Lackinger","Pantelis Frangoudis","Ivana Podnar Žarko","Alireza Furutanpey","Ilir Murturi","Schahram Dustdar"],"pdf_url":"https://arxiv.org/pdf/2412.03385v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03381v1","updated":"2024-12-04T15:07:58Z","published":"2024-12-04T15:07:58Z","title":"Classical Shadows with Improved Median-of-Means Estimation","summary":" The classical shadows protocol, introduced by Huang et al. [Nat. Phys. 16,\n1050 (2020)], makes use of the median-of-means (MoM) estimator to efficiently\nestimate the expectation values of $M$ observables with failure probability\n$\\delta$ using only $\\mathcal{O}(\\log(M/\\delta))$ measurements. In their\nanalysis, Huang et al. used loose constants in their asymptotic performance\nbounds for simplicity. However, the specific values of these constants can\nsignificantly affect the number of shots used in practical implementations. To\naddress this, we studied a modified MoM estimator proposed by Minsker [PMLR\n195, 5925 (2023)] that uses optimal constants and involves a U-statistic over\nthe data set. For efficient estimation, we implemented two types of incomplete\nU-statistics estimators, the first based on random sampling and the second\nbased on cyclically permuted sampling. We compared the performance of the\noriginal and modified estimators when used with the classical shadows protocol\nwith single-qubit Clifford unitaries (Pauli measurements) for an Ising spin\nchain, and global Clifford unitaries (Clifford measurements) for the\nGreenberger-Horne-Zeilinger (GHZ) state. 
While the original estimator\noutperformed the modified estimators for Pauli measurements, the modified\nestimators showed improved performance over the original estimator for Clifford\nmeasurements. Our findings highlight the importance of tailoring estimators to\nspecific measurement settings to optimize the performance of the classical\nshadows protocol in practical applications.\n","authors":["Winston Fu","Dax Enshan Koh","Siong Thye Goh","Jian Feng Kong"],"pdf_url":"https://arxiv.org/pdf/2412.03381v1.pdf","comment":"15 pages, 13 figures"},{"id":"http://arxiv.org/abs/2412.03375v1","updated":"2024-12-04T15:02:28Z","published":"2024-12-04T15:02:28Z","title":"Granular Ball Twin Support Vector Machine with Universum Data","summary":" Classification with support vector machines (SVM) often suffers from limited\nperformance when relying solely on labeled data from target classes and is\nsensitive to noise and outliers. Incorporating prior knowledge from Universum\ndata and more robust data representations can enhance accuracy and efficiency.\nMotivated by these findings, we propose a novel Granular Ball Twin Support\nVector Machine with Universum Data (GBU-TSVM) that extends the TSVM framework\nto leverage both Universum samples and granular ball computing during model\ntraining. Unlike existing TSVM methods, the proposed GBU-TSVM represents data\ninstances as hyper-balls rather than points in the feature space. This\ninnovative approach improves the model's robustness and efficiency,\nparticularly in handling noisy and large datasets. By grouping data points into\ngranular balls, the model achieves superior computational efficiency, increased\nnoise resistance, and enhanced interpretability. Additionally, the inclusion of\nUniversum data, which consists of samples that are not strictly from the target\nclasses, further refines the classification boundaries. 
This integration\nenriches the model with contextual information, refining classification\nboundaries and boosting overall accuracy. Experimental results on UCI benchmark\ndatasets demonstrate that the GBU-TSVM outperforms existing TSVM models in both\naccuracy and computational efficiency. These findings highlight the potential\nof the GBU-TSVM model in setting a new standard in data representation and\nclassification.\n","authors":["M. A. Ganaie","Vrushank Ahire"],"pdf_url":"https://arxiv.org/pdf/2412.03375v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08093v2","updated":"2024-12-04T14:45:23Z","published":"2024-04-11T19:15:45Z","title":"Towards a Robust Soft Baby Robot With Rich Interaction Ability for\n Advanced Machine Learning Algorithms","summary":" Advanced machine learning algorithms require platforms that are extremely\nrobust and equipped with rich sensory feedback to handle extensive\ntrial-and-error learning without relying on strong inductive biases.\nTraditional robotic designs, while well-suited for their specific use cases,\nare often fragile when used with these algorithms. To address this gap -- and\ninspired by the vision of enabling curiosity-driven baby robots -- we present a\nnovel robotic limb designed from scratch. Our design has a hybrid soft-hard\nstructure, high redundancy with rich non-contact sensors (exclusively cameras),\nand easily replaceable failure points. Proof-of-concept experiments using two\ncontemporary reinforcement learning algorithms on a physical prototype\ndemonstrate that our design is able to succeed in a simple target-finding task\neven under simulated sensor failures, all with minimal human oversight during\nextended learning periods. We believe this design represents a concrete step\ntoward more tailored robotic designs for achieving general-purpose, generally\nintelligent robots.\n","authors":["Mohannad Alhakami","Dylan R. 
Ashley","Joel Dunham","Yanning Dai","Francesco Faccio","Eric Feron","Jürgen Schmidhuber"],"pdf_url":"https://arxiv.org/pdf/2404.08093v2.pdf","comment":"6 pages in main text + 2 pages of references, 8 figures in main text,\n 1 table in main text; source code available at\n https://github.com/dylanashley/robot-limb-testai"},{"id":"http://arxiv.org/abs/2402.01930v4","updated":"2024-12-04T14:40:21Z","published":"2024-02-02T21:58:26Z","title":"Reducing Optimism Bias in Incomplete Cooperative Games","summary":" Cooperative game theory has diverse applications in contemporary artificial\nintelligence, including domains like interpretable machine learning, resource\nallocation, and collaborative decision-making. However, specifying a\ncooperative game entails assigning values to exponentially many coalitions, and\nobtaining even a single value can be resource-intensive in practice. Yet simply\nleaving certain coalition values undisclosed introduces ambiguity regarding\nindividual contributions to the collective grand coalition. This ambiguity\noften leads to players holding overly optimistic expectations, stemming from\neither inherent biases or strategic considerations, frequently resulting in\ncollective claims exceeding the actual grand coalition value. In this paper, we\npresent a framework aimed at optimizing the sequence for revealing coalition\nvalues, with the overarching goal of efficiently closing the gap between\nplayers' expectations and achievable outcomes in cooperative games. 
Our\ncontributions are threefold: (i) we study the individual players' optimistic\ncompletions of games with missing coalition values along with the arising gap,\nand investigate its analytical characteristics that facilitate more efficient\noptimization; (ii) we develop methods to minimize this gap over classes of\ngames with a known prior by disclosing values of additional coalitions in both\noffline and online fashion; and (iii) we empirically demonstrate the\nalgorithms' performance in practical scenarios, together with an investigation\ninto the typical order of revealing coalition values.\n","authors":["Filip Úradník","David Sychrovský","Jakub Černý","Martin Černý"],"pdf_url":"https://arxiv.org/pdf/2402.01930v4.pdf","comment":"Proc. of the 23rd International Conference on Autonomous Agents and\n Multiagent Systems (AAMAS 2024)"},{"id":"http://arxiv.org/abs/2404.13040v2","updated":"2024-12-04T14:38:11Z","published":"2024-04-19T17:53:43Z","title":"Analysis of Classifier-Free Guidance Weight Schedulers","summary":" Classifier-Free Guidance (CFG) enhances the quality and condition adherence\nof text-to-image diffusion models. It operates by combining the conditional and\nunconditional predictions using a fixed weight. However, recent works vary the\nweights throughout the diffusion process, reporting superior results but\nwithout providing any rationale or analysis. By conducting comprehensive\nexperiments, this paper provides insights into CFG weight schedulers. Our\nfindings suggest that simple, monotonically increasing weight schedulers\nconsistently lead to improved performances, requiring merely a single line of\ncode. 
In addition, more complex parametrized schedulers can be optimized for\nfurther improvement, but do not generalize across different models and tasks.\n","authors":["Xi Wang","Nicolas Dufour","Nefeli Andreou","Marie-Paule Cani","Victoria Fernandez Abrevaya","David Picard","Vicky Kalogeiton"],"pdf_url":"https://arxiv.org/pdf/2404.13040v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.01951v2","updated":"2024-12-04T14:20:21Z","published":"2024-12-02T20:24:17Z","title":"Self-Improvement in Language Models: The Sharpening Mechanism","summary":" Recent work in language modeling has raised the possibility of\nself-improvement, where a language models evaluates and refines its own\ngenerations to achieve higher performance without external feedback. It is\nimpossible for this self-improvement to create information that is not already\nin the model, so why should we expect that this will lead to improved\ncapabilities? We offer a new perspective on the capabilities of\nself-improvement through a lens we refer to as sharpening. Motivated by the\nobservation that language models are often better at verifying response quality\nthan they are at generating correct responses, we formalize self-improvement as\nusing the model itself as a verifier during post-training in order to\n``sharpen'' the model to one placing large mass on high-quality sequences,\nthereby amortizing the expensive inference-time computation of generating good\nsequences. We begin by introducing a new statistical framework for sharpening\nin which the learner aims to sharpen a pre-trained base policy via sample\naccess, and establish fundamental limits. Then we analyze two natural families\nof self-improvement algorithms based on SFT and RLHF. We find that (i) the\nSFT-based approach is minimax optimal whenever the initial model has sufficient\ncoverage, but (ii) the RLHF-based approach can improve over SFT-based\nself-improvement by leveraging online exploration, bypassing the need for\ncoverage. 
Finally, we empirically validate the sharpening mechanism via\ninference-time and amortization experiments. We view these findings as a\nstarting point toward a foundational understanding that can guide the design\nand evaluation of self-improvement algorithms.\n","authors":["Audrey Huang","Adam Block","Dylan J. Foster","Dhruv Rohatgi","Cyril Zhang","Max Simchowitz","Jordan T. Ash","Akshay Krishnamurthy"],"pdf_url":"https://arxiv.org/pdf/2412.01951v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03338v1","updated":"2024-12-04T14:13:38Z","published":"2024-12-04T14:13:38Z","title":"AI-Driven Day-to-Day Route Choice","summary":" Understanding travelers' route choices can help policymakers devise optimal\noperational and planning strategies for both normal and abnormal circumstances.\nHowever, existing choice modeling methods often rely on predefined assumptions\nand struggle to capture the dynamic and adaptive nature of travel behavior.\nRecently, Large Language Models (LLMs) have emerged as a promising alternative,\ndemonstrating remarkable ability to replicate human-like behaviors across\nvarious fields. Despite this potential, their capacity to accurately simulate\nhuman route choice behavior in transportation contexts remains doubtful. To\nsatisfy this curiosity, this paper investigates the potential of LLMs for route\nchoice modeling by introducing an LLM-empowered agent, \"LLMTraveler.\" This\nagent integrates an LLM as its core, equipped with a memory system that learns\nfrom past experiences and makes decisions by balancing retrieved data and\npersonality traits. 
The study systematically evaluates the LLMTraveler's\nability to replicate human-like decision-making through two stages: (1)\nanalyzing its route-switching behavior in single origin-destination (OD) pair\ncongestion game scenarios, where it demonstrates patterns align with laboratory\ndata but are not fully explained by traditional models, and (2) testing its\ncapacity to model day-to-day (DTD) adaptive learning behaviors on the Ortuzar\nand Willumsen (OW) network, producing results comparable to Multinomial Logit\n(MNL) and Reinforcement Learning (RL) models. These experiments demonstrate\nthat the framework can partially replicate human-like decision-making in route\nchoice while providing natural language explanations for its decisions. This\ncapability offers valuable insights for transportation policymaking, such as\nsimulating traveler responses to new policies or changes in the network.\n","authors":["Leizhen Wang","Peibo Duan","Zhengbing He","Cheng Lyu","Xin Chen","Nan Zheng","Li Yao","Zhenliang Ma"],"pdf_url":"https://arxiv.org/pdf/2412.03338v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03332v1","updated":"2024-12-04T14:03:27Z","published":"2024-12-04T14:03:27Z","title":"On Approximability of $\\ell_2^2$ Min-Sum Clustering","summary":" The $\\ell_2^2$ min-sum $k$-clustering problem is to partition an input set\ninto clusters $C_1,\\ldots,C_k$ to minimize $\\sum_{i=1}^k\\sum_{p,q\\in\nC_i}\\|p-q\\|_2^2$. Although $\\ell_2^2$ min-sum $k$-clustering is NP-hard, it is\nnot known whether it is NP-hard to approximate $\\ell_2^2$ min-sum\n$k$-clustering beyond a certain factor.\n In this paper, we give the first hardness-of-approximation result for the\n$\\ell_2^2$ min-sum $k$-clustering problem. 
We show that it is NP-hard to\napproximate the objective to a factor better than $1.056$ and moreover,\nassuming a balanced variant of the Johnson Coverage Hypothesis, it is NP-hard\nto approximate the objective to a factor better than 1.327.\n We then complement our hardness result by giving the first\n$(1+\\varepsilon)$-coreset construction for $\\ell_2^2$ min-sum $k$-clustering.\nOur coreset uses $\\mathcal{O}\\left(k^{\\varepsilon^{-4}}\\right)$ space and can\nbe leveraged to achieve a polynomial-time approximation scheme with runtime\n$nd\\cdot f(k,\\varepsilon^{-1})$, where $d$ is the underlying dimension of the\ninput dataset and $f$ is a fixed function.\n Finally, we consider a learning-augmented setting, where the algorithm has\naccess to an oracle that outputs a label $i\\in[k]$ for input point, thereby\nimplicitly partitioning the input dataset into $k$ clusters that induce an\napproximately optimal solution, up to some amount of adversarial error\n$\\alpha\\in\\left[0,\\frac{1}{2}\\right)$. We give a polynomial-time algorithm that\noutputs a $\\frac{1+\\gamma\\alpha}{(1-\\alpha)^2}$-approximation to $\\ell_2^2$\nmin-sum $k$-clustering, for a fixed constant $\\gamma>0$.\n","authors":["Karthik C. S.","Euiwoong Lee","Yuval Rabani","Chris Schwiegelshohn","Samson Zhou"],"pdf_url":"https://arxiv.org/pdf/2412.03332v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03326v1","updated":"2024-12-04T13:57:20Z","published":"2024-12-04T13:57:20Z","title":"Multi-Action Restless Bandits with Weakly Coupled Constraints:\n Simultaneous Learning and Control","summary":" We study a system with finitely many groups of multi-action bandit processes,\neach of which is a Markov decision process (MDP) with finite state and action\nspaces and potentially different transition matrices when taking different\nactions. 
The bandit processes of the same group share the same state and action\nspaces and, given the same action that is taken, the same transition matrix.\nAll the bandit processes across various groups are subject to multiple weakly\ncoupled constraints over their state and action variables. Unlike the past\nstudies that focused on the offline case, we consider the online case without\nassuming full knowledge of transition matrices and reward functions a priori\nand propose an effective scheme that enables simultaneous learning and control.\nWe prove the convergence of the relevant processes in both the timeline and the\nnumber of the bandit processes, referred to as the convergence in the time and\nthe magnitude dimensions. Moreover, we prove that the relevant processes\nconverge exponentially fast in the magnitude dimension, leading to\nexponentially diminishing performance deviation between the proposed online\nalgorithms and offline optimality.\n","authors":["Jing Fu","Bill Moran","José Niño-Mora"],"pdf_url":"https://arxiv.org/pdf/2412.03326v1.pdf","comment":"70 pages,0 figure"},{"id":"http://arxiv.org/abs/2412.03321v1","updated":"2024-12-04T13:55:14Z","published":"2024-12-04T13:55:14Z","title":"Scalable Bayesian Tensor Ring Factorization for Multiway Data Analysis","summary":" Tensor decompositions play a crucial role in numerous applications related to\nmulti-way data analysis. By employing a Bayesian framework with\nsparsity-inducing priors, Bayesian Tensor Ring (BTR) factorization offers\nprobabilistic estimates and an effective approach for automatically adapting\nthe tensor ring rank during the learning process. However, previous BTR method\nemploys an Automatic Relevance Determination (ARD) prior, which can lead to\nsub-optimal solutions. Besides, it solely focuses on continuous data, whereas\nmany applications involve discrete data. 
More importantly, it relies on the\nCoordinate-Ascent Variational Inference (CAVI) algorithm, which is inadequate\nfor handling large tensors with extensive observations. These limitations\ngreatly limit its application scales and scopes, making it suitable only for\nsmall-scale problems, such as image/video completion. To address these issues,\nwe propose a novel BTR model that incorporates a nonparametric Multiplicative\nGamma Process (MGP) prior, known for its superior accuracy in identifying\nlatent structures. To handle discrete data, we introduce the P\\'olya-Gamma\naugmentation for closed-form updates. Furthermore, we develop an efficient\nGibbs sampler for consistent posterior simulation, which reduces the\ncomputational complexity of previous VI algorithm by two orders, and an online\nEM algorithm that is scalable to extremely large tensors. To showcase the\nadvantages of our model, we conduct extensive experiments on both simulation\ndata and real-world applications.\n","authors":["Zerui Tao","Toshihisa Tanaka","Qibin Zhao"],"pdf_url":"https://arxiv.org/pdf/2412.03321v1.pdf","comment":"ICONIP 2023"},{"id":"http://arxiv.org/abs/2412.03317v1","updated":"2024-12-04T13:52:04Z","published":"2024-12-04T13:52:04Z","title":"FlashAttention on a Napkin: A Diagrammatic Approach to Deep Learning\n IO-Awareness","summary":" Optimizing deep learning algorithms currently requires slow, manual\nderivation, potentially leaving much performance untapped. Methods like\nFlashAttention have achieved a x6 performance improvement over native PyTorch\nby avoiding unnecessary data transfers, but required three iterations over\nthree years. Automated compiled methods have consistently lagged behind. GPUs\nare limited by both transfers to processors and available compute, with\ntransfer bandwidth having improved at a far slower pace. Already, transfer\nbandwidth accounts for 46% of GPU energy costs. 
This indicates the future of\nenergy and capital-efficient algorithms relies on improved consideration of\ntransfer costs (IO-awareness) and a systematic method for deriving optimized\nalgorithms. In this paper, we present a diagrammatic approach to deep learning\nmodels which, with simple relabelings, derive optimal implementations and\nperformance models that consider low-level memory. Diagrams generalize down the\nGPU hierarchy, providing a universal performance model for comparing hardware\nand quantization choices. Diagrams generate pseudocode, which reveals the\napplication of hardware-specific features such as coalesced memory access,\ntensor core operations, and overlapped computation. We present attention\nalgorithms for Ampere, which fits 13 warps per SM (FlashAttention fits 8), and\nfor Hopper, which has improved overlapping and may achieve 1.32 PFLOPs.\n","authors":["Vincent Abbott","Gioele Zardini"],"pdf_url":"https://arxiv.org/pdf/2412.03317v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.13609v2","updated":"2024-12-04T13:46:04Z","published":"2024-05-22T13:01:37Z","title":"Tackling Decision Processes with Non-Cumulative Objectives using\n Reinforcement Learning","summary":" Markov decision processes (MDPs) are used to model a wide variety of\napplications ranging from game playing over robotics to finance. Their optimal\npolicy typically maximizes the expected sum of rewards given at each step of\nthe decision process. However, a large class of problems does not fit\nstraightforwardly into this framework: Non-cumulative Markov decision processes\n(NCMDPs), where instead of the expected sum of rewards, the expected value of\nan arbitrary function of the rewards is maximized. Example functions include\nthe maximum of the rewards or their mean divided by their standard deviation.\nIn this work, we introduce a general mapping of NCMDPs to standard MDPs. 
This\nallows all techniques developed to find optimal policies for MDPs, such as\nreinforcement learning or dynamic programming, to be directly applied to the\nlarger class of NCMDPs. Focusing on reinforcement learning, we show\napplications in a diverse set of tasks, including classical control, portfolio\noptimization in finance, and discrete optimization problems. Given our\napproach, we can improve both final performance and training time compared to\nrelying on standard MDPs.\n","authors":["Maximilian Nägele","Jan Olle","Thomas Fösel","Remmy Zen","Florian Marquardt"],"pdf_url":"https://arxiv.org/pdf/2405.13609v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03312v1","updated":"2024-12-04T13:44:56Z","published":"2024-12-04T13:44:56Z","title":"Path-Guided Particle-based Sampling","summary":" Particle-based Bayesian inference methods by sampling from a partition-free\ntarget (posterior) distribution, e.g., Stein variational gradient descent\n(SVGD), have attracted significant attention. We propose a path-guided\nparticle-based sampling~(PGPS) method based on a novel Log-weighted Shrinkage\n(LwS) density path linking an initial distribution to the target distribution.\nWe propose to utilize a Neural network to learn a vector field motivated by the\nFokker-Planck equation of the designed density path. Particles, initiated from\nthe initial distribution, evolve according to the ordinary differential\nequation defined by the vector field. The distribution of these particles is\nguided along a density path from the initial distribution to the target\ndistribution. The proposed LwS density path allows for an efficient search of\nmodes of the target distribution while canonical methods fail. We theoretically\nanalyze the Wasserstein distance of the distribution of the PGPS-generated\nsamples and the target distribution due to approximation and discretization\nerrors. 
Practically, the proposed PGPS-LwS method demonstrates higher Bayesian\ninference accuracy and better calibration ability in experiments conducted on\nboth synthetic and real-world Bayesian learning tasks, compared to baselines,\nsuch as SVGD and Langevin dynamics, etc.\n","authors":["Mingzhou Fan","Ruida Zhou","Chao Tian","Xiaoning Qian"],"pdf_url":"https://arxiv.org/pdf/2412.03312v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.04203v3","updated":"2024-12-04T13:43:10Z","published":"2023-04-09T10:08:38Z","title":"OpenDriver: An Open-Road Driver State Detection Dataset","summary":" Among numerous studies for driver state detection, wearable physiological\nmeasurements offer a practical method for real-time monitoring. However, there\nare few driver physiological datasets in open-road scenarios, and the existing\ndatasets suffer from issues such as poor signal quality, small sample sizes,\nand short data collection periods. Therefore, in this paper, a large-scale\nmultimodal driving dataset, OpenDriver, for driver state detection is\ndeveloped. The OpenDriver encompasses a total of 3,278 driving trips, with a\nsignal collection duration spanning approximately 4,600 hours. Two modalities\nof driving signals are enrolled in OpenDriver: electrocardiogram (ECG) signals\nand six-axis motion data of the steering wheel from a motion measurement unit\n(IMU), which were recorded from 81 drivers and their vehicles. Furthermore,\nthree challenging tasks are involved in our work, namely ECG signal quality\nassessment, individual biometric identification based on ECG signals, and\nphysiological signal analysis in complex driving environments. To facilitate\nresearch in these tasks, corresponding benchmarks have also been introduced.\nFirst, a noisy augmentation strategy is applied to generate a larger-scale ECG\nsignal dataset with realistic noise simulation for quality assessment. 
Second,\nan end-to-end contrastive learning framework is employed for individual\nbiometric identification. Finally, a comprehensive analysis of drivers' HRV\nfeatures under different driving conditions is conducted. Each benchmark\nprovides evaluation metrics and reference results. The OpenDriver dataset will\nbe publicly available at https://github.com/bdne/OpenDriver.\n","authors":["Delong Liu","Shichao Li","Tianyi Shi","Zhu Meng","Guanyu Chen","Yadong Huang","Jin Dong","Zhicheng Zhao"],"pdf_url":"https://arxiv.org/pdf/2304.04203v3.pdf","comment":"Considering that there are flaws in the statistical data of the\n dataset, all the authors agreed to withdraw the manuscript"},{"id":"http://arxiv.org/abs/2412.03300v1","updated":"2024-12-04T13:17:42Z","published":"2024-12-04T13:17:42Z","title":"Conveying Emotions to Robots through Touch and Sound","summary":" Human emotions can be conveyed through nuanced touch gestures. However, there\nis a lack of understanding of how consistently emotions can be conveyed to\nrobots through touch. This study explores the consistency of touch-based\nemotional expression toward a robot by integrating tactile and auditory sensory\nreading of affective haptic expressions. We developed a piezoresistive pressure\nsensor and used a microphone to mimic touch and sound channels, respectively.\nIn a study with 28 participants, each conveyed 10 emotions to a robot using\nspontaneous touch gestures. Our findings reveal a statistically significant\nconsistency in emotion expression among participants. However, some emotions\nobtained low intraclass correlation values. Additionally, certain emotions with\nsimilar levels of arousal or valence did not exhibit significant differences in\nthe way they were conveyed. We subsequently constructed a multi-modal\nintegrating touch and audio features to decode the 10 emotions. 
A support\nvector machine (SVM) model demonstrated the highest accuracy, achieving 40% for\n10 classes, with \"Attention\" being the most accurately conveyed emotion at a\nbalanced accuracy of 87.65%.\n","authors":["Qiaoqiao Ren","Remko Proesmans","Frederick Bossuyt","Jan Vanfleteren","Francis Wyffels","Tony Belpaeme"],"pdf_url":"https://arxiv.org/pdf/2412.03300v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03299v1","updated":"2024-12-04T13:16:57Z","published":"2024-12-04T13:16:57Z","title":"Gaussian Processes for Probabilistic Estimates of Earthquake Ground\n Shaking: A 1-D Proof-of-Concept","summary":" Estimates of seismic wave speeds in the Earth (seismic velocity models) are\nkey input parameters to earthquake simulations for ground motion prediction.\nOwing to the non-uniqueness of the seismic inverse problem, typically many\nvelocity models exist for any given region. The arbitrary choice of which\nvelocity model to use in earthquake simulations impacts ground motion\npredictions. However, current hazard analysis methods do not account for this\nsource of uncertainty. We present a proof-of-concept ground motion prediction\nworkflow for incorporating uncertainties arising from inconsistencies between\nexisting seismic velocity models. Our analysis is based on the probabilistic\nfusion of overlapping seismic velocity models using scalable Gaussian process\n(GP) regression. Specifically, we fit a GP to two synthetic 1-D velocity\nprofiles simultaneously, and show that the predictive uncertainty accounts for\nthe differences between the models. We subsequently draw velocity model samples\nfrom the predictive distribution and estimate peak ground displacement using\nacoustic wave propagation through the velocity models. The resulting\ndistribution of possible ground motion amplitudes is much wider than would be\npredicted by simulating shaking using only the two input velocity models. 
This\nproof-of-concept illustrates the importance of probabilistic methods for\nphysics-based seismic hazard analysis.\n","authors":["Sam A. Scivier","Tarje Nissen-Meyer","Paula Koelemeijer","Atılım Güneş Baydin"],"pdf_url":"https://arxiv.org/pdf/2412.03299v1.pdf","comment":"8 pages, 2 figures, accepted in the Machine Learning and the Physical\n Sciences Workshop at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.17882v2","updated":"2024-12-04T13:14:15Z","published":"2024-10-23T13:55:42Z","title":"Identifiable Representation and Model Learning for Latent Dynamic\n Systems","summary":" Learning identifiable representations and models from low-level observations\nis helpful for an intelligent spacecraft to complete downstream tasks reliably.\nFor temporal observations, to ensure that the data generating process is\nprovably inverted, most existing works either assume the noise variables in the\ndynamic mechanisms are (conditionally) independent or require that the\ninterventions can directly affect each latent variable. However, in practice,\nthe relationship between the exogenous inputs/interventions and the latent\nvariables may follow some complex deterministic mechanisms. In this work, we\nstudy the problem of identifiable representation and model learning for latent\ndynamic systems. The key idea is to use an inductive bias inspired by\ncontrollable canonical forms, which are sparse and input-dependent by\ndefinition. We prove that, for linear and affine nonlinear latent dynamic\nsystems with sparse input matrices, it is possible to identify the latent\nvariables up to scaling and determine the dynamic models up to some simple\ntransformations. 
The results have the potential to provide some theoretical\nguarantees for developing more trustworthy decision-making and control methods\nfor intelligent spacecrafts.\n","authors":["Congxi Zhang","Yongchun Xie"],"pdf_url":"https://arxiv.org/pdf/2410.17882v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.20351v2","updated":"2024-12-04T12:42:20Z","published":"2024-10-27T06:32:41Z","title":"Leveraging Auxiliary Task Relevance for Enhanced Bearing Fault Diagnosis\n through Curriculum Meta-learning","summary":" The accurate diagnosis of machine breakdowns is crucial for maintaining\noperational safety in smart manufacturing. Despite the promise shown by deep\nlearning in automating fault identification, the scarcity of labeled training\ndata, particularly for equipment failure instances, poses a significant\nchallenge. This limitation hampers the development of robust classification\nmodels. Existing methods like model-agnostic meta-learning (MAML) do not\nadequately address variable working conditions, affecting knowledge transfer.\nTo address these challenges, a Related Task Aware Curriculum Meta-learning\n(RT-ACM) enhanced fault diagnosis framework is proposed in this paper, inspired\nby human cognitive learning processes. RT-ACM improves training by considering\nthe relevance of auxiliary sensor working conditions, adhering to the principle\nof ``paying more attention to more relevant knowledge\", and focusing on\n``easier first, harder later\" curriculum sampling. This approach aids the\nmeta-learner in achieving a superior convergence state. 
Extensive experiments\non two real-world datasets demonstrate the superiority of RT-ACM framework.\n","authors":["Jinze Wang","Jiong Jin","Tiehua Zhang","Boon Xian Chai","Adriano Di Pietro","Dimitrios Georgakopoulos"],"pdf_url":"https://arxiv.org/pdf/2410.20351v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.00153v2","updated":"2024-12-04T12:40:30Z","published":"2024-11-29T07:00:18Z","title":"ROSE: Revolutionizing Open-Set Dense Segmentation with Patch-Wise\n Perceptual Large Multimodal Model","summary":" Advances in CLIP and large multimodal models (LMMs) have enabled\nopen-vocabulary and free-text segmentation, yet existing models still require\npredefined category prompts, limiting free-form category self-generation. Most\nsegmentation LMMs also remain confined to sparse predictions, restricting their\napplicability in open-set environments. In contrast, we propose ROSE, a\nRevolutionary Open-set dense SEgmentation LMM, which enables dense mask\nprediction and open-category generation through patch-wise perception. Our\nmethod treats each image patch as an independent region of interest candidate,\nenabling the model to predict both dense and sparse masks simultaneously.\nAdditionally, a newly designed instruction-response paradigm takes full\nadvantage of the generation and generalization capabilities of LMMs, achieving\ncategory prediction independent of closed-set constraints or predefined\ncategories. To further enhance mask detail and category precision, we introduce\na conversation-based refinement paradigm, integrating the prediction result\nfrom previous step with textual prompt for revision. Extensive experiments\ndemonstrate that ROSE achieves competitive performance across various\nsegmentation tasks in a unified framework. 
Code will be released.\n","authors":["Kunyang Han","Yibo Hu","Mengxue Qu","Hailin Shi","Yao Zhao","Yunchao Wei"],"pdf_url":"https://arxiv.org/pdf/2412.00153v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03271v1","updated":"2024-12-04T12:31:15Z","published":"2024-12-04T12:31:15Z","title":"Nonparametric Filtering, Estimation and Classification using Neural Jump\n ODEs","summary":" Neural Jump ODEs model the conditional expectation between observations by\nneural ODEs and jump at arrival of new observations. They have demonstrated\neffectiveness for fully data-driven online forecasting in settings with\nirregular and partial observations, operating under weak regularity\nassumptions. This work extends the framework to input-output systems, enabling\ndirect applications in online filtering and classification. We establish\ntheoretical convergence guarantees for this approach, providing a robust\nsolution to $L^2$-optimal filtering. Empirical experiments highlight the\nmodel's superior performance over classical parametric methods, particularly in\nscenarios with complex underlying distributions. These results emphasise the\napproach's potential in time-sensitive domains such as finance and health\nmonitoring, where real-time accuracy is crucial.\n","authors":["Jakob Heiss","Florian Krach","Thorsten Schmidt","Félix B. Tambe-Ndonfack"],"pdf_url":"https://arxiv.org/pdf/2412.03271v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18152v2","updated":"2024-12-04T12:18:17Z","published":"2024-09-25T17:15:26Z","title":"Reinforcement Learning for Finite Space Mean-Field Type Games","summary":" Mean field type games (MFTGs) describe Nash equilibria between large\ncoalitions: each coalition consists of a continuum of cooperative agents who\nmaximize the average reward of their coalition while interacting\nnon-cooperatively with a finite number of other coalitions. 
Although the theory\nhas been extensively developed, we are still lacking efficient and scalable\ncomputational methods. Here, we develop reinforcement learning methods for such\ngames in a finite space setting with general dynamics and reward functions. We\nstart by proving that MFTG solution yields approximate Nash equilibria in\nfinite-size coalition games. We then propose two algorithms. The first is based\non quantization of mean-field spaces and Nash Q-learning. We provide\nconvergence and stability analysis. We then propose a deep reinforcement\nlearning algorithm, which can scale to larger spaces. Numerical experiments in\n5 environments with mean-field distributions of dimension up to $200$ show the\nscalability and efficiency of the proposed method.\n","authors":["Kai Shao","Jiacheng Shen","Chijie An","Mathieu Laurière"],"pdf_url":"https://arxiv.org/pdf/2409.18152v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03263v1","updated":"2024-12-04T12:11:19Z","published":"2024-12-04T12:11:19Z","title":"NeRF and Gaussian Splatting SLAM in the Wild","summary":" Navigating outdoor environments with visual Simultaneous Localization and\nMapping (SLAM) systems poses significant challenges due to dynamic scenes,\nlighting variations, and seasonal changes, requiring robust solutions. While\ntraditional SLAM methods struggle with adaptability, deep learning-based\napproaches and emerging neural radiance fields as well as Gaussian\nSplatting-based SLAM methods, offer promising alternatives. However, these\nmethods have primarily been evaluated in controlled indoor environments with\nstable conditions, leaving a gap in understanding their performance in\nunstructured and variable outdoor settings. This study addresses this gap by\nevaluating these methods in natural outdoor environments, focusing on camera\ntracking accuracy, robustness to environmental factors, and computational\nefficiency, highlighting distinct trade-offs. 
Extensive evaluations demonstrate\nthat neural SLAM methods achieve superior robustness, particularly under\nchallenging conditions such as low light, but at a high computational cost. At\nthe same time, traditional methods perform the best across seasons but are\nhighly sensitive to variations in lighting conditions. The code of the\nbenchmark is publicly available at\nhttps://github.com/iis-esslingen/nerf-3dgs-benchmark.\n","authors":["Fabian Schmidt","Markus Enzweiler","Abhinav Valada"],"pdf_url":"https://arxiv.org/pdf/2412.03263v1.pdf","comment":"5 pages, 2 figures, 4 tables"},{"id":"http://arxiv.org/abs/2403.14695v2","updated":"2024-12-04T11:58:41Z","published":"2024-03-15T15:05:59Z","title":"Chain-structured neural architecture search for financial time series\n forecasting","summary":" Neural architecture search (NAS) emerged as a way to automatically optimize\nneural networks for a specific task and dataset. Despite an abundance of\nresearch on NAS for images and natural language applications, similar studies\nfor time series data are lacking. Among NAS search spaces, chain-structured are\nthe simplest and most applicable to small datasets like time series. We compare\nthree popular NAS strategies on chain-structured search spaces: Bayesian\noptimization (specifically Tree-structured Parzen Estimator), the hyperband\nmethod, and reinforcement learning in the context of financial time series\nforecasting. These strategies were employed to optimize simple well-understood\nneural architectures like the MLP, 1D CNN, and RNN, with more complex temporal\nfusion transformers (TFT) and their own optimizers included for comparison. We\nfind Bayesian optimization and the hyperband method performing best among the\nstrategies, and RNN and 1D CNN best among the architectures, but all methods\nwere very close to each other with a high variance due to the difficulty of\nworking with financial datasets. 
We discuss our approach to overcome the\nvariance and provide implementation recommendations for future users and\nresearchers.\n","authors":["Denis Levchenko","Efstratios Rappos","Shabnam Ataee","Biagio Nigro","Stephan Robert-Nicoud"],"pdf_url":"https://arxiv.org/pdf/2403.14695v2.pdf","comment":"This is the accepted version of the paper published in International\n Journal of Data Science and Analytics"},{"id":"http://arxiv.org/abs/2412.03258v1","updated":"2024-12-04T11:57:36Z","published":"2024-12-04T11:57:36Z","title":"Learning on One Mode: Addressing Multi-Modality in Offline Reinforcement\n Learning","summary":" Offline reinforcement learning (RL) seeks to learn optimal policies from\nstatic datasets without interacting with the environment. A common challenge is\nhandling multi-modal action distributions, where multiple behaviours are\nrepresented in the data. Existing methods often assume unimodal behaviour\npolicies, leading to suboptimal performance when this assumption is violated.\nWe propose Weighted Imitation Learning on One Mode (LOM), a novel approach that\nfocuses on learning from a single, promising mode of the behaviour policy. By\nusing a Gaussian mixture model to identify modes and selecting the best mode\nbased on expected returns, LOM avoids the pitfalls of averaging over\nconflicting actions. Theoretically, we show that LOM improves performance while\nmaintaining simplicity in policy learning. 
Empirically, LOM outperforms\nexisting methods on standard D4RL benchmarks and demonstrates its effectiveness\nin complex, multi-modal scenarios.\n","authors":["Mianchu Wang","Yue Jin","Giovanni Montana"],"pdf_url":"https://arxiv.org/pdf/2412.03258v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.01322v2","updated":"2024-12-04T11:53:32Z","published":"2024-12-02T09:40:03Z","title":"Explainable fault and severity classification for rolling element\n bearings using Kolmogorov-Arnold networks","summary":" Rolling element bearings are critical components of rotating machinery, with\ntheir performance directly influencing the efficiency and reliability of\nindustrial systems. At the same time, bearing faults are a leading cause of\nmachinery failures, often resulting in costly downtime, reduced productivity,\nand, in extreme cases, catastrophic damage. This study presents a methodology\nthat utilizes Kolmogorov-Arnold Networks to address these challenges through\nautomatic feature selection, hyperparameter tuning and interpretable fault\nanalysis within a unified framework. By training shallow network architectures\nand minimizing the number of selected features, the framework produces\nlightweight models that deliver explainable results through feature attribution\nand symbolic representations of their activation functions. Validated on two\nwidely recognized datasets for bearing fault diagnosis, the framework achieved\nperfect F1-Scores for fault detection and high performance in fault and\nseverity classification tasks, including 100% F1-Scores in most cases. Notably,\nit demonstrated adaptability by handling diverse fault types, such as imbalance\nand misalignment, within the same dataset. The symbolic representations\nenhanced model interpretability, while feature attribution offered insights\ninto the optimal feature types or signals for each studied task. 
These results\nhighlight the framework's potential for practical applications, such as\nreal-time machinery monitoring, and for scientific research requiring efficient\nand explainable models.\n","authors":["Spyros Rigas","Michalis Papachristou","Ioannis Sotiropoulos","Georgios Alexandridis"],"pdf_url":"https://arxiv.org/pdf/2412.01322v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03252v1","updated":"2024-12-04T11:51:50Z","published":"2024-12-04T11:51:50Z","title":"Variable-Speed Teaching-Playback as Real-World Data Augmentation for\n Imitation Learning","summary":" Because imitation learning relies on human demonstrations in hard-to-simulate\nsettings, the inclusion of force control in this method has resulted in a\nshortage of training data, even with a simple change in speed. Although the\nfield of data augmentation has addressed the lack of data, conventional methods\nof data augmentation for robot manipulation are limited to simulation-based\nmethods or downsampling for position control. This paper proposes a novel\nmethod of data augmentation that is applicable to force control and preserves\nthe advantages of real-world datasets. We applied teaching-playback at variable\nspeeds as real-world data augmentation to increase both the quantity and\nquality of environmental reactions at variable speeds. An experiment was\nconducted on bilateral control-based imitation learning using a method of\nimitation learning equipped with position-force control. We evaluated the\neffect of real-world data augmentation on two tasks, pick-and-place and wiping,\nat variable speeds, each from two human demonstrations at fixed speed. 
The\nresults showed a maximum 55% increase in success rate from a simple change in\nspeed of real-world reactions and improved accuracy along the\nduration/frequency command by gathering environmental reactions at variable\nspeeds.\n","authors":["Nozomu Masuya","Hiroshi Sato","Koki Yamane","Takuya Kusume","Sho Sakaino","Toshiaki Tsuji"],"pdf_url":"https://arxiv.org/pdf/2412.03252v1.pdf","comment":"16 pages, 12 figures, 4 tables. This is a preprint of an article\n submitted for consideration in ADVANCED ROBOTICS, copyright Taylor & Francis\n and Robotics Society of Japan; ADVANCED ROBOTICS is available online at\n http://www.tandfonline.com/"},{"id":"http://arxiv.org/abs/2412.03238v1","updated":"2024-12-04T11:39:03Z","published":"2024-12-04T11:39:03Z","title":"Dynamic Consistent $k$-Center Clustering with Optimal Recourse","summary":" Given points from an arbitrary metric space and a sequence of point updates\nsent by an adversary, what is the minimum recourse per update (i.e., the\nminimum number of changes needed to the set of centers after an update), in\norder to maintain a constant-factor approximation to a $k$-clustering problem?\nThis question has received attention in recent years under the name consistent\nclustering.\n Previous works by Lattanzi and Vassilvitskii [ICLM '17] and Fichtenberger,\nLattanzi, Norouzi-Fard, and Svensson [SODA '21] studied $k$-clustering\nobjectives, including the $k$-center and the $k$-median objectives, under only\npoint insertions. In this paper we study the $k$-center objective in the fully\ndynamic setting, where the update is either a point insertion or a point\ndeletion. 
Before our work, {\\L}\\k{a}cki, Haeupler, Grunau, Rozho\\v{n}, and\nJayaram [SODA '24] gave a deterministic fully dynamic constant-factor\napproximation algorithm for the $k$-center objective with worst-case recourse\nof $2$ per update.\n In this work, we prove that the $k$-center clustering problem admits optimal\nrecourse bounds by developing a deterministic fully dynamic constant-factor\napproximation algorithm with worst-case recourse of $1$ per update. Moreover\nour algorithm performs simple choices based on light data structures, and thus\nis arguably more direct and faster than the previous one which uses a\nsophisticated combinatorial structure. Additionally, we develop a new\ndeterministic decremental algorithm and a new deterministic incremental\nalgorithm, both of which maintain a $6$-approximate $k$-center solution with\nworst-case recourse of $1$ per update. Our incremental algorithm improves over\nthe $8$-approximation algorithm by Charikar, Chekuri, Feder, and Motwani [STOC\n'97]. Finally, we remark that since all three of our algorithms are\ndeterministic, they work against an adaptive adversary.\n","authors":["Sebastian Forster","Antonis Skarlatos"],"pdf_url":"https://arxiv.org/pdf/2412.03238v1.pdf","comment":"In Proceedings SODA 2025"},{"id":"http://arxiv.org/abs/2412.03224v1","updated":"2024-12-04T11:21:30Z","published":"2024-12-04T11:21:30Z","title":"Channel Reflection: Knowledge-Driven Data Augmentation for EEG-Based\n Brain-Computer Interfaces","summary":" A brain-computer interface (BCI) enables direct communication between the\nhuman brain and external devices. Electroencephalography (EEG) based BCIs are\ncurrently the most popular for able-bodied users. To increase\nuser-friendliness, usually a small amount of user-specific EEG data are used\nfor calibration, which may not be enough to develop a pure data-driven decoding\nmodel. 
To cope with this typical calibration data shortage challenge in\nEEG-based BCIs, this paper proposes a parameter-free channel reflection (CR)\ndata augmentation approach that incorporates prior knowledge on the channel\ndistributions of different BCI paradigms in data augmentation. Experiments on\neight public EEG datasets across four different BCI paradigms (motor imagery,\nsteady-state visual evoked potential, P300, and seizure classifications) using\ndifferent decoding algorithms demonstrated that: 1) CR is effective, i.e., it\ncan noticeably improve the classification accuracy; 2) CR is robust, i.e., it\nconsistently outperforms existing data augmentation approaches in the\nliterature; and, 3) CR is flexible, i.e., it can be combined with other data\naugmentation approaches to further increase the performance. We suggest that\ndata augmentation approaches like CR should be an essential step in EEG-based\nBCIs. Our code is available online.\n","authors":["Ziwei Wang","Siyang Li","Jingwei Luo","Jiajing Liu","Dongrui Wu"],"pdf_url":"https://arxiv.org/pdf/2412.03224v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03220v1","updated":"2024-12-04T11:14:06Z","published":"2024-12-04T11:14:06Z","title":"Survey of different Large Language Model Architectures: Trends,\n Benchmarks, and Challenges","summary":" Large Language Models (LLMs) represent a class of deep learning models adept\nat understanding natural language and generating coherent responses to various\nprompts or queries. These models far exceed the complexity of conventional\nneural networks, often encompassing dozens of neural network layers and\ncontaining billions to trillions of parameters. They are typically trained on\nvast datasets, utilizing architectures based on transformer blocks. Present-day\nLLMs are multi-functional, capable of performing a range of tasks from text\ngeneration and language translation to question answering, as well as code\ngeneration and analysis. 
An advanced subset of these models, known as\nMultimodal Large Language Models (MLLMs), extends LLM capabilities to process\nand interpret multiple data modalities, including images, audio, and video.\nThis enhancement empowers MLLMs with capabilities like video editing, image\ncomprehension, and captioning for visual content. This survey provides a\ncomprehensive overview of the recent advancements in LLMs. We begin by tracing\nthe evolution of LLMs and subsequently delve into the advent and nuances of\nMLLMs. We analyze emerging state-of-the-art MLLMs, exploring their technical\nfeatures, strengths, and limitations. Additionally, we present a comparative\nanalysis of these models and discuss their challenges, potential limitations,\nand prospects for future development.\n","authors":["Minghao Shao","Abdul Basit","Ramesh Karri","Muhammad Shafique"],"pdf_url":"https://arxiv.org/pdf/2412.03220v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.05650v2","updated":"2024-12-04T11:12:42Z","published":"2024-07-08T06:22:10Z","title":"The Cooperative Network Architecture: Learning Structured Networks as\n Representation of Sensory Patterns","summary":" Nets, cooperative networks of neurons, have been proposed as format for the\nrepresentation of sensory signals, as physical implementation of the Gestalt\nphenomenon and as solution to the neural binding problem, while the direct\ninteraction between nets by structure-sensitive matching has been proposed as\nbasis for object-global operations such as object detection. The nets are\nflexibly composed of overlapping net fragments, which are learned from\nstatistical regularities of sensory input. We here present the cooperative\nnetwork architecture (CNA), a concrete model that learns such net structure to\nrepresent input patterns and deals robustly with noise, deformation, and\nout-of-distribution data, thus laying the groundwork for a novel neural\narchitecture.\n","authors":["Pascal J. Sager","Jan M. 
Deriu","Benjamin F. Grewe","Thilo Stadelmann","Christoph von der Malsburg"],"pdf_url":"https://arxiv.org/pdf/2407.05650v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03215v1","updated":"2024-12-04T11:08:32Z","published":"2024-12-04T11:08:32Z","title":"Beyond [cls]: Exploring the true potential of Masked Image Modeling\n representations","summary":" Masked Image Modeling (MIM) has emerged as a popular method for\nSelf-Supervised Learning (SSL) of visual representations. However, for\nhigh-level perception tasks, MIM-pretrained models offer lower out-of-the-box\nrepresentation quality than the Joint-Embedding Architectures (JEA) - another\nprominent SSL paradigm. To understand this performance gap, we analyze the\ninformation flow in Vision Transformers (ViT) learned by both approaches. We\nreveal that whereas JEAs construct their representation on a selected set of\nrelevant image fragments, MIM models aggregate nearly whole image content.\nMoreover, we demonstrate that MIM-trained ViTs retain valuable information\nwithin their patch tokens, which is not effectively captured by the global\n[cls] token representations. Therefore, selective aggregation of relevant patch\ntokens, without any fine-tuning, results in consistently higher-quality of MIM\nrepresentations. 
To our knowledge, we are the first to highlight the lack of\neffective representation aggregation as an emergent issue of MIM and propose\ndirections to address it, contributing to future advances in Self-Supervised\nLearning.\n","authors":["Marcin Przewięźlikowski","Randall Balestriero","Wojciech Jasiński","Marek Śmieja","Bartosz Zieliński"],"pdf_url":"https://arxiv.org/pdf/2412.03215v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03214v1","updated":"2024-12-04T11:05:01Z","published":"2024-12-04T11:05:01Z","title":"Continual Low-Rank Scaled Dot-product Attention","summary":" Transformers are widely used for their ability to capture data relations in\nsequence processing, with great success for a wide range of static tasks.\nHowever, the computational and memory footprint of their main component, i.e.,\nthe Scaled Dot-product Attention, is commonly overlooked. This makes their\nadoption in applications involving stream data processing with constraints in\nresponse latency, computational and memory resources infeasible. Some works\nhave proposed methods to lower the computational cost of transformers, i.e.\nlow-rank approximations, sparsity in attention, and efficient formulations for\nContinual Inference. In this paper, we introduce a new formulation of the\nScaled Dot-product Attention based on the Nystr\\\"om approximation that is\nsuitable for Continual Inference. 
In experiments on Online Audio Classification\nand Online Action Detection tasks, the proposed Continual Scaled Dot-product\nAttention can lower the number of operations by up to three orders of magnitude\ncompared to the original Transformers while retaining the predictive\nperformance of competing models.\n","authors":["Ginés Carreto Picón","Illia Oleksiienko","Lukas Hedegaard","Arian Bakhtiarnia","Alexandros Iosifidis"],"pdf_url":"https://arxiv.org/pdf/2412.03214v1.pdf","comment":"11 pages, 7 figures"},{"id":"http://arxiv.org/abs/2412.03213v1","updated":"2024-12-04T10:58:27Z","published":"2024-12-04T10:58:27Z","title":"ClusterKV: Manipulating LLM KV Cache in Semantic Space for Recallable\n Compression","summary":" Large Language Models (LLMs) have been widely deployed in a variety of\napplications, and the context length is rapidly increasing to handle tasks such\nas long-document QA and complex logical reasoning. However, long context poses\nsignificant challenges for inference efficiency, including high memory costs of\nkey-value (KV) cache and increased latency due to extensive memory accesses.\nRecent works have proposed compressing KV cache to approximate computation, but\nthese methods either evict tokens permanently, never recalling them for later\ninference, or recall previous tokens at the granularity of pages divided by\ntextual positions. Both approaches degrade the model accuracy and output\nquality. To achieve efficient and accurate recallable KV cache compression, we\nintroduce ClusterKV, which recalls tokens at the granularity of semantic\nclusters. We design and implement efficient algorithms and systems for\nclustering, selection, indexing and caching. Experiment results show that\nClusterKV attains negligible accuracy loss across various tasks with 32k\ncontext lengths, using only a 1k to 2k KV cache budget, and achieves up to a\n2$\\times$ speedup in latency and a 2.5$\\times$ improvement in decoding\nthroughput. 
Compared to SoTA recallable KV compression methods, ClusterKV\ndemonstrates higher model accuracy and output quality, while maintaining or\nexceeding inference efficiency.\n","authors":["Guangda Liu","Chengwei Li","Jieru Zhao","Chenqi Zhang","Minyi Guo"],"pdf_url":"https://arxiv.org/pdf/2412.03213v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03212v1","updated":"2024-12-04T10:57:55Z","published":"2024-12-04T10:57:55Z","title":"Semi-Supervised Transfer Boosting (SS-TrBoosting)","summary":" Semi-supervised domain adaptation (SSDA) aims at training a high-performance\nmodel for a target domain using few labeled target data, many unlabeled target\ndata, and plenty of auxiliary data from a source domain. Previous works in SSDA\nmainly focused on learning transferable representations across domains.\nHowever, it is difficult to find a feature space where the source and target\ndomains share the same conditional probability distribution. Additionally,\nthere is no flexible and effective strategy extending existing unsupervised\ndomain adaptation (UDA) approaches to SSDA settings. In order to solve the\nabove two challenges, we propose a novel fine-tuning framework, semi-supervised\ntransfer boosting (SS-TrBoosting). Given a well-trained deep learning-based UDA\nor SSDA model, we use it as the initial model, generate additional base\nlearners by boosting, and then use all of them as an ensemble. More\nspecifically, half of the base learners are generated by supervised domain\nadaptation, and half by semi-supervised learning. Furthermore, for more\nefficient data transmission and better data privacy protection, we propose a\nsource data generation approach to extend SS-TrBoosting to semi-supervised\nsource-free domain adaptation (SS-SFDA). 
Extensive experiments showed that\nSS-TrBoosting can be applied to a variety of existing UDA, SSDA and SFDA\napproaches to further improve their performance.\n","authors":["Lingfei Deng","Changming Zhao","Zhenbang Du","Kun Xia","Dongrui Wu"],"pdf_url":"https://arxiv.org/pdf/2412.03212v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03098v2","updated":"2024-12-04T10:52:25Z","published":"2024-11-05T13:44:25Z","title":"Local Lesion Generation is Effective for Capsule Endoscopy Image Data\n Augmentation in a Limited Data Setting","summary":" Limited medical imaging datasets challenge deep learning models by increasing\nrisks of overfitting and reduced generalization, particularly in Generative\nAdversarial Networks (GANs), where discriminators may overfit, leading to\ntraining divergence. This constraint also impairs classification models trained\non small datasets. Generative Data Augmentation (GDA) addresses this by\nexpanding training datasets with synthetic data, although it requires training\na generative model. We propose and evaluate two local lesion generation\napproaches to address the challenge of augmenting small medical image datasets.\nThe first approach employs the Poisson Image Editing algorithm, a classical\nimage processing technique, to create realistic image composites that\noutperform current state-of-the-art methods. The second approach introduces a\nnovel generative method, leveraging a fine-tuned Image Inpainting GAN to\nsynthesize realistic lesions within specified regions of real training images.\nA comprehensive comparison of the two proposed methods demonstrates that\neffective local lesion generation in a data-constrained setting allows for\nreaching new state-of-the-art results in capsule endoscopy lesion\nclassification. Combination of our techniques achieves a macro F1-score of\n33.07%, surpassing the previous best result by 7.84 percentage points (p.p.) 
on\nthe highly imbalanced Kvasir Capsule Dataset, a benchmark for capsule\nendoscopy. To the best of our knowledge, this work is the first to apply a\nfine-tuned Image Inpainting GAN for GDA in medical imaging, demonstrating that\nan image-conditional GAN can be adapted effectively to limited datasets to\ngenerate high-quality examples, facilitating effective data augmentation.\nAdditionally, we show that combining this GAN-based approach with classical\nimage processing techniques further improves the results.\n","authors":["Adrian B. Chłopowiec","Adam R. Chłopowiec","Krzysztof Galus","Wojciech Cebula","Martin Tabakov"],"pdf_url":"https://arxiv.org/pdf/2411.03098v2.pdf","comment":"54 pages, 35 figures"},{"id":"http://arxiv.org/abs/2410.16926v2","updated":"2024-12-04T10:52:04Z","published":"2024-10-22T11:57:32Z","title":"Pyramid Vector Quantization for LLMs","summary":" Recent works on compression of large language models (LLM) using quantization\nconsidered reparameterizing the architecture such that weights are distributed\non the sphere. This demonstratively improves the ability to quantize by\nincreasing the mathematical notion of coherence, resulting in fewer weight\noutliers without affecting the network output. In this work, we aim to further\nexploit this spherical geometry of the weights when performing quantization by\nconsidering Pyramid Vector Quantization (PVQ) for large language models.\nArranging points evenly on the sphere is notoriously difficult, especially in\nhigh dimensions, and in case approximate solutions exists, representing points\nexplicitly in a codebook is typically not feasible due to its additional memory\ncost. Instead, PVQ uses a fixed integer lattice on the sphere by projecting\npoints onto the 1-sphere, which allows for efficient encoding and decoding\nwithout requiring an explicit codebook in memory. 
To obtain a practical\nalgorithm, we propose to combine PVQ with scale quantization for which we\nderive theoretically optimal quantizations, under empirically verified\nassumptions. Further, we extend pyramid vector quantization to use Hessian\ninformation to minimize quantization error under expected feature activations,\ninstead of only relying on weight magnitudes. Experimentally, we achieves\nstate-of-the-art quantization performance with pareto-optimal trade-off between\nperformance and bits per weight and bits per activation, compared to compared\nmethods. On weight-only, we find that we can quantize a Llama-3 70B model to\n3.25 bits per weight and retain 98\\% accuracy on downstream tasks.\n","authors":["Tycho F. A. van der Ouderaa","Maximilian L. Croci","Agrin Hilmkil","James Hensman"],"pdf_url":"https://arxiv.org/pdf/2410.16926v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.00850v2","updated":"2024-12-04T10:45:41Z","published":"2024-10-30T11:16:04Z","title":"GWQ: Gradient-Aware Weight Quantization for Large Language Models","summary":" Large language models (LLMs) show impressive performance in solving complex\nlanguage tasks. However, its large number of parameters present significant\nchallenges for the deployment and application of the model on edge devices.\nCompressing large language models to low bits can enable them to run on\nresource-constrained devices, often leading to performance degradation. To\naddress this problem, we propose gradient-aware weight quantization (GWQ), the\nfirst quantization approach for low-bit weight quantization that leverages\ngradients to localize outliers, requiring only a minimal amount of calibration\ndata for outlier detection. GWQ retains the weights corresponding to the top 1%\noutliers preferentially at FP16 precision, while the remaining non-outlier\nweights are stored in a low-bit format. 
GWQ found experimentally that utilizing\nthe sensitive weights in the gradient localization model is more scientific\ncompared to utilizing the sensitive weights in the Hessian matrix localization\nmodel. Compared to current quantization methods, GWQ can be applied to multiple\nlanguage models and achieves lower PPL on the WikiText2 and C4 dataset. In the\nzero-shot task, GWQ quantized models have higher accuracy compared to other\nquantization methods. GWQ is also suitable for multimodal model quantization,\nand the quantized Qwen-VL family model is more accurate than other methods.\nZero-shot target detection task dataset RefCOCO outperforms the current\nstat-of-the-arts method SPQR. GWQ achieves 1.2 times inference speedup in\ncomparison to the original model, and effectively reduces the inference memory.\n","authors":["Yihua Shao","Siyu Liang","Zijian Ling","Minxi Yan","Haiyang Liu","Siyu Chen","Ziyang Yan","Chenyu Zhang","Haotong Qin","Michele Magno","Yang Yang","Zhen Lei","Yan Wang","Jingcai Guo","Ling Shao","Hao Tang"],"pdf_url":"https://arxiv.org/pdf/2411.00850v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06209v3","updated":"2024-12-04T10:33:18Z","published":"2024-04-09T10:58:21Z","title":"Elephants Never Forget: Memorization and Learning of Tabular Data in\n Large Language Models","summary":" While many have shown how Large Language Models (LLMs) can be applied to a\ndiverse set of tasks, the critical issues of data contamination and\nmemorization are often glossed over. In this work, we address this concern for\ntabular data. Specifically, we introduce a variety of different techniques to\nassess whether a language model has seen a tabular dataset during training.\nThis investigation reveals that LLMs have memorized many popular tabular\ndatasets verbatim. We then compare the few-shot learning performance of LLMs on\ndatasets that were seen during training to the performance on datasets released\nafter training. 
We find that LLMs perform better on datasets seen during\ntraining, indicating that memorization leads to overfitting. At the same time,\nLLMs show non-trivial performance on novel datasets and are surprisingly robust\nto data transformations. We then investigate the in-context statistical\nlearning abilities of LLMs. While LLMs are significantly better than random at\nsolving statistical classification problems, the sample efficiency of few-shot\nlearning lags behind traditional statistical learning algorithms, especially as\nthe dimension of the problem increases. This suggests that much of the observed\nfew-shot performance on novel real-world datasets is due to the LLM's world\nknowledge. Overall, our results highlight the importance of testing whether an\nLLM has seen an evaluation dataset during pre-training. We release the\nhttps://github.com/interpretml/LLM-Tabular-Memorization-Checker Python package\nto test LLMs for memorization of tabular datasets.\n","authors":["Sebastian Bordt","Harsha Nori","Vanessa Rodrigues","Besmira Nushi","Rich Caruana"],"pdf_url":"https://arxiv.org/pdf/2404.06209v3.pdf","comment":"COLM camera ready, fix typo"},{"id":"http://arxiv.org/abs/2412.03190v1","updated":"2024-12-04T10:22:34Z","published":"2024-12-04T10:22:34Z","title":"Node Classification With Integrated Reject Option","summary":" One of the key tasks in graph learning is node classification. While Graph\nneural networks have been used for various applications, their adaptivity to\nreject option setting is not previously explored. In this paper, we propose\nNCwR, a novel approach to node classification in Graph Neural Networks (GNNs)\nwith an integrated reject option, which allows the model to abstain from making\npredictions when uncertainty is high. We propose both cost-based and\ncoverage-based methods for classification with abstention in node\nclassification setting using GNNs. 
We perform experiments using our method on\nthree standard citation network datasets Cora, Citeseer and Pubmed and compare\nwith relevant baselines. We also model the Legal judgment prediction problem on\nILDC dataset as a node classification problem where nodes represent legal cases\nand edges represent citations. We further interpret the model by analyzing the\ncases that the model abstains from predicting by visualizing which part of the\ninput features influenced this decision.\n","authors":["Uday Bhaskar","Jayadratha Gayen","Charu Sharma","Naresh Manwani"],"pdf_url":"https://arxiv.org/pdf/2412.03190v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03188v1","updated":"2024-12-04T10:20:21Z","published":"2024-12-04T10:20:21Z","title":"Semi-decentralized Training of Spatio-Temporal Graph Neural Networks for\n Traffic Prediction","summary":" In smart mobility, large networks of geographically distributed sensors\nproduce vast amounts of high-frequency spatio-temporal data that must be\nprocessed in real time to avoid major disruptions. Traditional centralized\napproaches are increasingly unsuitable to this task, as they struggle to scale\nwith expanding sensor networks, and reliability issues in central components\ncan easily affect the whole deployment. To address these challenges, we explore\nand adapt semi-decentralized training techniques for Spatio-Temporal Graph\nNeural Networks (ST-GNNs) in smart mobility domain. We implement a simulation\nframework where sensors are grouped by proximity into multiple cloudlets, each\nhandling a subgraph of the traffic graph, fetching node features from other\ncloudlets to train its own local ST-GNN model, and exchanging model updates\nwith other cloudlets to ensure consistency, enhancing scalability and removing\nreliance on a centralized aggregator. 
We perform extensive comparative\nevaluation of four different ST-GNN training setups -- centralized, traditional\nFL, server-free FL, and Gossip Learning -- on large-scale traffic datasets, the\nMETR-LA and PeMS-BAY datasets, for short-, mid-, and long-term vehicle speed\npredictions. Experimental results show that semi-decentralized setups are\ncomparable to centralized approaches in performance metrics, while offering\nadvantages in terms of scalability and fault tolerance. In addition, we\nhighlight often overlooked issues in existing literature for distributed\nST-GNNs, such as the variation in model performance across different\ngeographical areas due to region-specific traffic patterns, and the significant\ncommunication overhead and computational costs that arise from the large\nreceptive field of GNNs, leading to substantial data transfers and increased\ncomputation of partial embeddings.\n","authors":["Ivan Kralj","Lodovico Giaretta","Gordan Ježić","Ivana Podnar Žarko","Šarūnas Girdzijauskas"],"pdf_url":"https://arxiv.org/pdf/2412.03188v1.pdf","comment":"8 pages, 4 figures, 3 tables, conference"},{"id":"http://arxiv.org/abs/2401.10962v2","updated":"2024-12-04T10:09:46Z","published":"2024-01-19T11:45:31Z","title":"One Step Learning, One Step Review","summary":" Visual fine-tuning has garnered significant attention with the rise of\npre-trained vision models. The current prevailing method, full fine-tuning,\nsuffers from the issue of knowledge forgetting as it focuses solely on fitting\nthe downstream training set. In this paper, we propose a novel weight\nrollback-based fine-tuning method called OLOR (One step Learning, One step\nReview). OLOR combines fine-tuning with optimizers, incorporating a weight\nrollback term into the weight update term at each step. This ensures\nconsistency in the weight range of upstream and downstream models, effectively\nmitigating knowledge forgetting and enhancing fine-tuning performance. 
In\naddition, a layer-wise penalty is presented to employ penalty decay and the\ndiversified decay rate to adjust the weight rollback levels of layers for\nadapting varying downstream tasks. Through extensive experiments on various\ntasks such as image classification, object detection, semantic segmentation,\nand instance segmentation, we demonstrate the general applicability and\nstate-of-the-art performance of our proposed OLOR. Code is available at\nhttps://github.com/rainbow-xiao/OLOR-AAAI-2024.\n","authors":["Xiaolong Huang","Qiankun Li","Xueran Li","Xuesong Gao"],"pdf_url":"https://arxiv.org/pdf/2401.10962v2.pdf","comment":"Published at the 38th AAAI Conference on Artificial Intelligence\n (AAAI 2024)"},{"id":"http://arxiv.org/abs/2310.01225v5","updated":"2024-12-04T10:04:02Z","published":"2023-10-02T14:12:53Z","title":"A path-norm toolkit for modern networks: consequences, promises and\n challenges","summary":" This work introduces the first toolkit around path-norms that fully\nencompasses general DAG ReLU networks with biases, skip connections and any\noperation based on the extraction of order statistics: max pooling, GroupSort\netc. 
This toolkit notably allows us to establish generalization bounds for\nmodern neural networks that are not only the most widely applicable path-norm\nbased ones, but also recover or beat the sharpest known bounds of this type.\nThese extended path-norms further enjoy the usual benefits of path-norms: ease\nof computation, invariance under the symmetries of the network, and improved\nsharpness on layered fully-connected networks compared to the product of\noperator norms, another complexity measure most commonly used.\n The versatility of the toolkit and its ease of implementation allow us to\nchallenge the concrete promises of path-norm-based generalization bounds, by\nnumerically evaluating the sharpest known bounds for ResNets on ImageNet.\n","authors":["Antoine Gonon","Nicolas Brisebarre","Elisa Riccietti","Rémi Gribonval"],"pdf_url":"https://arxiv.org/pdf/2310.01225v5.pdf","comment":"Erratum: in the published version there was a typo in the definition\n of the activation matrix in Definition A.3. This is fixed with this new\n version"},{"id":"http://arxiv.org/abs/2412.03178v1","updated":"2024-12-04T10:03:52Z","published":"2024-12-04T10:03:52Z","title":"Towards Understanding and Quantifying Uncertainty for Text-to-Image\n Generation","summary":" Uncertainty quantification in text-to-image (T2I) generative models is\ncrucial for understanding model behavior and improving output reliability. In\nthis paper, we are the first to quantify and evaluate the uncertainty of T2I\nmodels with respect to the prompt. Alongside adapting existing approaches\ndesigned to measure uncertainty in the image space, we also introduce\nPrompt-based UNCertainty Estimation for T2I models (PUNC), a novel method\nleveraging Large Vision-Language Models (LVLMs) to better address uncertainties\narising from the semantics of the prompt and generated images. 
PUNC utilizes a\nLVLM to caption a generated image, and then compares the caption with the\noriginal prompt in the more semantically meaningful text space. PUNC also\nenables the disentanglement of both aleatoric and epistemic uncertainties via\nprecision and recall, which image-space approaches are unable to do. Extensive\nexperiments demonstrate that PUNC outperforms state-of-the-art uncertainty\nestimation techniques across various settings. Uncertainty quantification in\ntext-to-image generation models can be used on various applications including\nbias detection, copyright protection, and OOD detection. We also introduce a\ncomprehensive dataset of text prompts and generation pairs to foster further\nresearch in uncertainty quantification for generative models. Our findings\nillustrate that PUNC not only achieves competitive performance but also enables\nnovel applications in evaluating and improving the trustworthiness of\ntext-to-image models.\n","authors":["Gianni Franchi","Dat Nguyen Trong","Nacim Belkhir","Guoxuan Xia","Andrea Pilzer"],"pdf_url":"https://arxiv.org/pdf/2412.03178v1.pdf","comment":"28 pages and 22 figures"},{"id":"http://arxiv.org/abs/2407.15017v4","updated":"2024-12-04T09:54:59Z","published":"2024-07-22T06:15:59Z","title":"Knowledge Mechanisms in Large Language Models: A Survey and Perspective","summary":" Understanding knowledge mechanisms in Large Language Models (LLMs) is crucial\nfor advancing towards trustworthy AGI. This paper reviews knowledge mechanism\nanalysis from a novel taxonomy including knowledge utilization and evolution.\nKnowledge utilization delves into the mechanism of memorization, comprehension\nand application, and creation. Knowledge evolution focuses on the dynamic\nprogression of knowledge within individual and group LLMs. 
Moreover, we discuss\nwhat knowledge LLMs have learned, the reasons for the fragility of parametric\nknowledge, and the potential dark knowledge (hypothesis) that will be\nchallenging to address. We hope this work can help understand knowledge in LLMs\nand provide insights for future research.\n","authors":["Mengru Wang","Yunzhi Yao","Ziwen Xu","Shuofei Qiao","Shumin Deng","Peng Wang","Xiang Chen","Jia-Chen Gu","Yong Jiang","Pengjun Xie","Fei Huang","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.15017v4.pdf","comment":"EMNLP 2024 Findings; 39 pages (v4)"},{"id":"http://arxiv.org/abs/2207.09959v4","updated":"2024-12-04T09:45:26Z","published":"2022-07-20T15:09:16Z","title":"Exploration of Parameter Spaces Assisted by Machine Learning","summary":" We demonstrate two sampling procedures assisted by machine learning models\nvia regression and classification. The main objective is the use of a neural\nnetwork to suggest points likely inside regions of interest, reducing the\nnumber of evaluations of time consuming calculations. We compare results from\nthis approach with results from other sampling methods, namely Markov chain\nMonte Carlo and MultiNest, obtaining results that range from comparably similar\nto arguably better. In particular, we augment our classifier method with a\nboosting technique that rapidly increases the efficiency within a few\niterations. We show results from our methods applied to a toy model and the\ntype II 2HDM, using 3 and 7 free parameters, respectively. The code used for\nthis paper and instructions are publicly available on the web.\n","authors":["A. Hammad","Myeonghun Park","Raymundo Ramos","Pankaj Saha"],"pdf_url":"https://arxiv.org/pdf/2207.09959v4.pdf","comment":"30 pages, 9 figures. Matches published version. 
Code and instructions\n are available on https://github.com/AHamamd150/MLscanner"},{"id":"http://arxiv.org/abs/2402.14400v3","updated":"2024-12-04T09:44:26Z","published":"2024-02-22T09:34:48Z","title":"Learning Developmental Age from 3D Infant Kinetics Using Adaptive Graph\n Neural Networks","summary":" Reliable methods for the neurodevelopmental assessment of infants are\nessential for early detection of problems that may need prompt interventions.\nSpontaneous motor activity, or 'kinetics', is shown to provide a powerful\nsurrogate measure of upcoming neurodevelopment. However, its assessment is by\nand large qualitative and subjective, focusing on visually identified,\nage-specific gestures. In this work, we introduce Kinetic Age (KA), a novel\ndata-driven metric that quantifies neurodevelopmental maturity by predicting an\ninfant's age based on their movement patterns. KA offers an interpretable and\ngeneralizable proxy for motor development. Our method leverages 3D video\nrecordings of infants, processed with pose estimation to extract\nspatio-temporal series of anatomical landmarks, which are released as a new\nopenly available dataset. These data are modeled using adaptive graph\nconvolutional networks, able to capture the spatio-temporal dependencies in\ninfant movements. We also show that our data-driven approach achieves\nimprovement over traditional machine learning baselines based on manually\nengineered features.\n","authors":["Daniel Holmberg","Manu Airaksinen","Viviana Marchi","Andrea Guzzetta","Anna Kivi","Leena Haataja","Sampsa Vanhatalo","Teemu Roos"],"pdf_url":"https://arxiv.org/pdf/2402.14400v3.pdf","comment":"15 pages, 9 figures. 
Code repository available via\n https://github.com/deinal/infant-aagcn"},{"id":"http://arxiv.org/abs/2412.01064v2","updated":"2024-12-04T09:43:18Z","published":"2024-12-02T02:50:07Z","title":"FLOAT: Generative Motion Latent Flow Matching for Audio-driven Talking\n Portrait","summary":" With the rapid advancement of diffusion-based generative models, portrait\nimage animation has achieved remarkable results. However, it still faces\nchallenges in temporally consistent video generation and fast sampling due to\nits iterative sampling nature. This paper presents FLOAT, an audio-driven\ntalking portrait video generation method based on flow matching generative\nmodel. We shift the generative modeling from the pixel-based latent space to a\nlearned motion latent space, enabling efficient design of temporally consistent\nmotion. To achieve this, we introduce a transformer-based vector field\npredictor with a simple yet effective frame-wise conditioning mechanism.\nAdditionally, our method supports speech-driven emotion enhancement, enabling a\nnatural incorporation of expressive motions. Extensive experiments demonstrate\nthat our method outperforms state-of-the-art audio-driven talking portrait\nmethods in terms of visual quality, motion fidelity, and efficiency.\n","authors":["Taekyung Ki","Dongchan Min","Gyeongsu Chae"],"pdf_url":"https://arxiv.org/pdf/2412.01064v2.pdf","comment":"Project page: https://deepbrainai-research.github.io/float/"},{"id":"http://arxiv.org/abs/2412.03158v1","updated":"2024-12-04T09:35:03Z","published":"2024-12-04T09:35:03Z","title":"LEP-QNN: Loan Eligibility Prediction Using Quantum Neural Networks","summary":" Predicting loan eligibility with high accuracy remains a significant\nchallenge in the finance sector. Accurate predictions enable financial\ninstitutions to make informed decisions, mitigate risks, and effectively adapt\nservices to meet customer needs. 
However, the complexity and the\nhigh-dimensional nature of financial data have always posed significant\nchallenges to achieving this level of precision. To overcome these issues, we\npropose a novel approach that employs Quantum Machine Learning (QML) for Loan\nEligibility Prediction using Quantum Neural Networks (LEP-QNN).Our innovative\napproach achieves an accuracy of 98% in predicting loan eligibility from a\nsingle, comprehensive dataset. This performance boost is attributed to the\nstrategic implementation of a dropout mechanism within the quantum circuit,\naimed at minimizing overfitting and thereby improving the model's predictive\nreliability. In addition, our exploration of various optimizers leads to\nidentifying the most efficient setup for our LEP-QNN framework, optimizing its\nperformance. We also rigorously evaluate the resilience of LEP-QNN under\ndifferent quantum noise scenarios, ensuring its robustness and dependability\nfor quantum computing environments. This research showcases the potential of\nQML in financial predictions and establishes a foundational guide for advancing\nQML technologies, marking a step towards developing advanced, quantum-driven\nfinancial decision-making tools.\n","authors":["Nouhaila Innan","Alberto Marchisio","Mohamed Bennai","Muhammad Shafique"],"pdf_url":"https://arxiv.org/pdf/2412.03158v1.pdf","comment":"8 pages. 6 figures, 3 tables"},{"id":"http://arxiv.org/abs/2411.00809v2","updated":"2024-12-04T09:26:47Z","published":"2024-10-23T16:16:15Z","title":"Adaptive Dense Reward: Understanding the Gap Between Action and Reward\n Space in Alignment","summary":" Reinforcement Learning from Human Feedback (RLHF) has proven highly effective\nin aligning Large Language Models (LLMs) with human preferences. However, the\noriginal RLHF typically optimizes under an overall reward, which can lead to a\nsuboptimal learning process. 
This limitation stems from RLHF's lack of\nawareness regarding which specific tokens should be reinforced or suppressed.\nMoreover, conflicts in supervision can arise, for instance, when a chosen\nresponse includes erroneous tokens, while a rejected response contains accurate\nelements. To rectify these shortcomings, increasing dense reward methods, such\nas step-wise and token-wise RLHF, have been proposed. However, these existing\nmethods are limited to specific tasks (like mathematics). In this paper, we\npropose the ``Adaptive Message-wise RLHF'' method, which robustly applies to\nvarious tasks. By defining pivot tokens as key indicators, our approach\nadaptively identifies essential information and converts sequence-level\nsupervision into fine-grained, subsequence-level supervision. This aligns the\ndensity of rewards and action spaces more closely with the information density\nof the input. Experiments demonstrate that our method can be integrated into\nvarious training methods, significantly mitigating hallucinations and\ncatastrophic forgetting problems, while outperforming other methods on multiple\nevaluation metrics. 
Our method improves the success rate on adversarial samples\nby 10\\% compared to the sample-wise approach, and achieves a 1.3\\% improvement\non evaluation benchmarks such as MMLU, GSM8K, HumanEval, etc.\n","authors":["Yanshi Li","Shaopan Xiong","Gengru Chen","Xiaoyang Li","Yijia Luo","Xingyao Zhang","Yanhui Huang","Xingyuan Bu","Yingshui Tan","Chun Yuan","Jiamang Wang","Wenbo Su","Bo Zheng"],"pdf_url":"https://arxiv.org/pdf/2411.00809v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03154v1","updated":"2024-12-04T09:24:33Z","published":"2024-12-04T09:24:33Z","title":"Testing Neural Network Verifiers: A Soundness Benchmark with Hidden\n Counterexamples","summary":" In recent years, many neural network (NN) verifiers have been developed to\nformally verify certain properties of neural networks such as robustness.\nAlthough many benchmarks have been constructed to evaluate the performance of\nNN verifiers, they typically lack a ground-truth for hard instances where no\ncurrent verifier can verify and no counterexample can be found, which makes it\ndifficult to check the soundness of a new verifier if it claims to verify hard\ninstances which no other verifier can do. We propose to develop a soundness\nbenchmark for NN verification. Our benchmark contains instances with\ndeliberately inserted counterexamples while we also try to hide the\ncounterexamples from regular adversarial attacks which can be used for finding\ncounterexamples. We design a training method to produce neural networks with\nsuch hidden counterexamples. Our benchmark aims to be used for testing the\nsoundness of NN verifiers and identifying falsely claimed verifiability when it\nis known that hidden counterexamples exist. We systematically construct our\nbenchmark and generate instances across diverse model architectures, activation\nfunctions, input sizes, and perturbation radii. 
We demonstrate that our\nbenchmark successfully identifies bugs in state-of-the-art NN verifiers, as\nwell as synthetic bugs, providing a crucial step toward enhancing the\nreliability of testing NN verifiers. Our code is available at\nhttps://github.com/MVP-Harry/SoundnessBench and our benchmark is available at\nhttps://huggingface.co/datasets/SoundnessBench/SoundnessBench.\n","authors":["Xingjian Zhou","Hongji Xu","Andy Xu","Zhouxing Shi","Cho-Jui Hsieh","Huan Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.03154v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2412.03145v1","updated":"2024-12-04T09:11:33Z","published":"2024-12-04T09:11:33Z","title":"Topological Trajectory Classification and Landmark Inference on\n Simplicial Complexes","summary":" We consider the problem of classifying trajectories on a discrete or\ndiscretised 2-dimensional manifold modelled by a simplicial complex. Previous\nworks have proposed to project the trajectories into the harmonic eigenspace of\nthe Hodge Laplacian, and then cluster the resulting embeddings. However, if the\nconsidered space has vanishing homology (i.e., no \"holes\"), then the harmonic\nspace of the 1-Hodge Laplacian is trivial and thus the approach fails. Here we\npropose to view this issue akin to a sensor placement problem and present an\nalgorithm that aims to learn \"optimal holes\" to distinguish a set of given\ntrajectory classes. Specifically, given a set of labelled trajectories, which\nwe interpret as edge-flows on the underlying simplicial complex, we search for\n2-simplices whose deletion results in an optimal separation of the trajectory\nlabels according to the corresponding spectral embedding of the trajectories\ninto the harmonic space. Finally, we generalise this approach to the\nunsupervised setting.\n","authors":["Vincent P. Grande","Josef Hoppe","Florian Frantzen","Michael T. 
Schaub"],"pdf_url":"https://arxiv.org/pdf/2412.03145v1.pdf","comment":"5 pages, 4 figures, Accepted at the 58th Annual Asilomar Conference\n on Signals, Systems, and Computers 2024"},{"id":"http://arxiv.org/abs/2412.03134v1","updated":"2024-12-04T08:57:03Z","published":"2024-12-04T08:57:03Z","title":"Generalized Diffusion Model with Adjusted Offset Noise","summary":" Diffusion models have become fundamental tools for modeling data\ndistributions in machine learning and have applications in image generation,\ndrug discovery, and audio synthesis. Despite their success, these models face\nchallenges when generating data with extreme brightness values, as evidenced by\nlimitations in widely used frameworks like Stable Diffusion. Offset noise has\nbeen proposed as an empirical solution to this issue, yet its theoretical basis\nremains insufficiently explored. In this paper, we propose a generalized\ndiffusion model that naturally incorporates additional noise within a rigorous\nprobabilistic framework. Our approach modifies both the forward and reverse\ndiffusion processes, enabling inputs to be diffused into Gaussian distributions\nwith arbitrary mean structures. We derive a loss function based on the evidence\nlower bound, establishing its theoretical equivalence to offset noise with\ncertain adjustments, while broadening its applicability. 
Experiments on\nsynthetic datasets demonstrate that our model effectively addresses\nbrightness-related challenges and outperforms conventional methods in\nhigh-dimensional scenarios.\n","authors":["Takuro Kutsuna"],"pdf_url":"https://arxiv.org/pdf/2412.03134v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03131v1","updated":"2024-12-04T08:51:23Z","published":"2024-12-04T08:51:23Z","title":"Unifying KV Cache Compression for Large Language Models with LeanKV","summary":" Large language models (LLMs) demonstrate exceptional performance but incur\nhigh serving costs due to substantial memory demands, with the key-value (KV)\ncache being a primary bottleneck. Existing KV cache compression methods,\nincluding quantization and pruning, struggle with limitations such as uniform\ntreatment of keys and values and static memory allocation across attention\nheads. To address these challenges, we introduce LeanKV, a unified KV cache\ncompression framework that enhances LLM serving efficiency without compromising\naccuracy through three innovations: (1) Hetero-KV quantization, which stores\nkeys at a higher precision than values to reflect their greater impact on\nattention computations; (2) per-head dynamic sparsity, which allocates memory\nbased on token importance per head and per request; and (3) unified KV\ncompression, integrating mixed-precision quantization and selective pruning to\nenable a smooth tradeoff between model accuracy and memory efficiency. To\nefficiently support these techniques, LeanKV introduces systems optimizations\nincluding unified paging and on-GPU parallel memory management. Implemented on\nvLLM, LeanKV compresses the KV cache by $3.0\\times$ to $5.0\\times$ without\naccuracy loss and up to $11.0\\times$ with under 5% accuracy loss, enhancing\nthroughput by $1.9\\times$ to $2.5\\times$, and up to $6.9\\times$.\n","authors":["Yanqi Zhang","Yuwei Hu","Runyuan Zhao","John C. S. 
Lui","Haibo Chen"],"pdf_url":"https://arxiv.org/pdf/2412.03131v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.16615v2","updated":"2024-12-04T08:44:10Z","published":"2024-11-25T17:54:29Z","title":"Graph Pooling by Local Cluster Selection","summary":" Graph pooling is a family of operations which take graphs as input and\nproduce shrinked graphs as output. Modern graph pooling methods are trainable\nand, in general inserted in Graph Neural Networks (GNNs) architectures as graph\nshrinking operators along the (deep) processing pipeline. This work proposes a\nnovel procedure for pooling graphs, along with a node-centred graph pooling\noperator.\n","authors":["Yizhu Chen"],"pdf_url":"https://arxiv.org/pdf/2411.16615v2.pdf","comment":"11 pages, 4 figures"},{"id":"http://arxiv.org/abs/2412.03120v1","updated":"2024-12-04T08:39:45Z","published":"2024-12-04T08:39:45Z","title":"Sinkhorn Algorithm for Sequentially Composed Optimal Transports","summary":" Sinkhorn algorithm is the de-facto standard approximation algorithm for\noptimal transport, which has been applied to a variety of applications,\nincluding image processing and natural language processing. In theory, the\nproof of its convergence follows from the convergence of the Sinkhorn--Knopp\nalgorithm for the matrix scaling problem, and Altschuler et al. show that its\nworst-case time complexity is in near-linear time. Very recently, sequentially\ncomposed optimal transports were proposed by Watanabe and Isobe as a\nhierarchical extension of optimal transports. In this paper, we present an\nefficient approximation algorithm, namely Sinkhorn algorithm for sequentially\ncomposed optimal transports, for its entropic regularization. 
Furthermore, we\npresent a theoretical analysis of the Sinkhorn algorithm, namely (i) its\nexponential convergence to the optimal solution with respect to the Hilbert\npseudometric, and (ii) a worst-case complexity analysis for the case of one\nsequential composition.\n","authors":["Kazuki Watanabe","Noboru Isobe"],"pdf_url":"https://arxiv.org/pdf/2412.03120v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2405.16436v3","updated":"2024-12-04T08:15:35Z","published":"2024-05-26T05:38:50Z","title":"Provably Mitigating Overoptimization in RLHF: Your SFT Loss is\n Implicitly an Adversarial Regularizer","summary":" Aligning generative models with human preference via RLHF typically suffers\nfrom overoptimization, where an imperfectly learned reward model can misguide\nthe generative model to output undesired responses. We investigate this problem\nin a principled manner by identifying the source of the misalignment as a form\nof distributional shift and uncertainty in learning human preferences. To\nmitigate overoptimization, we first propose a theoretical algorithm that\nchooses the best policy for an adversarially chosen reward model; one that\nsimultaneously minimizes the maximum likelihood estimation of the loss and a\nreward penalty term. Here, the reward penalty term is introduced to prevent the\npolicy from choosing actions with spurious high proxy rewards, resulting in\nprovable sample efficiency of the algorithm under a partial coverage style\ncondition. Moving from theory to practice, the proposed algorithm further\nenjoys an equivalent but surprisingly easy-to-implement reformulation. Using\nthe equivalence between reward models and the corresponding optimal policy, the\nalgorithm features a simple objective that combines: (i) a preference\noptimization loss that directly aligns the policy with human preference, and\n(ii) a supervised learning loss that explicitly imitates the policy with a\n(suitable) baseline distribution. 
In the context of aligning large language\nmodels (LLM), this objective fuses the direct preference optimization (DPO)\nloss with the supervised fine-tuning (SFT) loss to help mitigate the\noveroptimization towards undesired responses, for which we name the algorithm\nRegularized Preference Optimization (RPO). Experiments of aligning LLMs\ndemonstrate the improved performance of RPO compared with DPO baselines. Our\nwork sheds light on the interplay between preference optimization and SFT in\ntuning LLMs with both theoretical guarantees and empirical evidence.\n","authors":["Zhihan Liu","Miao Lu","Shenao Zhang","Boyi Liu","Hongyi Guo","Yingxiang Yang","Jose Blanchet","Zhaoran Wang"],"pdf_url":"https://arxiv.org/pdf/2405.16436v3.pdf","comment":"Accepted by The Thirty-Eighth Annual Conference on Neural Information\n Processing Systems. 31 pages, 7 figures"},{"id":"http://arxiv.org/abs/2412.03105v1","updated":"2024-12-04T08:10:48Z","published":"2024-12-04T08:10:48Z","title":"Few-Shot Learning with Adaptive Weight Masking in Conditional GANs","summary":" Deep learning has revolutionized various fields, yet its efficacy is hindered\nby overfitting and the requirement of extensive annotated data, particularly in\nfew-shot learning scenarios where limited samples are available. This paper\nintroduces a novel approach to few-shot learning by employing a Residual Weight\nMasking Conditional Generative Adversarial Network (RWM-CGAN) for data\naugmentation. The proposed model integrates residual units within the generator\nto enhance network depth and sample quality, coupled with a weight mask\nregularization technique in the discriminator to improve feature learning from\nsmall-sample categories. This method addresses the core issues of robustness\nand generalization in few-shot learning by providing a controlled and clear\naugmentation of the sample space. 
Extensive experiments demonstrate that\nRWM-CGAN not only expands the sample space effectively but also enriches the\ndiversity and quality of generated samples, leading to significant improvements\nin detection and classification accuracy on public datasets. The paper\ncontributes to the advancement of few-shot learning by offering a practical\nsolution to the challenges posed by data scarcity and the need for rapid\ngeneralization to new tasks or categories.\n","authors":["Jiacheng Hu","Zhen Qi","Jianjun Wei","Jiajing Chen","Runyuan Bao","Xinyu Qiu"],"pdf_url":"https://arxiv.org/pdf/2412.03105v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.14946v2","updated":"2024-12-04T07:58:40Z","published":"2024-10-19T02:32:09Z","title":"DEL-Ranking: Ranking-Correction Denoising Framework for Elucidating\n Molecular Affinities in DNA-Encoded Libraries","summary":" DNA-encoded library (DEL) screening has revolutionized the detection of\nprotein-ligand interactions through read counts, enabling rapid exploration of\nvast chemical spaces. However, noise in read counts, stemming from nonspecific\ninteractions, can mislead this exploration process. We present DEL-Ranking, a\nnovel distribution-correction denoising framework that addresses these\nchallenges. Our approach introduces two key innovations: (1) a novel ranking\nloss that rectifies relative magnitude relationships between read counts,\nenabling the learning of causal features determining activity levels, and (2)\nan iterative algorithm employing self-training and consistency loss to\nestablish model coherence between activity label and read count predictions.\nFurthermore, we contribute three new DEL screening datasets, the first to\ncomprehensively include multi-dimensional molecular representations,\nprotein-ligand enrichment values, and their activity labels. These datasets\nmitigate data scarcity issues in AI-driven DEL screening research. 
Rigorous\nevaluation on diverse DEL datasets demonstrates DEL-Ranking's superior\nperformance across multiple correlation metrics, with significant improvements\nin binding affinity prediction accuracy. Our model exhibits zero-shot\ngeneralization ability across different protein targets and successfully\nidentifies potential motifs determining compound binding affinity. This work\nadvances DEL screening analysis and provides valuable resources for future\nresearch in this area.\n","authors":["Hanqun Cao","Mutian He","Ning Ma","Chang-yu Hsieh","Chunbin Gu","Pheng-Ann Heng"],"pdf_url":"https://arxiv.org/pdf/2410.14946v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03097v1","updated":"2024-12-04T07:50:27Z","published":"2024-12-04T07:50:27Z","title":"Enhancing Recommendation Systems with GNNs and Addressing Over-Smoothing","summary":" This paper addresses key challenges in enhancing recommendation systems by\nleveraging Graph Neural Networks (GNNs) and addressing inherent limitations\nsuch as over-smoothing, which reduces model effectiveness as network hierarchy\ndeepens. The proposed approach introduces three GNN-based recommendation\nmodels, specifically designed to mitigate over-smoothing through innovative\nmechanisms like residual connections and identity mapping within the\naggregation propagation process. These modifications enable more effective\ninformation flow across layers, preserving essential user-item interaction\ndetails to improve recommendation accuracy. Additionally, the study emphasizes\nthe critical need for interpretability in recommendation systems, aiming to\nprovide transparent and justifiable suggestions tailored to dynamic user\npreferences. By integrating collaborative filtering with GNN architectures, the\nproposed models not only enhance predictive accuracy but also align\nrecommendations more closely with individual behaviors, adapting to nuanced\nshifts in user interests. 
This work advances the field by tackling both\ntechnical and user-centric challenges, contributing to the development of\nrobust and explainable recommendation systems capable of managing the\ncomplexity and scale of modern online environments.\n","authors":["Wenyi Liu","Ziqi Zhang","Xinshi Li","Jiacheng Hu","Yuanshuai Luo","Junliang Du"],"pdf_url":"https://arxiv.org/pdf/2412.03097v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03092v1","updated":"2024-12-04T07:44:35Z","published":"2024-12-04T07:44:35Z","title":"Revolve: Optimizing AI Systems by Tracking Response Evolution in Textual\n Optimization","summary":" Recent advancements in large language models (LLMs) have significantly\nenhanced the ability of LLM-based systems to perform complex tasks through\nnatural language processing and tool interaction. However, optimizing these\nLLM-based systems for specific tasks remains challenging, often requiring\nmanual interventions like prompt engineering and hyperparameter tuning.\nExisting automatic optimization methods, such as textual feedback-based\ntechniques (e.g., TextGrad), tend to focus on immediate feedback, analogous to\nusing immediate derivatives in traditional numerical gradient descent. However,\nrelying solely on such feedback can be limited when the adjustments made in\nresponse to this feedback are either too small or fluctuate irregularly,\npotentially slowing down or even stalling the optimization process. To overcome\nthese challenges, more adaptive methods are needed, especially in situations\nwhere the system's response is evolving slowly or unpredictably. In this paper,\nwe introduce REVOLVE, an optimization method that tracks how \"R\"esponses\n\"EVOLVE\" across iterations in LLM systems. By focusing on the evolution of\nresponses over time, REVOLVE enables more stable and effective optimization by\nmaking thoughtful, progressive adjustments at each step. 
Experimental results\ndemonstrate that REVOLVE outperforms competitive baselines, achieving a 7.8%\nimprovement in prompt optimization, a 20.72% gain in solution refinement, and a\n29.17% increase in code optimization. Additionally, REVOLVE converges in fewer\niterations, resulting in significant computational savings. These advantages\nhighlight its adaptability and efficiency, positioning REVOLVE as a valuable\ntool for optimizing LLM-based systems and accelerating the development of\nnext-generation AI technologies. Code is available at:\nhttps://github.com/Peiyance/REVOLVE.\n","authors":["Peiyan Zhang","Haibo Jin","Leyang Hu","Xinnuo Li","Liying Kang","Man Luo","Yangqiu Song","Haohan Wang"],"pdf_url":"https://arxiv.org/pdf/2412.03092v1.pdf","comment":"20 pages, 2 figures"},{"id":"http://arxiv.org/abs/2412.03084v1","updated":"2024-12-04T07:26:36Z","published":"2024-12-04T07:26:36Z","title":"Hybrid deep learning-based strategy for the hepatocellular carcinoma\n cancer grade classification of H&E stained liver histopathology images","summary":" Hepatocellular carcinoma (HCC) is a common type of liver cancer whose\nearly-stage diagnosis is a common challenge, mainly due to the manual\nassessment of hematoxylin and eosin-stained whole slide images, which is a\ntime-consuming process and may lead to variability in decision-making. For\naccurate detection of HCC, we propose a hybrid deep learning-based architecture\nthat uses transfer learning to extract the features from pre-trained\nconvolutional neural network (CNN) models and a classifier made up of a\nsequence of fully connected layers. This study uses a publicly available The\nCancer Genome Atlas Hepatocellular Carcinoma (TCGA-LIHC)database (n=491) for\nmodel development and database of Kasturba Gandhi Medical College (KMC), India\nfor validation. The pre-processing step involves patch extraction, colour\nnormalization, and augmentation that results in 3920 patches for the TCGA\ndataset. 
The developed hybrid deep neural network consisting of a CNN-based\npre-trained feature extractor and a customized artificial neural network-based\nclassifier is trained using five-fold cross-validation. For this study, eight\ndifferent state-of-the-art models are trained and tested as feature extractors\nfor the proposed hybrid model. The proposed hybrid model with ResNet50-based\nfeature extractor provided the sensitivity, specificity, F1-score, accuracy,\nand AUC of 100.00%, 100.00%, 100.00%, 100.00%, and 1.00, respectively on the\nTCGA database. On the KMC database, EfficientNetb3 resulted in the optimal\nchoice of the feature extractor giving sensitivity, specificity, F1-score,\naccuracy, and AUC of 96.97, 98.85, 96.71, 96.71, and 0.99, respectively. The\nproposed hybrid models showed improvement in accuracy of 2% and 4% over the\npre-trained models in TCGA-LIHC and KMC databases.\n","authors":["Ajinkya Deshpande","Deep Gupta","Ankit Bhurane","Nisha Meshram","Sneha Singh","Petia Radeva"],"pdf_url":"https://arxiv.org/pdf/2412.03084v1.pdf","comment":"14 figure, 9 tables"},{"id":"http://arxiv.org/abs/2412.03083v1","updated":"2024-12-04T07:21:23Z","published":"2024-12-04T07:21:23Z","title":"A Scalable Quantum Neural Network for Approximate SRBB-Based Unitary\n Synthesis","summary":" In this work, scalable quantum neural networks are introduced to approximate\nunitary evolutions through the Standard Recursive Block Basis (SRBB) and,\nsubsequently, redesigned with a reduced number of CNOTs. This algebraic\napproach to the problem of unitary synthesis exploits Lie algebras and their\ntopological features to obtain scalable parameterizations of unitary operators.\nFirst, the recursive algorithm that builds the SRBB is presented, framed in the\noriginal scalability scheme already known to the literature only from a\ntheoretical point of view. Unexpectedly, 2-qubit systems emerge as a special\ncase outside this scheme. 
Furthermore, an algorithm to reduce the number of\nCNOTs is proposed, thus deriving a new implementable scaling scheme that\nrequires one single layer of approximation. From the mathematical algorithm,\nthe scalable CNOT-reduced quantum neural network is implemented and its\nperformance is assessed with a variety of different unitary matrices, both\nsparse and dense, up to 6 qubits via the PennyLane library. The effectiveness\nof the approximation is measured with different metrics in relation to two\noptimizers: a gradient-based method and the Nelder-Mead method. The approximate\nSRBB-based synthesis algorithm with CNOT-reduction is also tested on real\nhardware and compared with other valid approximation and decomposition methods\navailable in the literature.\n","authors":["Giacomo Belli","Marco Mordacci","Michele Amoretti"],"pdf_url":"https://arxiv.org/pdf/2412.03083v1.pdf","comment":"Journal"},{"id":"http://arxiv.org/abs/2410.07170v2","updated":"2024-12-04T07:18:17Z","published":"2024-10-09T17:59:06Z","title":"One Initialization to Rule them All: Fine-tuning via Explained Variance\n Adaptation","summary":" Foundation models (FMs) are pre-trained on large-scale datasets and then\nfine-tuned on a downstream task for a specific application. The most successful\nand most commonly used fine-tuning method is to update the pre-trained weights\nvia a low-rank adaptation (LoRA). LoRA introduces new weight matrices that are\nusually initialized at random with a uniform rank distribution across the model\nweights. Recent works focus on different initialization schemes or the learning\nof adaptive ranks during fine-tuning. Both approaches have only been\ninvestigated in isolation, resulting in slow convergence or a uniform rank\ndistribution, in turn leading to suboptimal performance. We propose to improve\nLoRA by initializing the new weights in a data-driven manner by computing\nsingular value decomposition (SVD) on minibatches of activation vectors. 
Then,\nwe initialize the LoRA matrices with the obtained right-singular vectors and\nredistribute ranks among all weight matrices to provably store the maximum\namount of information of the downstream data in the newly introduced weights.\nIn this way, only what information to maintain or neglect during the\nfine-tuning process needs to be learned. We call our new method Explained\nVariance Adaptation (EVA). We apply EVA to a variety of fine-tuning tasks\nranging from language generation and understanding to image classification and\nreinforcement learning. EVA exhibits faster convergence than competitors and\nachieves the highest average score across a multitude of tasks per domain while\nreducing the number of trainable parameters through rank redistribution.\n","authors":["Fabian Paischer","Lukas Hauzenberger","Thomas Schmied","Benedikt Alkin","Marc Peter Deisenroth","Sepp Hochreiter"],"pdf_url":"https://arxiv.org/pdf/2410.07170v2.pdf","comment":"11 pages + references and appendix, code available at\n https://github.com/ml-jku/EVA"},{"id":"http://arxiv.org/abs/2412.02538v2","updated":"2024-12-04T07:11:07Z","published":"2024-12-03T16:32:19Z","title":"On Privacy, Security, and Trustworthiness in Distributed Wireless Large\n AI Models (WLAM)","summary":" Combining wireless communication with large artificial intelligence (AI)\nmodels can open up a myriad of novel application scenarios. In sixth generation\n(6G) networks, ubiquitous communication and computing resources allow large AI\nmodels to serve democratic large AI models-related services to enable real-time\napplications like autonomous vehicles, smart cities, and Internet of Things\n(IoT) ecosystems. However, the security considerations and sustainable\ncommunication resources limit the deployment of large AI models over\ndistributed wireless networks. This paper provides a comprehensive overview of\nprivacy, security, and trustworthy for distributed wireless large AI model\n(WLAM). 
In particular, a detailed privacy and security are analysis for\ndistributed WLAM is fist revealed. The classifications and theoretical findings\nabout privacy and security in distributed WLAM are discussed. Then the\ntrustworthy and ethics for implementing distributed WLAM are described.\nFinally, the comprehensive applications of distributed WLAM are presented in\nthe context of electromagnetic signal processing.\n","authors":["Zhaohui Yang","Wei Xu","Le Liang","Yuanhao Cui","Zhijin Qin","Merouane Debbah"],"pdf_url":"https://arxiv.org/pdf/2412.02538v2.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2410.08631v2","updated":"2024-12-04T06:58:26Z","published":"2024-10-11T08:53:58Z","title":"CryoFM: A Flow-based Foundation Model for Cryo-EM Densities","summary":" Cryo-electron microscopy (cryo-EM) is a powerful technique in structural\nbiology and drug discovery, enabling the study of biomolecules at high\nresolution. Significant advancements by structural biologists using cryo-EM\nhave led to the production of over 38,626 protein density maps at various\nresolutions1. However, cryo-EM data processing algorithms have yet to fully\nbenefit from our knowledge of biomolecular density maps, with only a few recent\nmodels being data-driven but limited to specific tasks. In this study, we\npresent CryoFM, a foundation model designed as a generative model, learning the\ndistribution of high-quality density maps and generalizing effectively to\ndownstream tasks. Built on flow matching, CryoFM is trained to accurately\ncapture the prior distribution of biomolecular density maps. 
Furthermore, we\nintroduce a flow posterior sampling method that leverages CRYOFM as a flexible\nprior for several downstream tasks in cryo-EM and cryo-electron tomography\n(cryo-ET) without the need for fine-tuning, achieving state-of-the-art\nperformance on most tasks and demonstrating its potential as a foundational\nmodel for broader applications in these fields.\n","authors":["Yi Zhou","Yilai Li","Jing Yuan","Quanquan Gu"],"pdf_url":"https://arxiv.org/pdf/2410.08631v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03068v1","updated":"2024-12-04T06:42:55Z","published":"2024-12-04T06:42:55Z","title":"UTSD: Unified Time Series Diffusion Model","summary":" Transformer-based architectures have achieved unprecedented success in time\nseries analysis. However, facing the challenge of across-domain modeling,\nexisting studies utilize statistical prior as prompt engineering fails under\nthe huge distribution shift among various domains. In this paper, a Unified\nTime Series Diffusion (UTSD) model is established for the first time to model\nthe multi-domain probability distribution, utilizing the powerful probability\ndistribution modeling ability of Diffusion. Unlike the autoregressive models\nthat capture the conditional probabilities of the prediction horizon to the\nhistorical sequence, we use a diffusion denoising process to model the mixture\ndistribution of the cross-domain data and generate the prediction sequence for\nthe target domain directly utilizing conditional sampling. 
The proposed UTSD\ncontains three pivotal designs: (1) The condition network captures the\nmulti-scale fluctuation patterns from the observation sequence, which are\nutilized as context representations to guide the denoising network to generate\nthe prediction sequence; (2) Adapter-based fine-tuning strategy, the\nmulti-domain universal representation learned in the pretraining stage is\nutilized for downstream tasks in target domains; (3) The diffusion and\ndenoising process on the actual sequence space, combined with the improved\nclassifier free guidance as the conditional generation strategy, greatly\nimproves the stability and accuracy of the downstream task. We conduct\nextensive experiments on mainstream benchmarks, and the pre-trained UTSD\noutperforms existing foundation models on all data domains, exhibiting superior\nzero-shot generalization ability. After training from scratch, UTSD achieves\ncomparable performance against domain-specific proprietary models. The\nempirical results validate the potential of UTSD as a time series foundational\nmodel.\n","authors":["Xiangkai Ma","Xiaobin Hong","Wenzhong Li","Sanglu Lu"],"pdf_url":"https://arxiv.org/pdf/2412.03068v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03056v1","updated":"2024-12-04T06:20:51Z","published":"2024-12-04T06:20:51Z","title":"Point-GN: A Non-Parametric Network Using Gaussian Positional Encoding\n for Point Cloud Classification","summary":" This paper introduces Point-GN, a novel non-parametric network for efficient\nand accurate 3D point cloud classification. Unlike conventional deep learning\nmodels that rely on a large number of trainable parameters, Point-GN leverages\nnon-learnable components-specifically, Farthest Point Sampling (FPS), k-Nearest\nNeighbors (k-NN), and Gaussian Positional Encoding (GPE)-to extract both local\nand global geometric features. 
This design eliminates the need for additional\ntraining while maintaining high performance, making Point-GN particularly\nsuited for real-time, resource-constrained applications. We evaluate Point-GN\non two benchmark datasets, ModelNet40 and ScanObjectNN, achieving\nclassification accuracies of 85.29% and 85.89%, respectively, while\nsignificantly reducing computational complexity. Point-GN outperforms existing\nnon-parametric methods and matches the performance of fully trained models, all\nwith zero learnable parameters. Our results demonstrate that Point-GN is a\npromising solution for 3D point cloud classification in practical, real-time\nenvironments.\n","authors":["Marzieh Mohammadi","Amir Salarpour"],"pdf_url":"https://arxiv.org/pdf/2412.03056v1.pdf","comment":"This paper has been accepted for presentation at the IEEE Winter\n Conference on Applications of Computer Vision (WACV) 2025"},{"id":"http://arxiv.org/abs/2405.18407v2","updated":"2024-12-04T06:18:16Z","published":"2024-05-28T17:47:19Z","title":"Phased Consistency Models","summary":" Consistency Models (CMs) have made significant progress in accelerating the\ngeneration of diffusion models. However, their application to high-resolution,\ntext-conditioned image generation in the latent space remains unsatisfactory.\nIn this paper, we identify three key flaws in the current design of Latent\nConsistency Models (LCMs). We investigate the reasons behind these limitations\nand propose Phased Consistency Models (PCMs), which generalize the design space\nand address the identified limitations. Our evaluations demonstrate that PCMs\noutperform LCMs across 1--16 step generation settings. While PCMs are\nspecifically designed for multi-step refinement, they achieve comparable 1-step\ngeneration results to previously state-of-the-art specifically designed 1-step\nmethods. 
Furthermore, we show the methodology of PCMs is versatile and\napplicable to video generation, enabling us to train the state-of-the-art\nfew-step text-to-video generator. Our code is available at\nhttps://github.com/G-U-N/Phased-Consistency-Model.\n","authors":["Fu-Yun Wang","Zhaoyang Huang","Alexander William Bergman","Dazhong Shen","Peng Gao","Michael Lingelbach","Keqiang Sun","Weikang Bian","Guanglu Song","Yu Liu","Xiaogang Wang","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2405.18407v2.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2412.03051v1","updated":"2024-12-04T06:11:09Z","published":"2024-12-04T06:11:09Z","title":"Less is More: A Stealthy and Efficient Adversarial Attack Method for\n DRL-based Autonomous Driving Policies","summary":" Despite significant advancements in deep reinforcement learning (DRL)-based\nautonomous driving policies, these policies still exhibit vulnerability to\nadversarial attacks. This vulnerability poses a formidable challenge to the\npractical deployment of these policies in autonomous driving. Designing\neffective adversarial attacks is an indispensable prerequisite for enhancing\nthe robustness of these policies. In view of this, we present a novel stealthy\nand efficient adversarial attack method for DRL-based autonomous driving\npolicies. Specifically, we introduce a DRL-based adversary designed to trigger\nsafety violations (e.g., collisions) by injecting adversarial samples at\ncritical moments. We model the attack as a mixed-integer optimization problem\nand formulate it as a Markov decision process. Then, we train the adversary to\nlearn the optimal policy for attacking at critical moments without domain\nknowledge. Furthermore, we introduce attack-related information and a\ntrajectory clipping method to enhance the learning capability of the adversary.\nFinally, we validate our method in an unprotected left-turn scenario across\ndifferent traffic densities. 
The experimental results show that our method\nachieves more than 90% collision rate within three attacks in most cases.\nFurthermore, our method achieves more than 130% improvement in attack\nefficiency compared to the unlimited attack method.\n","authors":["Junchao Fan","Xuyang Lei","Xiaolin Chang","Jelena Mišić","Vojislav B. Mišić"],"pdf_url":"https://arxiv.org/pdf/2412.03051v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.01858v2","updated":"2024-12-04T05:35:17Z","published":"2024-11-30T19:53:25Z","title":"MQFL-FHE: Multimodal Quantum Federated Learning Framework with Fully\n Homomorphic Encryption","summary":" The integration of fully homomorphic encryption (FHE) in federated learning\n(FL) has led to significant advances in data privacy. However, during the\naggregation phase, it often results in performance degradation of the\naggregated model, hindering the development of robust representational\ngeneralization. In this work, we propose a novel multimodal quantum federated\nlearning framework that utilizes quantum computing to counteract the\nperformance drop resulting from FHE. For the first time in FL, our framework\ncombines a multimodal quantum mixture of experts (MQMoE) model with FHE,\nincorporating multimodal datasets for enriched representation and task-specific\nlearning. Our MQMoE framework enhances performance on multimodal datasets and\ncombined genomics and brain MRI scans, especially for underrepresented\ncategories. Our results also demonstrate that the quantum-enhanced approach\nmitigates the performance degradation associated with FHE and improves\nclassification accuracy across diverse datasets, validating the potential of\nquantum interventions in enhancing privacy in FL.\n","authors":["Siddhant Dutta","Nouhaila Innan","Sadok Ben Yahia","Muhammad Shafique","David Esteban Bernal Neira"],"pdf_url":"https://arxiv.org/pdf/2412.01858v2.pdf","comment":"14 pages, 6 figures, 5 Tables. 
Under Review"},{"id":"http://arxiv.org/abs/2412.03038v1","updated":"2024-12-04T05:19:34Z","published":"2024-12-04T05:19:34Z","title":"MILLION: A General Multi-Objective Framework with Controllable Risk for\n Portfolio Management","summary":" Portfolio management is an important yet challenging task in AI for FinTech,\nwhich aims to allocate investors' budgets among different assets to balance the\nrisk and return of an investment. In this study, we propose a general\nMulti-objectIve framework with controLLable rIsk for pOrtfolio maNagement\n(MILLION), which consists of two main phases, i.e., return-related maximization\nand risk control. Specifically, in the return-related maximization phase, we\nintroduce two auxiliary objectives, i.e., return rate prediction, and return\nrate ranking, combined with portfolio optimization to remit the overfitting\nproblem and improve the generalization of the trained model to future markets.\nSubsequently, in the risk control phase, we propose two methods, i.e.,\nportfolio interpolation and portfolio improvement, to achieve fine-grained risk\ncontrol and fast risk adaption to a user-specified risk level. For the\nportfolio interpolation method, we theoretically prove that the risk can be\nperfectly controlled if the to-be-set risk level is in a proper interval. In\naddition, we also show that the return rate of the adjusted portfolio after\nportfolio interpolation is no less than that of the min-variance optimization,\nas long as the model in the reward maximization phase is effective.\nFurthermore, the portfolio improvement method can achieve greater return rates\nwhile keeping the same risk level compared to portfolio interpolation.\nExtensive experiments are conducted on three real-world datasets. 
The results\ndemonstrate the effectiveness and efficiency of the proposed framework.\n","authors":["Liwei Deng","Tianfu Wang","Yan Zhao","Kai Zheng"],"pdf_url":"https://arxiv.org/pdf/2412.03038v1.pdf","comment":"accepted by VLDB 2025"},{"id":"http://arxiv.org/abs/2412.03035v1","updated":"2024-12-04T05:16:48Z","published":"2024-12-04T05:16:48Z","title":"A Granger-Causal Perspective on Gradient Descent with Application to\n Pruning","summary":" Stochastic Gradient Descent (SGD) is the main approach to optimizing neural\nnetworks. Several generalization properties of deep networks, such as\nconvergence to a flatter minima, are believed to arise from SGD. This article\nexplores the causality aspect of gradient descent. Specifically, we show that\nthe gradient descent procedure has an implicit granger-causal relationship\nbetween the reduction in loss and a change in parameters. By suitable\nmodifications, we make this causal relationship explicit. A causal approach to\ngradient descent has many significant applications which allow greater control.\nIn this article, we illustrate the significance of the causal approach using\nthe application of Pruning. The causal approach to pruning has several\ninteresting properties - (i) We observe a phase shift as the percentage of\npruned parameters increase. Such phase shift is indicative of an optimal\npruning strategy. 
(ii) After pruning, we see that minima becomes \"flatter\",\nexplaining the increase in accuracy after pruning weights.\n","authors":["Aditya Shah","Aditya Challa","Sravan Danda","Archana Mathur","Snehanshu Saha"],"pdf_url":"https://arxiv.org/pdf/2412.03035v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.18958v3","updated":"2024-12-04T05:04:42Z","published":"2024-10-24T17:55:52Z","title":"Stable Consistency Tuning: Understanding and Improving Consistency\n Models","summary":" Diffusion models achieve superior generation quality but suffer from slow\ngeneration speed due to the iterative nature of denoising. In contrast,\nconsistency models, a new generative family, achieve competitive performance\nwith significantly faster sampling. These models are trained either through\nconsistency distillation, which leverages pretrained diffusion models, or\nconsistency training/tuning directly from raw data. In this work, we propose a\nnovel framework for understanding consistency models by modeling the denoising\nprocess of the diffusion model as a Markov Decision Process (MDP) and framing\nconsistency model training as the value estimation through Temporal\nDifference~(TD) Learning. More importantly, this framework allows us to analyze\nthe limitations of current consistency training/tuning strategies. Built upon\nEasy Consistency Tuning (ECT), we propose Stable Consistency Tuning (SCT),\nwhich incorporates variance-reduced learning using the score identity. SCT\nleads to significant performance improvements on benchmarks such as CIFAR-10\nand ImageNet-64. 
On ImageNet-64, SCT achieves 1-step FID 2.42 and 2-step FID\n1.55, a new SoTA for consistency models.\n","authors":["Fu-Yun Wang","Zhengyang Geng","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2410.18958v3.pdf","comment":"Code is available at\n https://github.com/G-U-N/Stable-Consistency-Tuning"},{"id":"http://arxiv.org/abs/2405.19600v2","updated":"2024-12-04T04:41:49Z","published":"2024-05-30T01:30:34Z","title":"Rethinking Spectral Augmentation for Contrast-based Graph\n Self-Supervised Learning","summary":" The recent surge in contrast-based graph self-supervised learning has\nprominently featured an intensified exploration of spectral cues. Spectral\naugmentation, which involves modifying a graph's spectral properties such as\neigenvalues or eigenvectors, is widely believed to enhance model performance.\nHowever, an intriguing paradox emerges, as methods grounded in seemingly\nconflicting assumptions regarding the spectral domain demonstrate notable\nenhancements in learning performance. Through extensive empirical studies, we\nfind that simple edge perturbations - random edge dropping for node-level and\nrandom edge adding for graph-level self-supervised learning - consistently\nyield comparable or superior performance while being significantly more\ncomputationally efficient. This suggests that the computational overhead of\nsophisticated spectral augmentations may not justify their practical benefits.\nOur theoretical analysis of the InfoNCE loss bounds for shallow GNNs further\nsupports this observation. 
The proposed insights represent a significant leap\nforward in the field, potentially refining the understanding and implementation\nof graph self-supervised learning.\n","authors":["Xiangru Jian","Xinjian Zhao","Wei Pang","Chaolong Ying","Yimu Wang","Yaoyao Xu","Tianshu Yu"],"pdf_url":"https://arxiv.org/pdf/2405.19600v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12635v2","updated":"2024-12-04T04:18:35Z","published":"2024-04-19T05:32:37Z","title":"AED-PADA:Improving Generalizability of Adversarial Example Detection via\n Principal Adversarial Domain Adaptation","summary":" Adversarial example detection, which can be conveniently applied in many\nscenarios, is important in the area of adversarial defense. Unfortunately,\nexisting detection methods suffer from poor generalization performance, because\ntheir training process usually relies on the examples generated from a single\nknown adversarial attack and there exists a large discrepancy between the\ntraining and unseen testing adversarial examples. To address this issue, we\npropose a novel method, named Adversarial Example Detection via Principal\nAdversarial Domain Adaptation (AED-PADA). Specifically, our approach identifies\nthe Principal Adversarial Domains (PADs), i.e., a combination of features of\nthe adversarial examples generated by different attacks, which possesses a\nlarge portion of the entire adversarial feature space. Subsequently, we pioneer\nto exploit Multi-source Unsupervised Domain Adaptation in adversarial example\ndetection, with PADs as the source domains. Experimental results demonstrate\nthe superior generalization ability of our proposed AED-PADA. 
Note that this\nsuperiority is particularly achieved in challenging scenarios characterized by\nemploying the minimal magnitude constraint for the perturbations.\n","authors":["Heqi Peng","Yunhong Wang","Ruijie Yang","Beichen Li","Rui Wang","Yuanfang Guo"],"pdf_url":"https://arxiv.org/pdf/2404.12635v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03018v1","updated":"2024-12-04T04:08:51Z","published":"2024-12-04T04:08:51Z","title":"Hamiltonian-based neural networks for systems under nonholonomic\n constraints","summary":" There has been increasing interest in methodologies that incorporate physics\npriors into neural network architectures to enhance their modeling\ncapabilities. A family of these methodologies that has gained traction are\nHamiltonian neural networks (HNN) and their variations. These architectures\nexplicitly encode Hamiltonian mechanics both in their structure and loss\nfunction. Although Hamiltonian systems under nonholonomic constraints are in\ngeneral not Hamiltonian, it is possible to formulate them in pseudo-Hamiltonian\nform, equipped with a Lie bracket which is almost Poisson. This opens the\npossibility of using some principles of HNNs in systems under nonholonomic\nconstraints. The goal of the present work is to develop a modified Hamiltonian\nneural network architecture capable of modeling Hamiltonian systems under\nholonomic and nonholonomic constraints. A three-network parallel architecture\nis proposed to simultaneously learn the Hamiltonian of the system, the\nconstraints, and their associated multipliers. A rolling disk and a ball on a\nspinning table are considered as canonical examples to assess the performance\nof the proposed Hamiltonian architecture. The experiments are then repeated\nwith a noisy training set to study modeling performance under more realistic\nconditions.\n","authors":["Ignacio Puiggros T.","A. 
Srikantha Phani"],"pdf_url":"https://arxiv.org/pdf/2412.03018v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03012v1","updated":"2024-12-04T04:02:38Z","published":"2024-12-04T04:02:38Z","title":"Learning Whole-Body Loco-Manipulation for Omni-Directional Task Space\n Pose Tracking with a Wheeled-Quadrupedal-Manipulator","summary":" In this paper, we study the whole-body loco-manipulation problem using\nreinforcement learning (RL). Specifically, we focus on the problem of how to\ncoordinate the floating base and the robotic arm of a wheeled-quadrupedal\nmanipulator robot to achieve direct six-dimensional (6D) end-effector (EE) pose\ntracking in task space. Different from conventional whole-body\nloco-manipulation problems that track both floating-base and end-effector\ncommands, the direct EE pose tracking problem requires inherent balance among\nredundant degrees of freedom in the whole-body motion. We leverage RL to solve\nthis challenging problem. To address the associated difficulties, we develop a\nnovel reward fusion module (RFM) that systematically integrates reward terms\ncorresponding to different tasks in a nonlinear manner. In such a way, the\ninherent multi-stage and hierarchical feature of the loco-manipulation problem\ncan be carefully accommodated. By combining the proposed RFM with the a\nteacher-student RL training paradigm, we present a complete RL scheme to\nachieve 6D EE pose tracking for the wheeled-quadruped manipulator robot.\nExtensive simulation and hardware experiments demonstrate the significance of\nthe RFM. In particular, we enable smooth and precise tracking performance,\nachieving state-of-the-art tracking position error of less than 5 cm, and\nrotation error of less than 0.1 rad. 
Please refer to\nhttps://clearlab-sustech.github.io/RFM_loco_mani/ for more experimental videos.\n","authors":["Kaiwen Jiang","Zhen Fu","Junde Guo","Wei Zhang","Hua Chen"],"pdf_url":"https://arxiv.org/pdf/2412.03012v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03009v1","updated":"2024-12-04T03:56:54Z","published":"2024-12-04T03:56:54Z","title":"Data Acquisition for Improving Model Fairness using Reinforcement\n Learning","summary":" Machine learning systems are increasingly being used in critical decision\nmaking such as healthcare, finance, and criminal justice. Concerns around their\nfairness have resulted in several bias mitigation techniques that emphasize the\nneed for high-quality data to ensure fairer decisions. However, the role of\nearlier stages of machine learning pipelines in mitigating model bias has not\nbeen explored well. In this paper, we focus on the task of acquiring additional\nlabeled data points for training the downstream machine learning model to\nrapidly improve its fairness. Since not all data points in a data pool are\nequally beneficial to the task of fairness, we generate an ordering in which\ndata points should be acquired. We present DataSift, a data acquisition\nframework based on the idea of data valuation that relies on partitioning and\nmulti-armed bandits to determine the most valuable data points to acquire. Over\nseveral iterations, DataSift selects a partition and randomly samples a batch\nof data points from the selected partition, evaluates the benefit of acquiring\nthe batch on model fairness, and updates the utility of partitions depending on\nthe benefit. To further improve the effectiveness and efficiency of evaluating\nbatches, we leverage influence functions that estimate the effect of acquiring\na batch without retraining the model. 
We empirically evaluate DataSift on\nseveral real-world and synthetic datasets and show that the fairness of a\nmachine learning model can be significantly improved even while acquiring a few\ndata points.\n","authors":["Jahid Hasan","Romila Pradhan"],"pdf_url":"https://arxiv.org/pdf/2412.03009v1.pdf","comment":"19 pages, 9 figures"},{"id":"http://arxiv.org/abs/2412.03008v1","updated":"2024-12-04T03:56:14Z","published":"2024-12-04T03:56:14Z","title":"Provably Extending PageRank-based Local Clustering Algorithm to Weighted\n Directed Graphs with Self-Loops and to Hypergraphs","summary":" Local clustering aims to find a compact cluster near the given starting\ninstances. This work focuses on graph local clustering, which has broad\napplications beyond graphs because of the internal connectivities within\nvarious modalities. While most existing studies on local graph clustering adopt\nthe discrete graph setting (i.e., unweighted graphs without self-loops),\nreal-world graphs can be more complex. In this paper, we extend the\nnon-approximating Andersen-Chung-Lang (\"ACL\") algorithm beyond discrete graphs\nand generalize its quadratic optimality to a wider range of graphs, including\nweighted, directed, and self-looped graphs and hypergraphs. Specifically,\nleveraging PageRank, we propose two algorithms: GeneralACL for graphs and\nHyperACL for hypergraphs. We theoretically prove that, under two mild\nconditions, both algorithms can identify a quadratically optimal local cluster\nin terms of conductance with at least 1/2 probability. 
On the property of\nhypergraphs, we address a fundamental gap in the literature by defining\nconductance for hypergraphs from the perspective of hypergraph random walks.\nAdditionally, we provide experiments to validate our theoretical findings.\n","authors":["Zihao Li","Dongqi Fu","Hengyu Liu","Jingrui He"],"pdf_url":"https://arxiv.org/pdf/2412.03008v1.pdf","comment":"Preprint, 42 pages"},{"id":"http://arxiv.org/abs/2408.15126v6","updated":"2024-12-04T03:24:18Z","published":"2024-08-27T15:07:27Z","title":"Force-Guided Bridge Matching for Full-Atom Time-Coarsened Dynamics of\n Peptides","summary":" Molecular Dynamics (MD) is crucial in various fields such as materials\nscience, chemistry, and pharmacology to name a few. Conventional MD software\nstruggles with the balance between time cost and prediction accuracy, which\nrestricts its wider application. Recently, data-driven approaches based on deep\ngenerative models have been devised for time-coarsened dynamics, which aim at\nlearning dynamics of diverse molecular systems over a long timestep, enjoying\nboth universality and efficiency. Nevertheless, most current methods are\ndesigned solely to learn from the data distribution regardless of the\nunderlying Boltzmann distribution, and the physics priors such as energies and\nforces are constantly overlooked. In this work, we propose a conditional\ngenerative model called Force-guided Bridge Matching (FBM), which learns\nfull-atom time-coarsened dynamics and targets the Boltzmann-constrained\ndistribution. With the guidance of our delicately-designed intermediate force\nfield, FBM leverages favourable physics priors into the generation process,\ngiving rise to enhanced simulations. 
Experiments on two datasets consisting of\npeptides verify our superiority in terms of comprehensive metrics and\ndemonstrate transferability to unseen systems.\n","authors":["Ziyang Yu","Wenbing Huang","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2408.15126v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02988v1","updated":"2024-12-04T03:02:55Z","published":"2024-12-04T03:02:55Z","title":"Preference-based Pure Exploration","summary":" We study the preference-based pure exploration problem for bandits with\nvector-valued rewards. The rewards are ordered using a (given) preference cone\n$\\mathcal{C}$ and our the goal is to identify the set of Pareto optimal arms.\nFirst, to quantify the impact of preferences, we derive a novel lower bound on\nthe sample complexity for identifying the most preferred policy with confidence\nlevel $1-\\delta$. Our lower bound elicits the role played by the geometry of\nthe preference cone and punctuates the difference in hardness compared to\nexisting best-arm identification variants of the problem. We further explicate\nthis geometry when rewards follow Gaussian distributions. We then provide a\nconvex relaxation of the lower bound. and leverage it to design\nPreference-based Track and Stop (PreTS) algorithm that identifies the most\npreferred policy. Finally, we show that sample complexity of PreTS is\nasymptotically tight by deriving a new concentration inequality for\nvector-valued rewards.\n","authors":["Apurv Shukla","Debabrota Basu"],"pdf_url":"https://arxiv.org/pdf/2412.02988v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.08027v2","updated":"2024-12-04T02:57:03Z","published":"2024-04-11T15:58:12Z","title":"SurvMamba: State Space Model with Multi-grained Multi-modal Interaction\n for Survival Prediction","summary":" Multi-modal learning that combines pathological images with genomic data has\nsignificantly enhanced the accuracy of survival prediction. 
Nevertheless,\nexisting methods have not fully utilized the inherent hierarchical structure\nwithin both whole slide images (WSIs) and transcriptomic data, from which\nbetter intra-modal representations and inter-modal integration could be\nderived. Moreover, many existing studies attempt to improve multi-modal\nrepresentations through attention mechanisms, which inevitably lead to high\ncomplexity when processing high-dimensional WSIs and transcriptomic data.\nRecently, a structured state space model named Mamba emerged as a promising\napproach for its superior performance in modeling long sequences with low\ncomplexity. In this study, we propose Mamba with multi-grained multi-modal\ninteraction (SurvMamba) for survival prediction. SurvMamba is implemented with\na Hierarchical Interaction Mamba (HIM) module that facilitates efficient\nintra-modal interactions at different granularities, thereby capturing more\ndetailed local features as well as rich global representations. In addition, an\nInteraction Fusion Mamba (IFM) module is used for cascaded inter-modal\ninteractive fusion, yielding more comprehensive features for survival\nprediction. Comprehensive evaluations on five TCGA datasets demonstrate that\nSurvMamba outperforms other existing methods in terms of performance and\ncomputational cost.\n","authors":["Ying Chen","Jiajing Xie","Yuxiang Lin","Yuhang Song","Wenxian Yang","Rongshan Yu"],"pdf_url":"https://arxiv.org/pdf/2404.08027v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02980v1","updated":"2024-12-04T02:47:45Z","published":"2024-12-04T02:47:45Z","title":"Surveying the Effects of Quality, Diversity, and Complexity in Synthetic\n Data From Large Language Models","summary":" Synthetic data generation with Large Language Models is a promising paradigm\nfor augmenting natural data over a nearly infinite range of tasks. 
Given this\nvariety, direct comparisons among synthetic data generation algorithms are\nscarce, making it difficult to understand where improvement comes from and what\nbottlenecks exist. We propose to evaluate algorithms via the makeup of\nsynthetic data generated by each algorithm in terms of data quality, diversity,\nand complexity. We choose these three characteristics for their significance in\nopen-ended processes and the impact each has on the capabilities of downstream\nmodels. We find quality to be essential for in-distribution model\ngeneralization, diversity to be essential for out-of-distribution\ngeneralization, and complexity to be beneficial for both. Further, we emphasize\nthe existence of Quality-Diversity trade-offs in training data and the\ndownstream effects on model performance. We then examine the effect of various\ncomponents in the synthetic data pipeline on each data characteristic. This\nexamination allows us to taxonomize and compare synthetic data generation\nalgorithms through the components they utilize and the resulting effects on\ndata QDC composition. This analysis extends into a discussion on the importance\nof balancing QDC in synthetic data for efficient reinforcement learning and\nself-improvement algorithms. Analogous to the QD trade-offs in training data,\noften there exist trade-offs between model output quality and output diversity\nwhich impact the composition of synthetic data. We observe that many models are\ncurrently evaluated and optimized only for output quality, thereby limiting\noutput diversity and the potential for self-improvement. 
We argue that\nbalancing these trade-offs is essential to the development of future\nself-improvement algorithms and highlight a number of works making progress in\nthis direction.\n","authors":["Alex Havrilla","Andrew Dai","Laura O'Mahony","Koen Oostermeijer","Vera Zisler","Alon Albalak","Fabrizio Milo","Sharath Chandra Raparthy","Kanishk Gandhi","Baber Abbasi","Duy Phung","Maia Iyer","Dakota Mahan","Chase Blagden","Srishti Gureja","Mohammed Hamdy","Wen-Ding Li","Giovanni Paolini","Pawan Sasanka Ammanamanchi","Elliot Meyerson"],"pdf_url":"https://arxiv.org/pdf/2412.02980v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02975v1","updated":"2024-12-04T02:37:31Z","published":"2024-12-04T02:37:31Z","title":"Theoretical limitations of multi-layer Transformer","summary":" Transformers, especially the decoder-only variants, are the backbone of most\nmodern large language models; yet we do not have much understanding of their\nexpressive power except for the simple $1$-layer case.\n Due to the difficulty of analyzing multi-layer models, all previous work\nrelies on unproven complexity conjectures to show limitations for multi-layer\nTransformers. In this work, we prove the first $\\textit{unconditional}$ lower\nbound against multi-layer decoder-only transformers. 
For any constant $L$, we\nprove that any $L$-layer decoder-only transformer needs a polynomial model\ndimension ($n^{\\Omega(1)}$) to perform sequential composition of $L$ functions\nover an input of $n$ tokens.\n As a consequence, our results give: (1) the first depth-width trade-off for\nmulti-layer transformers, exhibiting that the $L$-step composition task is\nexponentially harder for $L$-layer models compared to $(L+1)$-layer ones; (2)\nan unconditional separation between encoder and decoder, exhibiting a hard task\nfor decoders that can be solved by an exponentially shallower and smaller\nencoder; (3) a provable advantage of chain-of-thought, exhibiting a task that\nbecomes exponentially easier with chain-of-thought.\n On the technical side, we propose the multi-party $\\textit{autoregressive}$\n$\\textit{communication}$ $\\textit{model}$ that captures the computation of a\ndecoder-only Transformer. We also introduce a new proof technique that finds a\ncertain $\\textit{indistinguishable}$ $\\textit{decomposition}$ of all possible\ninputs iteratively for proving lower bounds in this model. We believe our new\ncommunication model and proof technique will be helpful to further understand\nthe computational power of transformers.\n","authors":["Lijie Chen","Binghui Peng","Hongxun Wu"],"pdf_url":"https://arxiv.org/pdf/2412.02975v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02969v1","updated":"2024-12-04T02:31:31Z","published":"2024-12-04T02:31:31Z","title":"Unified Inductive Logic: From Formal Learning to Statistical Inference\n to Supervised Learning","summary":" While the traditional conception of inductive logic is Carnapian, I develop a\nPeircean alternative and use it to unify formal learning theory, statistics,\nand a significant part of machine learning: supervised learning. 
Some crucial\nstandards for evaluating non-deductive inferences have been assumed separately\nin those areas, but can actually be justified by a unifying principle.\n","authors":["Hanti Lin"],"pdf_url":"https://arxiv.org/pdf/2412.02969v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02968v1","updated":"2024-12-04T02:31:28Z","published":"2024-12-04T02:31:28Z","title":"How Many Ratings per Item are Necessary for Reliable Significance\n Testing?","summary":" Most approaches to machine learning evaluation assume that machine and human\nresponses are repeatable enough to be measured against data with unitary,\nauthoritative, \"gold standard\" responses, via simple metrics such as accuracy,\nprecision, and recall that assume scores are independent given the test item.\nHowever, AI models have multiple sources of stochasticity and the human raters\nwho create gold standards tend to disagree with each other, often in meaningful\nways, hence a single output response per input item may not provide enough\ninformation. We introduce methods for determining whether an (existing or\nplanned) evaluation dataset has enough responses per item to reliably compare\nthe performance of one model to another. We apply our methods to several of\nvery few extant gold standard test sets with multiple disaggregated responses\nper item and show that there are usually not enough responses per item to\nreliably compare the performance of one model against another. Our methods also\nallow us to estimate the number of responses per item for hypothetical datasets\nwith similar response distributions to the existing datasets we study. When two\nmodels are very far apart in their predictive performance, fewer raters are\nneeded to confidently compare them, as expected. 
However, as the models draw\ncloser, we find that a larger number of raters than are currently typical in\nannotation collection are needed to ensure that the power analysis correctly\nreflects the difference in performance.\n","authors":["Christopher Homan","Flip Korn","Chris Welty"],"pdf_url":"https://arxiv.org/pdf/2412.02968v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02957v1","updated":"2024-12-04T02:05:55Z","published":"2024-12-04T02:05:55Z","title":"3D Interaction Geometric Pre-training for Molecular Relational Learning","summary":" Molecular Relational Learning (MRL) is a rapidly growing field that focuses\non understanding the interaction dynamics between molecules, which is crucial\nfor applications ranging from catalyst engineering to drug discovery. Despite\nrecent progress, earlier MRL approaches are limited to using only the 2D\ntopological structure of molecules, as obtaining the 3D interaction geometry\nremains prohibitively expensive. This paper introduces a novel 3D geometric\npre-training strategy for MRL (3DMRL) that incorporates a 3D virtual\ninteraction environment, overcoming the limitations of costly traditional\nquantum mechanical calculation methods. With the constructed 3D virtual\ninteraction environment, 3DMRL trains 2D MRL model to learn the overall 3D\ngeometric information of molecular interaction through contrastive learning.\nMoreover, fine-grained interaction between molecules is learned through force\nprediction loss, which is crucial in understanding the wide range of molecular\ninteraction processes. Extensive experiments on various tasks using real-world\ndatasets, including out-of-distribution and extrapolation scenarios,\ndemonstrate the effectiveness of 3DMRL, showing up to a 24.93\\% improvement in\nperformance across 40 tasks.\n","authors":["Namkyeong Lee","Yunhak Oh","Heewoong Noh","Gyoung S. 
Na","Minkai Xu","Hanchen Wang","Tianfan Fu","Chanyoung Park"],"pdf_url":"https://arxiv.org/pdf/2412.02957v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.18822v2","updated":"2024-12-04T01:56:07Z","published":"2024-11-27T23:51:53Z","title":"RelCon: Relative Contrastive Learning for a Motion Foundation Model for\n Wearable Data","summary":" We present RelCon, a novel self-supervised *Rel*ative *Con*trastive learning\napproach that uses a learnable distance measure in combination with a softened\ncontrastive loss for training an motion foundation model from wearable sensors.\nThe learnable distance measure captures motif similarity and domain-specific\nsemantic information such as rotation invariance. The learned distance provides\na measurement of semantic similarity between a pair of accelerometer\ntime-series segments, which is used to measure the distance between an anchor\nand various other sampled candidate segments. The self-supervised model is\ntrained on 1 billion segments from 87,376 participants from a large wearables\ndataset. The model achieves strong performance across multiple downstream\ntasks, encompassing both classification and regression. To our knowledge, we\nare the first to show the generalizability of a self-supervised learning model\nwith motion data from wearables across distinct evaluation tasks.\n","authors":["Maxwell A. Xu","Jaya Narain","Gregory Darnell","Haraldur Hallgrimsson","Hyewon Jeong","Darren Forde","Richard Fineman","Karthik J. Raghuram","James M. Rehg","Shirley Ren"],"pdf_url":"https://arxiv.org/pdf/2411.18822v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06220v2","updated":"2024-12-04T01:47:08Z","published":"2024-04-09T11:14:45Z","title":"Zero-Shot Relational Learning for Multimodal Knowledge Graphs","summary":" Relational learning is an essential task in the domain of knowledge\nrepresentation, particularly in knowledge graph completion (KGC). 
While\nrelational learning in traditional single-modal settings has been extensively\nstudied, exploring it within a multimodal KGC context presents distinct\nchallenges and opportunities. One of the major challenges is inference on newly\ndiscovered relations without any associated training data. This zero-shot\nrelational learning scenario poses unique requirements for multimodal KGC,\ni.e., utilizing multimodality to facilitate relational learning.However,\nexisting works fail to support the leverage of multimodal information and leave\nthe problem unexplored. In this paper, we propose a novel end-to-end framework,\nconsisting of three components, i.e., multimodal learner, structure\nconsolidator, and relation embedding generator, to integrate diverse multimodal\ninformation and knowledge graph structures to facilitate the zero-shot\nrelational learning. Evaluation results on three multimodal knowledge graphs\ndemonstrate the superior performance of our proposed method.\n","authors":["Rui Cai","Shichao Pei","Xiangliang Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.06220v2.pdf","comment":"In the Proceedings of the 2024 IEEE International Conference on Big\n Data (IEEE BigData 2024)"},{"id":"http://arxiv.org/abs/2412.02951v1","updated":"2024-12-04T01:40:54Z","published":"2024-12-04T01:40:54Z","title":"Incorporating System-level Safety Requirements in Perception Models via\n Reinforcement Learning","summary":" Perception components in autonomous systems are often developed and optimized\nindependently of downstream decision-making and control components, relying on\nestablished performance metrics like accuracy, precision, and recall.\nTraditional loss functions, such as cross-entropy loss and negative\nlog-likelihood, focus on reducing misclassification errors but fail to consider\ntheir impact on system-level safety, overlooking the varying severities of\nsystem-level failures caused by these errors. 
To address this limitation, we\npropose a novel training paradigm that augments the perception component with\nan understanding of system-level safety objectives. Central to our approach is\nthe translation of system-level safety requirements, formally specified using\nthe rulebook formalism, into safety scores. These scores are then incorporated\ninto the reward function of a reinforcement learning framework for fine-tuning\nperception models with system-level safety objectives. Simulation results\ndemonstrate that models trained with this approach outperform baseline\nperception models in terms of system-level safety.\n","authors":["Weisi Fan","Jesse Lane","Qisai Liu","Soumik Sarkar","Tichakorn Wongpiromsarn"],"pdf_url":"https://arxiv.org/pdf/2412.02951v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02946v1","updated":"2024-12-04T01:23:57Z","published":"2024-12-04T01:23:57Z","title":"Who Brings the Frisbee: Probing Hidden Hallucination Factors in Large\n Vision-Language Model via Causality Analysis","summary":" Recent advancements in large vision-language models (LVLM) have significantly\nenhanced their ability to comprehend visual inputs alongside natural language.\nHowever, a major challenge in their real-world application is hallucination,\nwhere LVLMs generate non-existent visual elements, eroding user trust. The\nunderlying mechanism driving this multimodal hallucination is poorly\nunderstood. Minimal research has illuminated whether contexts such as sky,\ntree, or grass field involve the LVLM in hallucinating a frisbee. We\nhypothesize that hidden factors, such as objects, contexts, and semantic\nforeground-background structures, induce hallucination. This study proposes a\nnovel causal approach: a hallucination probing system to identify these hidden\nfactors. By analyzing the causality between images, text prompts, and network\nsaliency, we systematically explore interventions to block these factors. 
Our\nexperimental findings show that a straightforward technique based on our\nanalysis can significantly reduce hallucinations. Additionally, our analyses\nindicate the potential to edit network internals to minimize hallucinated\noutputs.\n","authors":["Po-Hsuan Huang","Jeng-Lin Li","Chin-Po Chen","Ming-Ching Chang","Wei-Chao Chen"],"pdf_url":"https://arxiv.org/pdf/2412.02946v1.pdf","comment":"Accepted by WACV2025"},{"id":"http://arxiv.org/abs/2408.12841v2","updated":"2024-12-04T01:20:16Z","published":"2024-08-23T05:15:24Z","title":"COVID-19 Probability Prediction Using Machine Learning: An Infectious\n Approach","summary":" The ongoing COVID-19 pandemic continues to pose significant challenges to\nglobal public health, despite the widespread availability of vaccines. Early\ndetection of the disease remains paramount in curbing its transmission and\nmitigating its impact on public health systems. In response, this study delves\ninto the application of advanced machine learning (ML) techniques for\npredicting COVID-19 infection probability. We conducted a rigorous\ninvestigation into the efficacy of various ML models, including XGBoost, LGBM,\nAdaBoost, Logistic Regression, Decision Tree, RandomForest, CatBoost, KNN, and\nDeep Neural Networks (DNN). Leveraging a dataset comprising 4000 samples, with\n3200 allocated for training and 800 for testing, our experiment offers\ncomprehensive insights into the performance of these models in COVID-19\nprediction. Our findings reveal that Deep Neural Networks (DNN) emerge as the\ntop-performing model, exhibiting superior accuracy and recall metrics. With an\nimpressive accuracy rate of 89%, DNN demonstrates remarkable potential in early\nCOVID-19 detection. 
This underscores the efficacy of deep learning approaches\nin leveraging complex data patterns to identify COVID-19 infections accurately.\nThis study underscores the critical role of machine learning, particularly deep\nlearning methodologies, in augmenting early detection efforts amidst the\nongoing pandemic. The success of DNN in accurately predicting COVID-19\ninfection probability highlights the importance of continued research and\ndevelopment in leveraging advanced technologies to combat infectious diseases.\n","authors":["Mohsen Asghari Ilani","Saba Moftakhar Tehran","Ashkan Kavei","Arian Radmehr"],"pdf_url":"https://arxiv.org/pdf/2408.12841v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02940v1","updated":"2024-12-04T01:13:44Z","published":"2024-12-04T01:13:44Z","title":"SAVER: A Toolbox for Sampling-Based, Probabilistic Verification of\n Neural Networks","summary":" We present a neural network verification toolbox to 1) assess the probability\nof satisfaction of a constraint, and 2) synthesize a set expansion factor to\nachieve the probability of satisfaction. Specifically, the tool box establishes\nwith a user-specified level of confidence whether the output of the neural\nnetwork for a given input distribution is likely to be contained within a given\nset. Should the tool determine that the given set cannot satisfy the likelihood\nconstraint, the tool also implements an approach outlined in this paper to\nalter the constraint set to ensure that the user-defined satisfaction\nprobability is achieved. The toolbox is comprised of sampling-based approaches\nwhich exploit the properties of signed distance function to define set\ncontainment.\n","authors":["Vignesh Sivaramakrishnan","Krishna C. 
Kalagarla","Rosalyn Devonport","Joshua Pilipovsky","Panagiotis Tsiotras","Meeko Oishi"],"pdf_url":"https://arxiv.org/pdf/2412.02940v1.pdf","comment":"7 pages, 8 figures, submitted to the 28th ACM International\n Conference on Hybrid Systems: Computation and Control"},{"id":"http://arxiv.org/abs/2412.02934v1","updated":"2024-12-04T01:07:04Z","published":"2024-12-04T01:07:04Z","title":"BGTplanner: Maximizing Training Accuracy for Differentially Private\n Federated Recommenders via Strategic Privacy Budget Allocation","summary":" To mitigate the rising concern about privacy leakage, the federated\nrecommender (FR) paradigm emerges, in which decentralized clients co-train the\nrecommendation model without exposing their raw user-item rating data. The\ndifferentially private federated recommender (DPFR) further enhances FR by\ninjecting differentially private (DP) noises into clients. Yet, current DPFRs,\nsuffering from noise distortion, cannot achieve satisfactory accuracy. Various\nefforts have been dedicated to improving DPFRs by adaptively allocating the\nprivacy budget over the learning process. However, due to the intricate\nrelation between privacy budget allocation and model accuracy, existing works\nare still far from maximizing DPFR accuracy. To address this challenge, we\ndevelop BGTplanner (Budget Planner) to strategically allocate the privacy\nbudget for each round of DPFR training, improving overall training performance.\nSpecifically, we leverage the Gaussian process regression and historical\ninformation to predict the change in recommendation accuracy with a certain\nallocated privacy budget. Additionally, Contextual Multi-Armed Bandit (CMAB) is\nharnessed to make privacy budget allocation decisions by reconciling the\ncurrent improvement and long-term privacy constraints. 
Our extensive\nexperimental results on real datasets demonstrate that \\emph{BGTplanner}\nachieves an average improvement of 6.76\\% in training performance compared to\nstate-of-the-art baselines.\n","authors":["Xianzhi Zhang","Yipeng Zhou","Miao Hu","Di Wu","Pengshan Liao","Mohsen Guizani","Michael Sheng"],"pdf_url":"https://arxiv.org/pdf/2412.02934v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02931v1","updated":"2024-12-04T00:53:55Z","published":"2024-12-04T00:53:55Z","title":"Inverse Delayed Reinforcement Learning","summary":" Inverse Reinforcement Learning (IRL) has demonstrated effectiveness in a\nvariety of imitation tasks. In this paper, we introduce an IRL framework\ndesigned to extract rewarding features from expert trajectories affected by\ndelayed disturbances. Instead of relying on direct observations, our approach\nemploys an efficient off-policy adversarial training framework to derive expert\nfeatures and recover optimal policies from augmented delayed observations.\nEmpirical evaluations in the MuJoCo environment under diverse delay settings\nvalidate the effectiveness of our method. Furthermore, we provide a theoretical\nanalysis showing that recovering expert policies from augmented delayed\nobservations outperforms using direct delayed observations.\n","authors":["Simon Sinong Zhan","Qingyuan Wu","Zhian Ruan","Frank Yang","Philip Wang","Yixuan Wang","Ruochen Jiao","Chao Huang","Qi Zhu"],"pdf_url":"https://arxiv.org/pdf/2412.02931v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.15367v2","updated":"2024-12-04T00:42:56Z","published":"2024-04-19T13:24:09Z","title":"Leveraging Visibility Graphs for Enhanced Arrhythmia Classification with\n Graph Convolutional Networks","summary":" Arrhythmias, detectable through electrocardiograms (ECGs), pose significant\nhealth risks, underscoring the need for accurate and efficient automated\ndetection techniques. 
While recent advancements in graph-based methods have\ndemonstrated potential to enhance arrhythmia classification, the challenge lies\nin effectively representing ECG signals as graphs. This study investigates the\nuse of Visibility Graph (VG) and Vector Visibility Graph (VVG) representations\ncombined with Graph Convolutional Networks (GCNs) for arrhythmia classification\nunder the ANSI/AAMI standard, ensuring reproducibility and fair comparison with\nother techniques. Through extensive experiments on the MIT-BIH dataset, we\nevaluate various GCN architectures and preprocessing parameters. Our findings\ndemonstrate that VG and VVG mappings enable GCNs to classify arrhythmias\ndirectly from raw ECG signals, without the need for preprocessing or noise\nremoval. Notably, VG offers superior computational efficiency, while VVG\ndelivers enhanced classification performance by leveraging additional lead\nfeatures. The proposed approach outperforms baseline methods in several\nmetrics, although challenges persist in classifying the supraventricular\nectopic beat (S) class, particularly under the inter-patient paradigm.\n","authors":["Rafael F. Oliveira","Gladston J. P. Moreira","Vander L. S. Freitas","Eduardo J. S. Luz"],"pdf_url":"https://arxiv.org/pdf/2404.15367v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02924v1","updated":"2024-12-04T00:27:54Z","published":"2024-12-04T00:27:54Z","title":"Harnessing Loss Decomposition for Long-Horizon Wave Predictions via Deep\n Neural Networks","summary":" Accurate prediction over long time horizons is crucial for modeling complex\nphysical processes such as wave propagation. Although deep neural networks show\npromise for real-time forecasting, they often struggle with accumulating phase\nand amplitude errors as predictions extend over a long period. To address this\nissue, we propose a novel loss decomposition strategy that breaks down the loss\ninto separate phase and amplitude components. 
This technique improves the\nlong-term prediction accuracy of neural networks in wave propagation tasks by\nexplicitly accounting for numerical errors, improving stability, and reducing\nerror accumulation over extended forecasts.\n","authors":["Indu Kant Deo","Rajeev Jaiman"],"pdf_url":"https://arxiv.org/pdf/2412.02924v1.pdf","comment":"6 pages, 4 figures, NeurIPS Machine Learning for Physical Sciences\n workshop"},{"id":"http://arxiv.org/abs/2403.09548v2","updated":"2024-12-04T00:26:21Z","published":"2024-03-14T16:35:43Z","title":"Breast Cancer Classification Using Gradient Boosting Algorithms Focusing\n on Reducing the False Negative and SHAP for Explainability","summary":" Cancer is one of the diseases that kill the most women in the world, with\nbreast cancer being responsible for the highest number of cancer cases and\nconsequently deaths. However, it can be prevented by early detection and,\nconsequently, early treatment. Any development for detection or perdition this\nkind of cancer is important for a better healthy life. Many studies focus on a\nmodel with high accuracy in cancer prediction, but sometimes accuracy alone may\nnot always be a reliable metric. This study implies an investigative approach\nto studying the performance of different machine learning algorithms based on\nboosting to predict breast cancer focusing on the recall metric. Boosting\nmachine learning algorithms has been proven to be an effective tool for\ndetecting medical diseases. The dataset of the University of California, Irvine\n(UCI) repository has been utilized to train and test the model classifier that\ncontains their attributes. The main objective of this study is to use\nstate-of-the-art boosting algorithms such as AdaBoost, XGBoost, CatBoost and\nLightGBM to predict and diagnose breast cancer and to find the most effective\nmetric regarding recall, ROC-AUC, and confusion matrix. 
Furthermore, our study\nis the first to use these four boosting algorithms with Optuna, a library for\nhyperparameter optimization, and the SHAP method to improve the\ninterpretability of our model, which can be used as a support to identify and\npredict breast cancer. We were able to improve AUC or recall for all the models\nand reduce the False Negative for AdaBoost and LigthGBM the final AUC were more\nthan 99.41\\% for all models.\n","authors":["João Manoel Herrera Pinheiro","Marcelo Becker"],"pdf_url":"https://arxiv.org/pdf/2403.09548v2.pdf","comment":"9 pages, 16 figures"},{"id":"http://arxiv.org/abs/2402.17363v4","updated":"2024-12-04T00:11:36Z","published":"2024-02-27T09:55:34Z","title":"CGGM: A conditional graph generation model with adaptive sparsity for\n node anomaly detection in IoT networks","summary":" Dynamic graphs are extensively employed for detecting anomalous behavior in\nnodes within the Internet of Things (IoT). Graph generative models are often\nused to address the issue of imbalanced node categories in dynamic graphs.\nNevertheless, the constraints it faces include the monotonicity of adjacency\nrelationships, the difficulty in constructing multi-dimensional features for\nnodes, and the lack of a method for end-to-end generation of multiple\ncategories of nodes. In this paper, we propose a novel graph generation model,\ncalled CGGM, specifically for generating samples belonging to the minority\nclass. The framework consists two core module: a conditional graph generation\nmodule and a graph-based anomaly detection module. The generative module adapts\nto the sparsity of the matrix by downsampling a noise adjacency matrix, and\nincorporates a multi-dimensional feature encoder based on multi-head\nself-attention to capture latent dependencies among features. Additionally, a\nlatent space constraint is combined with the distribution distance to\napproximate the latent distribution of real data. 
The graph-based anomaly\ndetection module utilizes the generated balanced dataset to predict the node\nbehaviors. Extensive experiments have shown that CGGM outperforms the\nstate-of-the-art methods in terms of accuracy and divergence. The results also\ndemonstrate CGGM can generated diverse data categories, that enhancing the\nperformance of multi-category classification task.\n","authors":["Munan Li","Xianshi Su","Runze Ma","Tongbang Jiang","Zijian Li","Tony Q. S. Quek"],"pdf_url":"https://arxiv.org/pdf/2402.17363v4.pdf","comment":"10 pages, 19 figures"},{"id":"http://arxiv.org/abs/2412.02919v1","updated":"2024-12-04T00:10:47Z","published":"2024-12-04T00:10:47Z","title":"Higher Order Transformers: Efficient Attention Mechanism for Tensor\n Structured Data","summary":" Transformers are now ubiquitous for sequence modeling tasks, but their\nextension to multi-dimensional data remains a challenge due to the quadratic\ncost of the attention mechanism. In this paper, we propose Higher-Order\nTransformers (HOT), a novel architecture designed to efficiently process data\nwith more than two axes, i.e. higher-order tensors. To address the\ncomputational challenges associated with high-order tensor attention, we\nintroduce a novel Kronecker factorized attention mechanism that reduces the\nattention cost to quadratic in each axis' dimension, rather than quadratic in\nthe total size of the input tensor. To further enhance efficiency, HOT\nleverages kernelized attention, reducing the complexity to linear. This\nstrategy maintains the model's expressiveness while enabling scalable attention\ncomputation. We validate the effectiveness of HOT on two high-dimensional\ntasks, including multivariate time series forecasting, and 3D medical image\nclassification. 
Experimental results demonstrate that HOT achieves competitive\nperformance while significantly improving computational efficiency, showcasing\nits potential for tackling a wide range of complex, multi-dimensional data.\n","authors":["Soroush Omranpour","Guillaume Rabusseau","Reihaneh Rabbany"],"pdf_url":"https://arxiv.org/pdf/2412.02919v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12914v2","updated":"2024-12-04T00:03:38Z","published":"2024-09-19T17:10:34Z","title":"Mitigating Unsafe Feedback with Learning Constraints","summary":" While there has been progress towards aligning Large Language Models (LLMs)\nwith human values and ensuring safe behaviour at inference time, safety-guards\ncan easily be removed when fine-tuned on unsafe and harmful datasets.While this\nsetting has been treated extensively, another popular training paradigm,\nlearning from unsafe feedback with reinforcement learning, has previously been\nunexplored. This is concerning due to the widespread deployment of feedback\ncollection systems. We address this gap by providing an analysis of learning\nsettings where feedback is adversarial and noisy, i.e. that unsafe samples are\npreferred over safe ones despite model developers goal to maintain safety. We\nfind that safety-aligned LLMs easily explore unsafe action spaces through\ngenerating harmful text and optimize for adversarial reward indicating that\ncurrent safety guards are not enough to prevent learning from unsafe feedback.\nIn order to protect against this vulnerability, we adapt a number of both\n\"implict\" and \"explicit\" harmful fine-tuning defences to evaluate whether they\nare effective as learning constraints in an RL setting finding that no method\nis generally effective pointing to the need for more research in defences given\nthe widespread adoption of methods designed to learn from feedback. 
We end the\npaper with the observation that some defences work by performing \"harmless\nreward hacking\" for which we provide a theoretical explanation drawn from the\ntheory of Constrained Markov Decision Processes and provide some direction for\nfuture defence development.\n","authors":["Domenic Rosati","Giles Edkins","Harsh Raj","David Atanasov","Subhabrata Majumdar","Janarthanan Rajendran","Frank Rudzicz","Hassan Sajjad"],"pdf_url":"https://arxiv.org/pdf/2409.12914v2.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2412.03551v1","updated":"2024-12-04T18:49:26Z","published":"2024-12-04T18:49:26Z","title":"SPICE: Smart Projection Interface for Cooking Enhancement","summary":" Tangible User Interfaces (TUI) for human--computer interaction (HCI) provide\nthe user with physical representations of digital information with the aim to\novercome the limitations of screen-based interfaces. Although many compelling\ndemonstrations of TUIs exist in the literature, there is a lack of research on\nTUIs intended for daily two-handed tasks and processes, such as cooking. In\nresponse to this gap, we propose SPICE (Smart Projection Interface for Cooking\nEnhancement). SPICE investigates TUIs in a kitchen setting, aiming to transform\nthe recipe following experience from simply text-based to tangibly interactive.\nSPICE includes a tracking system, an agent-based software, and vision large\nlanguage models to create and interpret a kitchen environment where recipe\ninformation is projected directly onto the cooking surface. We conducted a\ncomparative usability study of SPICE and text-based recipe following with 30\nparticipants, assessing the task difficulty, total duration, and efficiency, as\nwell as user confidence and taste perception. The results indicate that SPICE\nallowed participants to perform the recipe with less stops and in shorter time\nwhile also improving self-reported efficiency, confidence, and taste. 
Despite\nthis, participants self-reported no change in overall difficulty, which is a\ndirection for future research. Overall, the SPICE project demonstrates the\npotential of using TUIs to improve everyday activities, paving the way for\nfuture research in HCI and new computing interfaces.\n","authors":["Vera Prohaska","Eduardo Castelló Ferrer"],"pdf_url":"https://arxiv.org/pdf/2412.03551v1.pdf","comment":"Article submitted to IUI 2025"},{"id":"http://arxiv.org/abs/2412.01064v2","updated":"2024-12-04T09:43:18Z","published":"2024-12-02T02:50:07Z","title":"FLOAT: Generative Motion Latent Flow Matching for Audio-driven Talking\n Portrait","summary":" With the rapid advancement of diffusion-based generative models, portrait\nimage animation has achieved remarkable results. However, it still faces\nchallenges in temporally consistent video generation and fast sampling due to\nits iterative sampling nature. This paper presents FLOAT, an audio-driven\ntalking portrait video generation method based on flow matching generative\nmodel. We shift the generative modeling from the pixel-based latent space to a\nlearned motion latent space, enabling efficient design of temporally consistent\nmotion. To achieve this, we introduce a transformer-based vector field\npredictor with a simple yet effective frame-wise conditioning mechanism.\nAdditionally, our method supports speech-driven emotion enhancement, enabling a\nnatural incorporation of expressive motions. 
Extensive experiments demonstrate\nthat our method outperforms state-of-the-art audio-driven talking portrait\nmethods in terms of visual quality, motion fidelity, and efficiency.\n","authors":["Taekyung Ki","Dongchan Min","Gyeongsu Chae"],"pdf_url":"https://arxiv.org/pdf/2412.01064v2.pdf","comment":"Project page: https://deepbrainai-research.github.io/float/"},{"id":"http://arxiv.org/abs/2406.00758v3","updated":"2024-12-04T09:36:56Z","published":"2024-06-02T14:22:09Z","title":"Once-for-All: Controllable Generative Image Compression with Dynamic\n Granularity Adaption","summary":" Although recent generative image compression methods have demonstrated\nimpressive potential in optimizing the rate-distortion-perception trade-off,\nthey still face the critical challenge of flexible rate adaption to diverse\ncompression necessities and scenarios. To overcome this challenge, this paper\nproposes a Controllable Generative Image Compression framework, termed\nControl-GIC, the first capable of fine-grained bitrate adaption across a broad\nspectrum while ensuring high-fidelity and generality compression. Control-GIC\nis grounded in a VQGAN framework that encodes an image as a sequence of\nvariable-length codes (i.e. VQ-indices), which can be losslessly compressed and\nexhibits a direct positive correlation with the bitrates. Drawing inspiration\nfrom the classical coding principle, we correlate the information density of\nlocal image patches with their granular representations. Hence, we can flexibly\ndetermine a proper allocation of granularity for the patches to achieve dynamic\nadjustment for VQ-indices, resulting in desirable compression rates. 
We further\ndevelop a probabilistic conditional decoder capable of retrieving historic\nencoded multi-granularity representations according to transmitted codes, and\nthen reconstruct hierarchical granular features in the formalization of\nconditional probability, enabling more informative aggregation to improve\nreconstruction realism. Our experiments show that Control-GIC allows highly\nflexible and controllable bitrate adaption where the results demonstrate its\nsuperior performance over recent state-of-the-art methods.\n","authors":["Anqi Li","Feng Li","Yuxi Liu","Runmin Cong","Yao Zhao","Huihui Bai"],"pdf_url":"https://arxiv.org/pdf/2406.00758v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.06220v2","updated":"2024-12-04T01:47:08Z","published":"2024-04-09T11:14:45Z","title":"Zero-Shot Relational Learning for Multimodal Knowledge Graphs","summary":" Relational learning is an essential task in the domain of knowledge\nrepresentation, particularly in knowledge graph completion (KGC). While\nrelational learning in traditional single-modal settings has been extensively\nstudied, exploring it within a multimodal KGC context presents distinct\nchallenges and opportunities. One of the major challenges is inference on newly\ndiscovered relations without any associated training data. This zero-shot\nrelational learning scenario poses unique requirements for multimodal KGC,\ni.e., utilizing multimodality to facilitate relational learning.However,\nexisting works fail to support the leverage of multimodal information and leave\nthe problem unexplored. In this paper, we propose a novel end-to-end framework,\nconsisting of three components, i.e., multimodal learner, structure\nconsolidator, and relation embedding generator, to integrate diverse multimodal\ninformation and knowledge graph structures to facilitate the zero-shot\nrelational learning. 
Evaluation results on three multimodal knowledge graphs\ndemonstrate the superior performance of our proposed method.\n","authors":["Rui Cai","Shichao Pei","Xiangliang Zhang"],"pdf_url":"https://arxiv.org/pdf/2404.06220v2.pdf","comment":"In the Proceedings of the 2024 IEEE International Conference on Big\n Data (IEEE BigData 2024)"},{"id":"http://arxiv.org/abs/2412.02946v1","updated":"2024-12-04T01:23:57Z","published":"2024-12-04T01:23:57Z","title":"Who Brings the Frisbee: Probing Hidden Hallucination Factors in Large\n Vision-Language Model via Causality Analysis","summary":" Recent advancements in large vision-language models (LVLM) have significantly\nenhanced their ability to comprehend visual inputs alongside natural language.\nHowever, a major challenge in their real-world application is hallucination,\nwhere LVLMs generate non-existent visual elements, eroding user trust. The\nunderlying mechanism driving this multimodal hallucination is poorly\nunderstood. Minimal research has illuminated whether contexts such as sky,\ntree, or grass field involve the LVLM in hallucinating a frisbee. We\nhypothesize that hidden factors, such as objects, contexts, and semantic\nforeground-background structures, induce hallucination. This study proposes a\nnovel causal approach: a hallucination probing system to identify these hidden\nfactors. By analyzing the causality between images, text prompts, and network\nsaliency, we systematically explore interventions to block these factors. Our\nexperimental findings show that a straightforward technique based on our\nanalysis can significantly reduce hallucinations. 
Additionally, our analyses\nindicate the potential to edit network internals to minimize hallucinated\noutputs.\n","authors":["Po-Hsuan Huang","Jeng-Lin Li","Chin-Po Chen","Ming-Ching Chang","Wei-Chao Chen"],"pdf_url":"https://arxiv.org/pdf/2412.02946v1.pdf","comment":"Accepted by WACV2025"},{"id":"http://arxiv.org/abs/2411.08307v2","updated":"2024-12-04T22:02:25Z","published":"2024-11-13T03:14:10Z","title":"PerceiverS: A Multi-Scale Perceiver with Effective Segmentation for\n Long-Term Expressive Symbolic Music Generation","summary":" AI-based music generation has progressed significantly in recent years.\nHowever, creating symbolic music that is both long-structured and expressive\nremains a considerable challenge. In this paper, we propose PerceiverS\n(Segmentation and Scale), a novel architecture designed to address this issue\nby leveraging both Effective Segmentation and Multi-Scale attention mechanisms.\nOur approach enhances symbolic music generation by simultaneously learning\nlong-term structural dependencies and short-term expressive details. By\ncombining cross-attention and self-attention in a Multi-Scale setting,\nPerceiverS captures long-range musical structure while preserving musical\ndiversity. The proposed model has been evaluated using the Maestro dataset and\nhas demonstrated improvements in generating music of conventional length with\nexpressive nuances. The project demos and the generated music samples can be\naccessed through the link: https://perceivers.github.io\n","authors":["Yungang Yi","Weihua Li","Matthew Kuo","Quan Bai"],"pdf_url":"https://arxiv.org/pdf/2411.08307v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03665v1","updated":"2024-12-04T19:01:06Z","published":"2024-12-04T19:01:06Z","title":"Personalizing Multimodal Large Language Models for Image Captioning: An\n Experimental Analysis","summary":" The task of image captioning demands an algorithm to generate natural\nlanguage descriptions of visual inputs. 
Recent advancements have seen a\nconvergence between image captioning research and the development of Large\nLanguage Models (LLMs) and Multimodal LLMs -- like GPT-4V and Gemini -- which\nextend the capabilities of text-only LLMs to multiple modalities. This paper\ninvestigates whether Multimodal LLMs can supplant traditional image captioning\nnetworks by evaluating their performance on various image description\nbenchmarks. We explore both the zero-shot capabilities of these models and\ntheir adaptability to different semantic domains through fine-tuning methods,\nincluding prompt learning, prefix tuning, and low-rank adaptation. Our results\ndemonstrate that while Multimodal LLMs achieve impressive zero-shot\nperformance, fine-tuning for specific domains while maintaining their\ngeneralization capabilities intact remains challenging. We discuss the\nimplications of these findings for future research in image captioning and the\ndevelopment of more adaptable Multimodal LLMs.\n","authors":["Davide Bucciarelli","Nicholas Moratelli","Marcella Cornia","Lorenzo Baraldi","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2412.03665v1.pdf","comment":"ECCV 2024 Workshop on Green Foundation Models"}]},"2024-12-05T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2412.04472v1","updated":"2024-12-05T18:59:58Z","published":"2024-12-05T18:59:58Z","title":"Stereo Anywhere: Robust Zero-Shot Deep Stereo Matching Even Where Either\n Stereo or Mono Fail","summary":" We introduce Stereo Anywhere, a novel stereo-matching framework that combines\ngeometric constraints with robust priors from monocular depth Vision Foundation\nModels (VFMs). By elegantly coupling these complementary worlds through a\ndual-branch architecture, we seamlessly integrate stereo matching with learned\ncontextual cues. 
Following this design, our framework introduces novel cost\nvolume fusion mechanisms that effectively handle critical challenges such as\ntextureless regions, occlusions, and non-Lambertian surfaces. Through our novel\noptical illusion dataset, MonoTrap, and extensive evaluation across multiple\nbenchmarks, we demonstrate that our synthetic-only trained model achieves\nstate-of-the-art results in zero-shot generalization, significantly\noutperforming existing solutions while showing remarkable robustness to\nchallenging cases such as mirrors and transparencies.\n","authors":["Luca Bartolomei","Fabio Tosi","Matteo Poggi","Stefano Mattoccia"],"pdf_url":"https://arxiv.org/pdf/2412.04472v1.pdf","comment":"Code: https://github.com/bartn8/stereoanywhere - Project page:\n https://stereoanywhere.github.io/"},{"id":"http://arxiv.org/abs/2412.04471v1","updated":"2024-12-05T18:59:57Z","published":"2024-12-05T18:59:57Z","title":"PaintScene4D: Consistent 4D Scene Generation from Text Prompts","summary":" Recent advances in diffusion models have revolutionized 2D and 3D content\ncreation, yet generating photorealistic dynamic 4D scenes remains a significant\nchallenge. Existing dynamic 4D generation methods typically rely on distilling\nknowledge from pre-trained 3D generative models, often fine-tuned on synthetic\nobject datasets. Consequently, the resulting scenes tend to be object-centric\nand lack photorealism. While text-to-video models can generate more realistic\nscenes with motion, they often struggle with spatial understanding and provide\nlimited control over camera viewpoints during rendering. To address these\nlimitations, we present PaintScene4D, a novel text-to-4D scene generation\nframework that departs from conventional multi-view generative models in favor\nof a streamlined architecture that harnesses video generative models trained on\ndiverse real-world datasets. 
Our method first generates a reference video using\na video generation model, and then employs a strategic camera array selection\nfor rendering. We apply a progressive warping and inpainting technique to\nensure both spatial and temporal consistency across multiple viewpoints.\nFinally, we optimize multi-view images using a dynamic renderer, enabling\nflexible camera control based on user preferences. Adopting a training-free\narchitecture, our PaintScene4D efficiently produces realistic 4D scenes that\ncan be viewed from arbitrary trajectories. The code will be made publicly\navailable. Our project page is at https://paintscene4d.github.io/\n","authors":["Vinayak Gupta","Yunze Man","Yu-Xiong Wang"],"pdf_url":"https://arxiv.org/pdf/2412.04471v1.pdf","comment":"Project page: https://paintscene4d.github.io/"},{"id":"http://arxiv.org/abs/2412.04470v1","updated":"2024-12-05T18:59:56Z","published":"2024-12-05T18:59:56Z","title":"Turbo3D: Ultra-fast Text-to-3D Generation","summary":" We present Turbo3D, an ultra-fast text-to-3D system capable of generating\nhigh-quality Gaussian splatting assets in under one second. Turbo3D employs a\nrapid 4-step, 4-view diffusion generator and an efficient feed-forward Gaussian\nreconstructor, both operating in latent space. The 4-step, 4-view generator is\na student model distilled through a novel Dual-Teacher approach, which\nencourages the student to learn view consistency from a multi-view teacher and\nphoto-realism from a single-view teacher. By shifting the Gaussian\nreconstructor's inputs from pixel space to latent space, we eliminate the extra\nimage decoding time and halve the transformer sequence length for maximum\nefficiency. 
Our method demonstrates superior 3D generation results compared to\nprevious baselines, while operating in a fraction of their runtime.\n","authors":["Hanzhe Hu","Tianwei Yin","Fujun Luan","Yiwei Hu","Hao Tan","Zexiang Xu","Sai Bi","Shubham Tulsiani","Kai Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.04470v1.pdf","comment":"project page: https://turbo-3d.github.io/"},{"id":"http://arxiv.org/abs/2412.04468v1","updated":"2024-12-05T18:59:55Z","published":"2024-12-05T18:59:55Z","title":"NVILA: Efficient Frontier Visual Language Models","summary":" Visual language models (VLMs) have made significant advances in accuracy in\nrecent years. However, their efficiency has received much less attention. This\npaper introduces NVILA, a family of open VLMs designed to optimize both\nefficiency and accuracy. Building on top of VILA, we improve its model\narchitecture by first scaling up the spatial and temporal resolutions, and then\ncompressing visual tokens. This \"scale-then-compress\" approach enables NVILA to\nefficiently process high-resolution images and long videos. We also conduct a\nsystematic investigation to enhance the efficiency of NVILA throughout its\nentire lifecycle, from training and fine-tuning to deployment. NVILA matches or\nsurpasses the accuracy of many leading open and proprietary VLMs across a wide\nrange of image and video benchmarks. At the same time, it reduces training\ncosts by 4.5X, fine-tuning memory usage by 3.4X, pre-filling latency by\n1.6-2.2X, and decoding latency by 1.2-2.8X. 
We will soon make our code and\nmodels available to facilitate reproducibility.\n","authors":["Zhijian Liu","Ligeng Zhu","Baifeng Shi","Zhuoyang Zhang","Yuming Lou","Shang Yang","Haocheng Xi","Shiyi Cao","Yuxian Gu","Dacheng Li","Xiuyu Li","Yunhao Fang","Yukang Chen","Cheng-Yu Hsieh","De-An Huang","An-Chieh Cheng","Vishwesh Nath","Jinyi Hu","Sifei Liu","Ranjay Krishna","Daguang Xu","Xiaolong Wang","Pavlo Molchanov","Jan Kautz","Hongxu Yin","Song Han","Yao Lu"],"pdf_url":"https://arxiv.org/pdf/2412.04468v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04469v1","updated":"2024-12-05T18:59:55Z","published":"2024-12-05T18:59:55Z","title":"QUEEN: QUantized Efficient ENcoding of Dynamic Gaussians for Streaming\n Free-viewpoint Videos","summary":" Online free-viewpoint video (FVV) streaming is a challenging problem, which\nis relatively under-explored. It requires incremental on-the-fly updates to a\nvolumetric representation, fast training and rendering to satisfy real-time\nconstraints and a small memory footprint for efficient transmission. If\nachieved, it can enhance user experience by enabling novel applications, e.g.,\n3D video conferencing and live volumetric video broadcast, among others. In\nthis work, we propose a novel framework for QUantized and Efficient ENcoding\n(QUEEN) for streaming FVV using 3D Gaussian Splatting (3D-GS). QUEEN directly\nlearns Gaussian attribute residuals between consecutive frames at each\ntime-step without imposing any structural constraints on them, allowing for\nhigh quality reconstruction and generalizability. To efficiently store the\nresiduals, we further propose a quantization-sparsity framework, which contains\na learned latent-decoder for effectively quantizing attribute residuals other\nthan Gaussian positions and a learned gating module to sparsify position\nresiduals. We propose to use the Gaussian viewspace gradient difference vector\nas a signal to separate the static and dynamic content of the scene. 
It acts as\na guide for effective sparsity learning and speeds up training. On diverse FVV\nbenchmarks, QUEEN outperforms the state-of-the-art online FVV methods on all\nmetrics. Notably, for several highly dynamic scenes, it reduces the model size\nto just 0.7 MB per frame while training in under 5 sec and rendering at 350\nFPS. Project website is at https://research.nvidia.com/labs/amri/projects/queen\n","authors":["Sharath Girish","Tianye Li","Amrita Mazumdar","Abhinav Shrivastava","David Luebke","Shalini De Mello"],"pdf_url":"https://arxiv.org/pdf/2412.04469v1.pdf","comment":"Accepted at NeurIPS 2024, Project website:\n https://research.nvidia.com/labs/amri/projects/queen"},{"id":"http://arxiv.org/abs/2412.04467v1","updated":"2024-12-05T18:59:53Z","published":"2024-12-05T18:59:53Z","title":"VisionZip: Longer is Better but Not Necessary in Vision Language Models","summary":" Recent advancements in vision-language models have enhanced performance by\nincreasing the length of visual tokens, making them much longer than text\ntokens and significantly raising computational costs. However, we observe that\nthe visual tokens generated by popular vision encoders, such as CLIP and\nSigLIP, contain significant redundancy. To address this, we introduce\nVisionZip, a simple yet effective method that selects a set of informative\ntokens for input to the language model, reducing visual token redundancy and\nimproving efficiency while maintaining model performance. The proposed\nVisionZip can be widely applied to image and video understanding tasks and is\nwell-suited for multi-turn dialogues in real-world scenarios, where previous\nmethods tend to underperform. Experimental results show that VisionZip\noutperforms the previous state-of-the-art method by at least 5% performance\ngains across nearly all settings. 
Moreover, our method significantly enhances\nmodel inference speed, improving the prefilling time by 8x and enabling the\nLLaVA-Next 13B model to infer faster than the LLaVA-Next 7B model while\nachieving better results. Furthermore, we analyze the causes of this redundancy\nand encourage the community to focus on extracting better visual features\nrather than merely increasing token length. Our code is available at\nhttps://github.com/dvlab-research/VisionZip .\n","authors":["Senqiao Yang","Yukang Chen","Zhuotao Tian","Chengyao Wang","Jingyao Li","Bei Yu","Jiaya Jia"],"pdf_url":"https://arxiv.org/pdf/2412.04467v1.pdf","comment":"2 columns, 28 pages, 15 figures, 18 tables"},{"id":"http://arxiv.org/abs/2412.04465v1","updated":"2024-12-05T18:59:50Z","published":"2024-12-05T18:59:50Z","title":"UnZipLoRA: Separating Content and Style from a Single Image","summary":" This paper introduces UnZipLoRA, a method for decomposing an image into its\nconstituent subject and style, represented as two distinct LoRAs (Low-Rank\nAdaptations). Unlike existing personalization techniques that focus on either\nsubject or style in isolation, or require separate training sets for each,\nUnZipLoRA disentangles these elements from a single image by training both the\nLoRAs simultaneously. UnZipLoRA ensures that the resulting LoRAs are\ncompatible, i.e., they can be seamlessly combined using direct addition.\nUnZipLoRA enables independent manipulation and recontextualization of subject\nand style, including generating variations of each, applying the extracted\nstyle to new subjects, and recombining them to reconstruct the original image\nor create novel variations. To address the challenge of subject and style\nentanglement, UnZipLoRA employs a novel prompt separation technique, as well as\ncolumn and block separation strategies to accurately preserve the\ncharacteristics of subject and style, and ensure compatibility between the\nlearned LoRAs. 
Evaluation with human studies and quantitative metrics\ndemonstrates UnZipLoRA's effectiveness compared to other state-of-the-art\nmethods, including DreamBooth-LoRA, Inspiration Tree, and B-LoRA.\n","authors":["Chang Liu","Viraj Shah","Aiyu Cui","Svetlana Lazebnik"],"pdf_url":"https://arxiv.org/pdf/2412.04465v1.pdf","comment":"Project page: https://unziplora.github.io"},{"id":"http://arxiv.org/abs/2412.04464v1","updated":"2024-12-05T18:59:48Z","published":"2024-12-05T18:59:48Z","title":"DualPM: Dual Posed-Canonical Point Maps for 3D Shape and Pose\n Reconstruction","summary":" The choice of data representation is a key factor in the success of deep\nlearning in geometric tasks. For instance, DUSt3R has recently introduced the\nconcept of viewpoint-invariant point maps, generalizing depth prediction, and\nshowing that one can reduce all the key problems in the 3D reconstruction of\nstatic scenes to predicting such point maps. In this paper, we develop an\nanalogous concept for a very different problem, namely, the reconstruction of\nthe 3D shape and pose of deformable objects. To this end, we introduce the Dual\nPoint Maps (DualPM), where a pair of point maps is extracted from the {same}\nimage, one associating pixels to their 3D locations on the object, and the\nother to a canonical version of the object at rest pose. We also extend point\nmaps to amodal reconstruction, seeing through self-occlusions to obtain the\ncomplete shape of the object. We show that 3D reconstruction and 3D pose\nestimation reduce to the prediction of the DualPMs. We demonstrate empirically\nthat this representation is a good target for a deep network to predict;\nspecifically, we consider modeling horses, showing that DualPMs can be trained\npurely on 3D synthetic data, consisting of a single model of a horse, while\ngeneralizing very well to real images. 
With this, we improve by a large margin\nprevious methods for the 3D analysis and reconstruction of this type of\nobjects.\n","authors":["Ben Kaye","Tomas Jakab","Shangzhe Wu","Christian Rupprecht","Andrea Vedaldi"],"pdf_url":"https://arxiv.org/pdf/2412.04464v1.pdf","comment":"First two authors contributed equally. Project page:\n https://dualpm.github.io"},{"id":"http://arxiv.org/abs/2412.04463v1","updated":"2024-12-05T18:59:42Z","published":"2024-12-05T18:59:42Z","title":"MegaSaM: Accurate, Fast, and Robust Structure and Motion from Casual\n Dynamic Videos","summary":" We present a system that allows for accurate, fast, and robust estimation of\ncamera parameters and depth maps from casual monocular videos of dynamic\nscenes. Most conventional structure from motion and monocular SLAM techniques\nassume input videos that feature predominantly static scenes with large amounts\nof parallax. Such methods tend to produce erroneous estimates in the absence of\nthese conditions. Recent neural network-based approaches attempt to overcome\nthese challenges; however, such methods are either computationally expensive or\nbrittle when run on dynamic videos with uncontrolled camera motion or unknown\nfield of view. We demonstrate the surprising effectiveness of a deep visual\nSLAM framework: with careful modifications to its training and inference\nschemes, this system can scale to real-world videos of complex dynamic scenes\nwith unconstrained camera paths, including videos with little camera parallax.\nExtensive experiments on both synthetic and real videos demonstrate that our\nsystem is significantly more accurate and robust at camera pose and depth\nestimation when compared with prior and concurrent work, with faster or\ncomparable running times. 
See interactive results on our project page:\nhttps://mega-sam.github.io/\n","authors":["Zhengqi Li","Richard Tucker","Forrester Cole","Qianqian Wang","Linyi Jin","Vickie Ye","Angjoo Kanazawa","Aleksander Holynski","Noah Snavely"],"pdf_url":"https://arxiv.org/pdf/2412.04463v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04462v1","updated":"2024-12-05T18:59:41Z","published":"2024-12-05T18:59:41Z","title":"4Real-Video: Learning Generalizable Photo-Realistic 4D Video Diffusion","summary":" We propose 4Real-Video, a novel framework for generating 4D videos, organized\nas a grid of video frames with both time and viewpoint axes. In this grid, each\nrow contains frames sharing the same timestep, while each column contains\nframes from the same viewpoint. We propose a novel two-stream architecture. One\nstream performs viewpoint updates on columns, and the other stream performs\ntemporal updates on rows. After each diffusion transformer layer, a\nsynchronization layer exchanges information between the two token streams. We\npropose two implementations of the synchronization layer, using either hard or\nsoft synchronization. 
This feedforward architecture improves upon previous work\nin three ways: higher inference speed, enhanced visual quality (measured by\nFVD, CLIP, and VideoScore), and improved temporal and viewpoint consistency\n(measured by VideoScore and Dust3R-Confidence).\n","authors":["Chaoyang Wang","Peiye Zhuang","Tuan Duc Ngo","Willi Menapace","Aliaksandr Siarohin","Michael Vasilkovsky","Ivan Skorokhodov","Sergey Tulyakov","Peter Wonka","Hsin-Ying Lee"],"pdf_url":"https://arxiv.org/pdf/2412.04462v1.pdf","comment":"Project page: https://snap-research.github.io/4Real-Video/"},{"id":"http://arxiv.org/abs/2412.04460v1","updated":"2024-12-05T18:59:18Z","published":"2024-12-05T18:59:18Z","title":"LayerFusion: Harmonized Multi-Layer Text-to-Image Generation with\n Generative Priors","summary":" Large-scale diffusion models have achieved remarkable success in generating\nhigh-quality images from textual descriptions, gaining popularity across\nvarious applications. However, the generation of layered content, such as\ntransparent images with foreground and background layers, remains an\nunder-explored area. Layered content generation is crucial for creative\nworkflows in fields like graphic design, animation, and digital art, where\nlayer-based approaches are fundamental for flexible editing and composition. In\nthis paper, we propose a novel image generation pipeline based on Latent\nDiffusion Models (LDMs) that generates images with two layers: a foreground\nlayer (RGBA) with transparency information and a background layer (RGB). Unlike\nexisting methods that generate these layers sequentially, our approach\nintroduces a harmonized generation mechanism that enables dynamic interactions\nbetween the layers for more coherent outputs. 
We demonstrate the effectiveness\nof our method through extensive qualitative and quantitative experiments,\nshowing significant improvements in visual coherence, image quality, and layer\nconsistency compared to baseline methods.\n","authors":["Yusuf Dalva","Yijun Li","Qing Liu","Nanxuan Zhao","Jianming Zhang","Zhe Lin","Pinar Yanardag"],"pdf_url":"https://arxiv.org/pdf/2412.04460v1.pdf","comment":"Project page: https://layerfusion.github.io"},{"id":"http://arxiv.org/abs/2412.04459v1","updated":"2024-12-05T18:59:11Z","published":"2024-12-05T18:59:11Z","title":"Sparse Voxels Rasterization: Real-time High-fidelity Radiance Field\n Rendering","summary":" We propose an efficient radiance field rendering algorithm that incorporates\na rasterization process on sparse voxels without neural networks or 3D\nGaussians. There are two key contributions coupled with the proposed system.\nThe first is to render sparse voxels in the correct depth order along pixel\nrays by using dynamic Morton ordering. This avoids the well-known popping\nartifact found in Gaussian splatting. Second, we adaptively fit sparse voxels\nto different levels of detail within scenes, faithfully reproducing scene\ndetails while achieving high rendering frame rates. Our method improves the\nprevious neural-free voxel grid representation by over 4db PSNR and more than\n10x rendering FPS speedup, achieving state-of-the-art comparable novel-view\nsynthesis results. Additionally, our neural-free sparse voxels are seamlessly\ncompatible with grid-based 3D processing algorithms. 
We achieve promising mesh\nreconstruction accuracy by integrating TSDF-Fusion and Marching Cubes into our\nsparse grid system.\n","authors":["Cheng Sun","Jaesung Choe","Charles Loop","Wei-Chiu Ma","Yu-Chiang Frank Wang"],"pdf_url":"https://arxiv.org/pdf/2412.04459v1.pdf","comment":"Code release in progress"},{"id":"http://arxiv.org/abs/2412.04458v1","updated":"2024-12-05T18:59:09Z","published":"2024-12-05T18:59:09Z","title":"Cubify Anything: Scaling Indoor 3D Object Detection","summary":" We consider indoor 3D object detection with respect to a single RGB(-D) frame\nacquired from a commodity handheld device. We seek to significantly advance the\nstatus quo with respect to both data and modeling. First, we establish that\nexisting datasets have significant limitations to scale, accuracy, and\ndiversity of objects. As a result, we introduce the Cubify-Anything 1M (CA-1M)\ndataset, which exhaustively labels over 400K 3D objects on over 1K highly\naccurate laser-scanned scenes with near-perfect registration to over 3.5K\nhandheld, egocentric captures. Next, we establish Cubify Transformer (CuTR), a\nfully Transformer 3D object detection baseline which rather than operating in\n3D on point or voxel-based representations, predicts 3D boxes directly from 2D\nfeatures derived from RGB(-D) inputs. While this approach lacks any 3D\ninductive biases, we show that paired with CA-1M, CuTR outperforms point-based\nmethods - accurately recalling over 62% of objects in 3D, and is significantly\nmore capable at handling noise and uncertainty present in commodity\nLiDAR-derived depth maps while also providing promising RGB only performance\nwithout architecture changes. Furthermore, by pre-training on CA-1M, CuTR can\noutperform point-based methods on a more diverse variant of SUN RGB-D -\nsupporting the notion that while inductive biases in 3D are useful at the\nsmaller sizes of existing datasets, they fail to scale to the data-rich regime\nof CA-1M. 
Overall, this dataset and baseline model provide strong evidence that\nwe are moving towards models which can effectively Cubify Anything.\n","authors":["Justin Lazarow","David Griffiths","Gefen Kohavi","Francisco Crespo","Afshin Dehghan"],"pdf_url":"https://arxiv.org/pdf/2412.04458v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04457v1","updated":"2024-12-05T18:59:08Z","published":"2024-12-05T18:59:08Z","title":"Monocular Dynamic Gaussian Splatting is Fast and Brittle but Smooth\n Motion Helps","summary":" Gaussian splatting methods are emerging as a popular approach for converting\nmulti-view image data into scene representations that allow view synthesis. In\nparticular, there is interest in enabling view synthesis for dynamic scenes\nusing only monocular input data -- an ill-posed and challenging problem. The\nfast pace of work in this area has produced multiple simultaneous papers that\nclaim to work best, which cannot all be true. In this work, we organize,\nbenchmark, and analyze many Gaussian-splatting-based methods, providing\napples-to-apples comparisons that prior works have lacked. We use multiple\nexisting datasets and a new instructive synthetic dataset designed to isolate\nfactors that affect reconstruction quality. We systematically categorize\nGaussian splatting methods into specific motion representation types and\nquantify how their differences impact performance. Empirically, we find that\ntheir rank order is well-defined in synthetic data, but the complexity of\nreal-world data currently overwhelms the differences. Furthermore, the fast\nrendering speed of all Gaussian-based methods comes at the cost of brittleness\nin optimization. We summarize our experiments into a list of findings that can\nhelp to further progress in this lively problem setting. 
Project Webpage:\nhttps://lynl7130.github.io/MonoDyGauBench.github.io/\n","authors":["Yiqing Liang","Mikhail Okunev","Mikaela Angelina Uy","Runfeng Li","Leonidas Guibas","James Tompkin","Adam W. Harley"],"pdf_url":"https://arxiv.org/pdf/2412.04457v1.pdf","comment":"37 pages, 39 figures, 9 tables"},{"id":"http://arxiv.org/abs/2412.04456v1","updated":"2024-12-05T18:59:00Z","published":"2024-12-05T18:59:00Z","title":"HeatFormer: A Neural Optimizer for Multiview Human Mesh Recovery","summary":" We introduce a novel method for human shape and pose recovery that can fully\nleverage multiple static views. We target fixed-multiview people monitoring,\nincluding elderly care and safety monitoring, in which calibrated cameras can\nbe installed at the corners of a room or an open space but whose configuration\nmay vary depending on the environment. Our key idea is to formulate it as\nneural optimization. We achieve this with HeatFormer, a neural optimizer that\niteratively refines the SMPL parameters given multiview images, which is\nfundamentally agonistic to the configuration of views. HeatFormer realizes this\nSMPL parameter estimation as heat map generation and alignment with a novel\ntransformer encoder and decoder. We demonstrate the effectiveness of HeatFormer\nincluding its accuracy, robustness to occlusion, and generalizability through\nan extensive set of experiments. We believe HeatFormer can serve a key role in\npassive human behavior modeling.\n","authors":["Yuto Matsubara","Ko Nishino"],"pdf_url":"https://arxiv.org/pdf/2412.04456v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.01819v3","updated":"2024-12-05T18:58:43Z","published":"2024-12-02T18:57:41Z","title":"Switti: Designing Scale-Wise Transformers for Text-to-Image Synthesis","summary":" This work presents Switti, a scale-wise transformer for text-to-image\ngeneration. 
Starting from existing next-scale prediction AR models, we first\nexplore them for T2I generation and propose architectural modifications to\nimprove their convergence and overall performance. We then argue that\nscale-wise transformers do not require causality and propose a non-causal\ncounterpart facilitating ~11% faster sampling and lower memory usage while also\nachieving slightly better generation quality. Furthermore, we reveal that\nclassifier-free guidance at high-resolution scales is often unnecessary and can\neven degrade performance. By disabling guidance at these scales, we achieve an\nadditional sampling acceleration of ~20% and improve the generation of\nfine-grained details. Extensive human preference studies and automated\nevaluations show that Switti outperforms existing T2I AR models and competes\nwith state-of-the-art T2I diffusion models while being up to 7 times faster.\n","authors":["Anton Voronov","Denis Kuznedelev","Mikhail Khoroshikh","Valentin Khrulkov","Dmitry Baranchuk"],"pdf_url":"https://arxiv.org/pdf/2412.01819v3.pdf","comment":"20 pages, 22 figures"},{"id":"http://arxiv.org/abs/2412.04455v1","updated":"2024-12-05T18:58:27Z","published":"2024-12-05T18:58:27Z","title":"Code-as-Monitor: Constraint-aware Visual Programming for Reactive and\n Proactive Robotic Failure Detection","summary":" Automatic detection and prevention of open-set failures are crucial in\nclosed-loop robotic systems. Recent studies often struggle to simultaneously\nidentify unexpected failures reactively after they occur and prevent\nforeseeable ones proactively. To this end, we propose Code-as-Monitor (CaM), a\nnovel paradigm leveraging the vision-language model (VLM) for both open-set\nreactive and proactive failure detection. The core of our method is to\nformulate both tasks as a unified set of spatio-temporal constraint\nsatisfaction problems and use VLM-generated code to evaluate them for real-time\nmonitoring. 
To enhance the accuracy and efficiency of monitoring, we further\nintroduce constraint elements that abstract constraint-related entities or\ntheir parts into compact geometric elements. This approach offers greater\ngenerality, simplifies tracking, and facilitates constraint-aware visual\nprogramming by leveraging these elements as visual prompts. Experiments show\nthat CaM achieves a 28.7% higher success rate and reduces execution time by\n31.8% under severe disturbances compared to baselines across three simulators\nand a real-world setting. Moreover, CaM can be integrated with open-loop\ncontrol policies to form closed-loop systems, enabling long-horizon tasks in\ncluttered scenes with dynamic environments.\n","authors":["Enshen Zhou","Qi Su","Cheng Chi","Zhizheng Zhang","Zhongyuan Wang","Tiejun Huang","Lu Sheng","He Wang"],"pdf_url":"https://arxiv.org/pdf/2412.04455v1.pdf","comment":"Project page: https://zhoues.github.io/Code-as-Monitor/"},{"id":"http://arxiv.org/abs/2412.04452v1","updated":"2024-12-05T18:58:17Z","published":"2024-12-05T18:58:17Z","title":"Four-Plane Factorized Video Autoencoders","summary":" Latent variable generative models have emerged as powerful tools for\ngenerative tasks including image and video synthesis. These models are enabled\nby pretrained autoencoders that map high resolution data into a compressed\nlower dimensional latent space, where the generative models can subsequently be\ndeveloped while requiring fewer computational resources. Despite their\neffectiveness, the direct application of latent variable models to higher\ndimensional domains such as videos continues to pose challenges for efficient\ntraining and inference. In this paper, we propose an autoencoder that projects\nvolumetric data onto a four-plane factorized latent space that grows\nsublinearly with the input size, making it ideal for higher dimensional data\nlike videos. 
The design of our factorized model supports straightforward\nadoption in a number of conditional generation tasks with latent diffusion\nmodels (LDMs), such as class-conditional generation, frame prediction, and\nvideo interpolation. Our results show that the proposed four-plane latent space\nretains a rich representation needed for high-fidelity reconstructions despite\nthe heavy compression, while simultaneously enabling LDMs to operate with\nsignificant improvements in speed and memory.\n","authors":["Mohammed Suhail","Carlos Esteves","Leonid Sigal","Ameesh Makadia"],"pdf_url":"https://arxiv.org/pdf/2412.04452v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04453v1","updated":"2024-12-05T18:58:17Z","published":"2024-12-05T18:58:17Z","title":"NaVILA: Legged Robot Vision-Language-Action Model for Navigation","summary":" This paper proposes to solve the problem of Vision-and-Language Navigation\nwith legged robots, which not only provides a flexible way for humans to\ncommand but also allows the robot to navigate through more challenging and\ncluttered scenes. However, it is non-trivial to translate human language\ninstructions all the way to low-level leg joint actions. We propose NaVILA, a\n2-level framework that unifies a Vision-Language-Action model (VLA) with\nlocomotion skills. Instead of directly predicting low-level actions from VLA,\nNaVILA first generates mid-level actions with spatial information in the form\nof language, (e.g., \"moving forward 75cm\"), which serves as an input for a\nvisual locomotion RL policy for execution. NaVILA substantially improves\nprevious approaches on existing benchmarks. The same advantages are\ndemonstrated in our newly developed benchmarks with IsaacLab, featuring more\nrealistic scenes, low-level controls, and real-world robot experiments. 
We show\nmore results at https://navila-bot.github.io/\n","authors":["An-Chieh Cheng","Yandong Ji","Zhaojing Yang","Xueyan Zou","Jan Kautz","Erdem Bıyık","Hongxu Yin","Sifei Liu","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2412.04453v1.pdf","comment":"Website: https://navila-bot.github.io/"},{"id":"http://arxiv.org/abs/2412.04449v1","updated":"2024-12-05T18:58:03Z","published":"2024-12-05T18:58:03Z","title":"p-MoD: Building Mixture-of-Depths MLLMs via Progressive Ratio Decay","summary":" Despite the remarkable performance of multimodal large language models\n(MLLMs) across diverse tasks, the substantial training and inference costs\nimpede their advancement. The majority of computation stems from the\noverwhelming volume of vision tokens processed by the transformer decoder. In\nthis paper, we propose to build efficient MLLMs by leveraging the\nMixture-of-Depths (MoD) mechanism, where each transformer decoder layer selects\nessential vision tokens to process while skipping redundant ones. However,\nintegrating MoD into MLLMs is non-trivial. To address the challenges of\ntraining and inference stability as well as limited training data, we adapt the\nMoD module with two novel designs: tanh-gated weight normalization (TanhNorm)\nand symmetric token reweighting (STRing). Moreover, we observe that vision\ntokens exhibit higher redundancy in deeper layer and thus design a progressive\nratio decay (PRD) strategy, which gradually reduces the token retention ratio\nlayer by layer, employing a shifted cosine schedule. This crucial design fully\nunleashes the potential of MoD, significantly boosting the efficiency and\nperformance of our models. 
To validate the effectiveness of our approach, we\nconduct extensive experiments with two baseline models across 14 benchmarks.\nOur model, p-MoD, matches or even surpasses the performance of the baseline\nmodels, with only 55.6% TFLOPs and 53.8% KV cache storage during inference, and\n77.7% GPU hours during training.\n","authors":["Jun Zhang","Desen Meng","Ji Qi","Zhenpeng Huang","Tao Wu","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2412.04449v1.pdf","comment":"Technical Report; Code released at https://github.com/MCG-NJU/p-MoD"},{"id":"http://arxiv.org/abs/2412.04448v1","updated":"2024-12-05T18:57:26Z","published":"2024-12-05T18:57:26Z","title":"MEMO: Memory-Guided Diffusion for Expressive Talking Video Generation","summary":" Recent advances in video diffusion models have unlocked new potential for\nrealistic audio-driven talking video generation. However, achieving seamless\naudio-lip synchronization, maintaining long-term identity consistency, and\nproducing natural, audio-aligned expressions in generated talking videos remain\nsignificant challenges. To address these challenges, we propose Memory-guided\nEMOtion-aware diffusion (MEMO), an end-to-end audio-driven portrait animation\napproach to generate identity-consistent and expressive talking videos. Our\napproach is built around two key modules: (1) a memory-guided temporal module,\nwhich enhances long-term identity consistency and motion smoothness by\ndeveloping memory states to store information from a longer past context to\nguide temporal modeling via linear attention; and (2) an emotion-aware audio\nmodule, which replaces traditional cross attention with multi-modal attention\nto enhance audio-video interaction, while detecting emotions from audio to\nrefine facial expressions via emotion adaptive layer norm. 
Extensive\nquantitative and qualitative results demonstrate that MEMO generates more\nrealistic talking videos across diverse image and audio types, outperforming\nstate-of-the-art methods in overall quality, audio-lip synchronization,\nidentity consistency, and expression-emotion alignment.\n","authors":["Longtao Zheng","Yifan Zhang","Hanzhong Guo","Jiachun Pan","Zhenxiong Tan","Jiahao Lu","Chuanxin Tang","Bo An","Shuicheng Yan"],"pdf_url":"https://arxiv.org/pdf/2412.04448v1.pdf","comment":"Project Page: https://memoavatar.github.io"},{"id":"http://arxiv.org/abs/2412.04447v1","updated":"2024-12-05T18:57:23Z","published":"2024-12-05T18:57:23Z","title":"EgoPlan-Bench2: A Benchmark for Multimodal Large Language Model Planning\n in Real-World Scenarios","summary":" The advent of Multimodal Large Language Models, leveraging the power of Large\nLanguage Models, has recently demonstrated superior multimodal understanding\nand reasoning abilities, heralding a new era for artificial general\nintelligence. However, achieving AGI necessitates more than just comprehension\nand reasoning. A crucial capability required is effective planning in diverse\nscenarios, which involves making reasonable decisions based on complex\nenvironments to solve real-world problems. Despite its importance, the planning\nabilities of current MLLMs in varied scenarios remain underexplored. In this\npaper, we introduce EgoPlan-Bench2, a rigorous and comprehensive benchmark\ndesigned to assess the planning capabilities of MLLMs across a wide range of\nreal-world scenarios. EgoPlan-Bench2 encompasses everyday tasks spanning 4\nmajor domains and 24 detailed scenarios, closely aligned with human daily life.\nEgoPlan-Bench2 is constructed through a semi-automatic process utilizing\negocentric videos, complemented by manual verification. Grounded in a\nfirst-person perspective, it mirrors the way humans approach problem-solving in\neveryday life. 
We evaluate 21 competitive MLLMs and provide an in-depth\nanalysis of their limitations, revealing that they face significant challenges\nin real-world planning. To further improve the planning proficiency of current\nMLLMs, we propose a training-free approach using multimodal Chain-of-Thought\n(CoT) prompting through investigating the effectiveness of various multimodal\nprompts in complex planning. Our approach enhances the performance of GPT-4V by\n10.24 on EgoPlan-Bench2 without additional training. Our work not only sheds\nlight on the current limitations of MLLMs in planning, but also provides\ninsights for future enhancements in this critical area. We have made data and\ncode available at https://qiulu66.github.io/egoplanbench2/.\n","authors":["Lu Qiu","Yuying Ge","Yi Chen","Yixiao Ge","Ying Shan","Xihui Liu"],"pdf_url":"https://arxiv.org/pdf/2412.04447v1.pdf","comment":"Code & data are available at:\n https://qiulu66.github.io/egoplanbench2/"},{"id":"http://arxiv.org/abs/2412.04446v1","updated":"2024-12-05T18:57:06Z","published":"2024-12-05T18:57:06Z","title":"DiCoDe: Diffusion-Compressed Deep Tokens for Autoregressive Video\n Generation with Language Models","summary":" Videos are inherently temporal sequences by their very nature. In this work,\nwe explore the potential of modeling videos in a chronological and scalable\nmanner with autoregressive (AR) language models, inspired by their success in\nnatural language processing. We introduce DiCoDe, a novel approach that\nleverages Diffusion-Compressed Deep Tokens to generate videos with a language\nmodel in an autoregressive manner. Unlike existing methods that employ\nlow-level representations with limited compression rates, DiCoDe utilizes deep\ntokens with a considerable compression rate (a 1000x reduction in token count).\nThis significant compression is made possible by a tokenizer trained through\nleveraging the prior knowledge of video diffusion models. 
Deep tokens enable\nDiCoDe to employ vanilla AR language models for video generation, akin to\ntranslating one visual \"language\" into another. By treating videos as temporal\nsequences, DiCoDe fully harnesses the capabilities of language models for\nautoregressive generation. DiCoDe is scalable using readily available AR\narchitectures, and is capable of generating videos ranging from a few seconds\nto one minute using only 4 A100 GPUs for training. We evaluate DiCoDe both\nquantitatively and qualitatively, demonstrating that it performs comparably to\nexisting methods in terms of quality while ensuring efficient training. To\nshowcase its scalability, we release a series of DiCoDe configurations with\nvarying parameter sizes and observe a consistent improvement in performance as\nthe model size increases from 100M to 3B. We believe that DiCoDe's exploration\nin academia represents a promising initial step toward scalable video modeling\nwith AR language models, paving the way for the development of larger and more\npowerful video generation models.\n","authors":["Yizhuo Li","Yuying Ge","Yixiao Ge","Ping Luo","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2412.04446v1.pdf","comment":"Project Page: https://liyz15.github.io/DiCoDe"},{"id":"http://arxiv.org/abs/2412.04445v1","updated":"2024-12-05T18:57:04Z","published":"2024-12-05T18:57:04Z","title":"Moto: Latent Motion Token as the Bridging Language for Robot\n Manipulation","summary":" Recent developments in Large Language Models pre-trained on extensive corpora\nhave shown significant success in various natural language processing tasks\nwith minimal fine-tuning. This success offers new promise for robotics, which\nhas long been constrained by the high cost of action-labeled data. We ask:\ngiven the abundant video data containing interaction-related knowledge\navailable as a rich \"corpus\", can a similar generative pre-training approach be\neffectively applied to enhance robot learning? 
The key challenge is to identify\nan effective representation for autoregressive pre-training that benefits robot\nmanipulation tasks. Inspired by the way humans learn new skills through\nobserving dynamic environments, we propose that effective robotic learning\nshould emphasize motion-related knowledge, which is closely tied to low-level\nactions and is hardware-agnostic, facilitating the transfer of learned motions\nto actual robot actions. To this end, we introduce Moto, which converts video\ncontent into latent Motion Token sequences by a Latent Motion Tokenizer,\nlearning a bridging \"language\" of motion from videos in an unsupervised manner.\nWe pre-train Moto-GPT through motion token autoregression, enabling it to\ncapture diverse visual motion knowledge. After pre-training, Moto-GPT\ndemonstrates the promising ability to produce semantically interpretable motion\ntokens, predict plausible motion trajectories, and assess trajectory\nrationality through output likelihood. To transfer learned motion priors to\nreal robot actions, we implement a co-fine-tuning strategy that seamlessly\nbridges latent motion token prediction and real robot control. Extensive\nexperiments show that the fine-tuned Moto-GPT exhibits superior robustness and\nefficiency on robot manipulation benchmarks, underscoring its effectiveness in\ntransferring knowledge from video data to downstream visual manipulation tasks.\n","authors":["Yi Chen","Yuying Ge","Yizhuo Li","Yixiao Ge","Mingyu Ding","Ying Shan","Xihui Liu"],"pdf_url":"https://arxiv.org/pdf/2412.04445v1.pdf","comment":"Project released at: https://chenyi99.github.io/moto/"},{"id":"http://arxiv.org/abs/2412.04441v1","updated":"2024-12-05T18:56:23Z","published":"2024-12-05T18:56:23Z","title":"Learning Artistic Signatures: Symmetry Discovery and Style Transfer","summary":" Despite nearly a decade of literature on style transfer, there is no\nundisputed definition of artistic style. 
State-of-the-art models produce\nimpressive results but are difficult to interpret since, without a coherent\ndefinition of style, the problem of style transfer is inherently ill-posed.\nEarly work framed style-transfer as an optimization problem but treated style\nas a measure only of texture. This led to artifacts in the outputs of early\nmodels where content features from the style image sometimes bled into the\noutput image. Conversely, more recent work with diffusion models offers\ncompelling empirical results but provides little theoretical grounding. To\naddress these issues, we propose an alternative definition of artistic style.\nWe suggest that style should be thought of as a set of global symmetries that\ndictate the arrangement of local textures. We validate this perspective\nempirically by learning the symmetries of a large dataset of paintings and\nshowing that symmetries are predictive of the artistic movement to which each\npainting belongs. Finally, we show that by considering both local and global\nfeatures, using both Lie generators and traditional measures of texture, we can\nquantitatively capture the stylistic similarity between artists better than\nwith either set of features alone. This approach not only aligns well with art\nhistorians' consensus but also offers a robust framework for distinguishing\nnuanced stylistic differences, allowing for a more interpretable, theoretically\ngrounded approach to style transfer.\n","authors":["Emma Finn","T. Anderson Keller","Emmanouil Theodosis","Demba E. Ba"],"pdf_url":"https://arxiv.org/pdf/2412.04441v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04440v1","updated":"2024-12-05T18:56:05Z","published":"2024-12-05T18:56:05Z","title":"GenMAC: Compositional Text-to-Video Generation with Multi-Agent\n Collaboration","summary":" Text-to-video generation models have shown significant progress in the recent\nyears. 
However, they still struggle with generating complex dynamic scenes\nbased on compositional text prompts, such as attribute binding for multiple\nobjects, temporal dynamics associated with different objects, and interactions\nbetween objects. Our key motivation is that complex tasks can be decomposed\ninto simpler ones, each handled by a role-specialized MLLM agent. Multiple\nagents can collaborate together to achieve collective intelligence for complex\ngoals. We propose GenMAC, an iterative, multi-agent framework that enables\ncompositional text-to-video generation. The collaborative workflow includes\nthree stages: Design, Generation, and Redesign, with an iterative loop between\nthe Generation and Redesign stages to progressively verify and refine the\ngenerated videos. The Redesign stage is the most challenging stage that aims to\nverify the generated videos, suggest corrections, and redesign the text\nprompts, frame-wise layouts, and guidance scales for the next iteration of\ngeneration. To avoid hallucination of a single MLLM agent, we decompose this\nstage to four sequentially-executed MLLM-based agents: verification agent,\nsuggestion agent, correction agent, and output structuring agent. 
Furthermore,\nto tackle diverse scenarios of compositional text-to-video generation, we\ndesign a self-routing mechanism to adaptively select the proper correction\nagent from a collection of correction agents each specialized for one scenario.\nExtensive experiments demonstrate the effectiveness of GenMAC, achieving\nstate-of-the art performance in compositional text-to-video generation.\n","authors":["Kaiyi Huang","Yukun Huang","Xuefei Ning","Zinan Lin","Yu Wang","Xihui Liu"],"pdf_url":"https://arxiv.org/pdf/2412.04440v1.pdf","comment":"Project website: https://karine-h.github.io/GenMAC/"},{"id":"http://arxiv.org/abs/2407.08726v2","updated":"2024-12-05T18:54:08Z","published":"2024-07-11T17:57:22Z","title":"Map It Anywhere (MIA): Empowering Bird's Eye View Mapping using\n Large-scale Public Data","summary":" Top-down Bird's Eye View (BEV) maps are a popular representation for ground\nrobot navigation due to their richness and flexibility for downstream tasks.\nWhile recent methods have shown promise for predicting BEV maps from\nFirst-Person View (FPV) images, their generalizability is limited to small\nregions captured by current autonomous vehicle-based datasets. In this context,\nwe show that a more scalable approach towards generalizable map prediction can\nbe enabled by using two large-scale crowd-sourced mapping platforms, Mapillary\nfor FPV images and OpenStreetMap for BEV semantic maps. We introduce Map It\nAnywhere (MIA), a data engine that enables seamless curation and modeling of\nlabeled map prediction data from existing open-source map platforms. Using our\nMIA data engine, we display the ease of automatically collecting a dataset of\n1.2 million pairs of FPV images & BEV maps encompassing diverse geographies,\nlandscapes, environmental factors, camera models & capture scenarios. We\nfurther train a simple camera model-agnostic model on this data for BEV map\nprediction. 
Extensive evaluations using established benchmarks and our dataset\nshow that the data curated by MIA enables effective pretraining for\ngeneralizable BEV map prediction, with zero-shot performance far exceeding\nbaselines trained on existing datasets by 35%. Our analysis highlights the\npromise of using large-scale public maps for developing & testing generalizable\nBEV perception, paving the way for more robust autonomous navigation. Website:\nhttps://mapitanywhere.github.io/\n","authors":["Cherie Ho","Jiaye Zou","Omar Alama","Sai Mitheran Jagadesh Kumar","Benjamin Chiang","Taneesh Gupta","Chen Wang","Nikhil Keetha","Katia Sycara","Sebastian Scherer"],"pdf_url":"https://arxiv.org/pdf/2407.08726v2.pdf","comment":"Accepted at the 38th Conference on Neural Information Processing\n Systems (NeurIPS 2024) Track on Datasets and Benchmarks. Website:\n https://mapitanywhere.github.io/"},{"id":"http://arxiv.org/abs/2412.04434v1","updated":"2024-12-05T18:53:13Z","published":"2024-12-05T18:53:13Z","title":"Towards Real-Time Open-Vocabulary Video Instance Segmentation","summary":" In this paper, we address the challenge of performing open-vocabulary video\ninstance segmentation (OV-VIS) in real-time. We analyze the computational\nbottlenecks of state-of-the-art foundation models that performs OV-VIS, and\npropose a new method, TROY-VIS, that significantly improves processing speed\nwhile maintaining high accuracy. We introduce three key techniques: (1)\nDecoupled Attention Feature Enhancer to speed up information interaction\nbetween different modalities and scales; (2) Flash Embedding Memory for\nobtaining fast text embeddings of object categories; and, (3) Kernel\nInterpolation for exploiting the temporal continuity in videos. Our experiments\ndemonstrate that TROY-VIS achieves the best trade-off between accuracy and\nspeed on two large-scale OV-VIS benchmarks, BURST and LV-VIS, running 20x\nfaster than GLEE-Lite (25 FPS v.s. 
1.25 FPS) with comparable or even better\naccuracy. These results demonstrate TROY-VIS's potential for real-time\napplications in dynamic environments such as mobile robotics and augmented\nreality. Code and model will be released at\nhttps://github.com/google-research/troyvis.\n","authors":["Bin Yan","Martin Sundermeyer","David Joseph Tan","Huchuan Lu","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2412.04434v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04433v1","updated":"2024-12-05T18:53:06Z","published":"2024-12-05T18:53:06Z","title":"PBDyG: Position Based Dynamic Gaussians for Motion-Aware Clothed Human\n Avatars","summary":" This paper introduces a novel clothed human model that can be learned from\nmultiview RGB videos, with a particular emphasis on recovering physically\naccurate body and cloth movements. Our method, Position Based Dynamic Gaussians\n(PBDyG), realizes ``movement-dependent'' cloth deformation via physical\nsimulation, rather than merely relying on ``pose-dependent'' rigid\ntransformations. We model the clothed human holistically but with two distinct\nphysical entities in contact: clothing modeled as 3D Gaussians, which are\nattached to a skinned SMPL body that follows the movement of the person in the\ninput videos. The articulation of the SMPL body also drives physically-based\nsimulation of the clothes' Gaussians to transform the avatar to novel poses. In\norder to run position based dynamics simulation, physical properties including\nmass and material stiffness are estimated from the RGB videos through Dynamic\n3D Gaussian Splatting. 
Experiments demonstrate that our method not only\naccurately reproduces appearance but also enables the reconstruction of avatars\nwearing highly deformable garments, such as skirts or coats, which have been\nchallenging to reconstruct using existing methods.\n","authors":["Shota Sasaki","Jane Wu","Ko Nishino"],"pdf_url":"https://arxiv.org/pdf/2412.04433v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04432v1","updated":"2024-12-05T18:53:04Z","published":"2024-12-05T18:53:04Z","title":"Divot: Diffusion Powers Video Tokenizer for Comprehension and Generation","summary":" In recent years, there has been a significant surge of interest in unifying\nimage comprehension and generation within Large Language Models (LLMs). This\ngrowing interest has prompted us to explore extending this unification to\nvideos. The core challenge lies in developing a versatile video tokenizer that\ncaptures both the spatial characteristics and temporal dynamics of videos to\nobtain representations for LLMs, and the representations can be further decoded\ninto realistic video clips to enable video generation. In this work, we\nintroduce Divot, a Diffusion-Powered Video Tokenizer, which leverages the\ndiffusion process for self-supervised video representation learning. We posit\nthat if a video diffusion model can effectively de-noise video clips by taking\nthe features of a video tokenizer as the condition, then the tokenizer has\nsuccessfully captured robust spatial and temporal information. Additionally,\nthe video diffusion model inherently functions as a de-tokenizer, decoding\nvideos from their representations. Building upon the Divot tokenizer, we\npresent Divot-Vicuna through video-to-text autoregression and text-to-video\ngeneration by modeling the distributions of continuous-valued Divot features\nwith a Gaussian Mixture Model. 
Experimental results demonstrate that our\ndiffusion-based video tokenizer, when integrated with a pre-trained LLM,\nachieves competitive performance across various video comprehension and\ngeneration benchmarks. The instruction tuned Divot-Vicuna also excels in video\nstorytelling, generating interleaved narratives and corresponding videos.\n","authors":["Yuying Ge","Yizhuo Li","Yixiao Ge","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2412.04432v1.pdf","comment":"Project released at: https://github.com/TencentARC/Divot"},{"id":"http://arxiv.org/abs/2412.04431v1","updated":"2024-12-05T18:53:02Z","published":"2024-12-05T18:53:02Z","title":"Infinity: Scaling Bitwise AutoRegressive Modeling for High-Resolution\n Image Synthesis","summary":" We present Infinity, a Bitwise Visual AutoRegressive Modeling capable of\ngenerating high-resolution, photorealistic images following language\ninstruction. Infinity redefines visual autoregressive model under a bitwise\ntoken prediction framework with an infinite-vocabulary tokenizer & classifier\nand bitwise self-correction mechanism, remarkably improving the generation\ncapacity and details. By theoretically scaling the tokenizer vocabulary size to\ninfinity and concurrently scaling the transformer size, our method\nsignificantly unleashes powerful scaling capabilities compared to vanilla VAR.\nInfinity sets a new record for autoregressive text-to-image models,\noutperforming top-tier diffusion models like SD3-Medium and SDXL. Notably,\nInfinity surpasses SD3-Medium by improving the GenEval benchmark score from\n0.62 to 0.73 and the ImageReward benchmark score from 0.87 to 0.96, achieving a\nwin rate of 66%. Without extra optimization, Infinity generates a high-quality\n1024x1024 image in 0.8 seconds, making it 2.6x faster than SD3-Medium and\nestablishing it as the fastest text-to-image model. 
Models and codes will be\nreleased to promote further exploration of Infinity for visual generation and\nunified tokenizer modeling.\n","authors":["Jian Han","Jinlai Liu","Yi Jiang","Bin Yan","Yuqi Zhang","Zehuan Yuan","Bingyue Peng","Xiaobing Liu"],"pdf_url":"https://arxiv.org/pdf/2412.04431v1.pdf","comment":"17 pages, 14 figures"},{"id":"http://arxiv.org/abs/2412.04429v1","updated":"2024-12-05T18:52:00Z","published":"2024-12-05T18:52:00Z","title":"Grounding Descriptions in Images informs Zero-Shot Visual Recognition","summary":" Vision-language models (VLMs) like CLIP have been cherished for their ability\nto perform zero-shot visual recognition on open-vocabulary concepts. This is\nachieved by selecting the object category whose textual representation bears\nthe highest similarity with the query image. While successful in some domains,\nthis method struggles with identifying fine-grained entities as well as\ngeneralizing to unseen concepts that are not captured by the training\ndistribution. Recent works attempt to mitigate these challenges by integrating\ncategory descriptions at test time, albeit yielding modest improvements. We\nattribute these limited gains to a fundamental misalignment between image and\ndescription representations, which is rooted in the pretraining structure of\nCLIP. In this paper, we propose GRAIN, a new pretraining strategy aimed at\naligning representations at both fine and coarse levels simultaneously. Our\napproach learns to jointly ground textual descriptions in image regions along\nwith aligning overarching captions with global image representations. To drive\nthis pre-training, we leverage frozen Multimodal Large Language Models (MLLMs)\nto derive large-scale synthetic annotations. We demonstrate the enhanced\nzero-shot performance of our model compared to current state-of-the art methods\nacross 11 diverse image classification datasets. 
Additionally, we introduce\nProducts-2023, a newly curated, manually labeled dataset featuring novel\nconcepts, and showcase our model's ability to recognize these concepts by\nbenchmarking on it. Significant improvements achieved by our model on other\ndownstream tasks like retrieval further highlight the superior quality of\nrepresentations learned by our approach. Code available at\nhttps://github.com/shaunak27/grain-clip .\n","authors":["Shaunak Halbe","Junjiao Tian","K J Joseph","James Seale Smith","Katherine Stevo","Vineeth N Balasubramanian","Zsolt Kira"],"pdf_url":"https://arxiv.org/pdf/2412.04429v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04424v1","updated":"2024-12-05T18:50:39Z","published":"2024-12-05T18:50:39Z","title":"Florence-VL: Enhancing Vision-Language Models with Generative Vision\n Encoder and Depth-Breadth Fusion","summary":" We present Florence-VL, a new family of multimodal large language models\n(MLLMs) with enriched visual representations produced by Florence-2, a\ngenerative vision foundation model. Unlike the widely used CLIP-style vision\ntransformer trained by contrastive learning, Florence-2 can capture different\nlevels and aspects of visual features, which are more versatile to be adapted\nto diverse downstream tasks. We propose a novel feature-fusion architecture and\nan innovative training recipe that effectively integrates Florence-2's visual\nfeatures into pretrained LLMs, such as Phi 3.5 and LLama 3. In particular, we\npropose \"depth-breath fusion (DBFusion)\" to fuse the visual features extracted\nfrom different depths and under multiple prompts. Our model training is\ncomposed of end-to-end pretraining of the whole model followed by finetuning of\nthe projection layer and the LLM, on a carefully designed recipe of diverse\nopen-source datasets that include high-quality image captions and\ninstruction-tuning pairs. 
Our quantitative analysis and visualization of\nFlorence-VL's visual features show its advantages over popular vision encoders\non vision-language alignment, where the enriched depth and breath play\nimportant roles. Florence-VL achieves significant improvements over existing\nstate-of-the-art MLLMs across various multi-modal and vision-centric benchmarks\ncovering general VQA, perception, hallucination, OCR, Chart,\nknowledge-intensive understanding, etc. To facilitate future research, our\nmodels and the complete training recipe are open-sourced.\nhttps://github.com/JiuhaiChen/Florence-VL\n","authors":["Jiuhai Chen","Jianwei Yang","Haiping Wu","Dianqi Li","Jianfeng Gao","Tianyi Zhou","Bin Xiao"],"pdf_url":"https://arxiv.org/pdf/2412.04424v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.01339v2","updated":"2024-12-05T18:43:25Z","published":"2024-12-02T10:06:57Z","title":"Negative Token Merging: Image-based Adversarial Feature Guidance","summary":" Text-based adversarial guidance using a negative prompt has emerged as a\nwidely adopted approach to steer diffusion models away from producing undesired\nconcepts. While useful, performing adversarial guidance using text alone can be\ninsufficient to capture complex visual concepts or avoid specific visual\nelements like copyrighted characters. In this paper, for the first time we\nexplore an alternate modality in this direction by performing adversarial\nguidance directly using visual features from a reference image or other images\nin a batch. We introduce negative token merging (NegToMe), a simple but\neffective training-free approach which performs adversarial guidance through\nimages by selectively pushing apart matching visual features between reference\nand generated images during the reverse diffusion process. By simply adjusting\nthe used reference, NegToMe enables a diverse range of applications. 
Notably,\nwhen using other images in same batch as reference, we find that NegToMe\nsignificantly enhances output diversity (e.g., racial, gender, visual) by\nguiding features of each image away from others. Similarly, when used w.r.t.\ncopyrighted reference images, NegToMe reduces visual similarity to copyrighted\ncontent by 34.57%. NegToMe is simple to implement using just few-lines of code,\nuses only marginally higher (<4%) inference time and is compatible with\ndifferent diffusion architectures, including those like Flux, which don't\nnatively support the use of a negative prompt. Code is available at\nhttps://negtome.github.io\n","authors":["Jaskirat Singh","Lindsey Li","Weijia Shi","Ranjay Krishna","Yejin Choi","Pang Wei Koh","Michael F. Cohen","Stephen Gould","Liang Zheng","Luke Zettlemoyer"],"pdf_url":"https://arxiv.org/pdf/2412.01339v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04416v1","updated":"2024-12-05T18:42:29Z","published":"2024-12-05T18:42:29Z","title":"FedDUAL: A Dual-Strategy with Adaptive Loss and Dynamic Aggregation for\n Mitigating Data Heterogeneity in Federated Learning","summary":" Federated Learning (FL) marks a transformative approach to distributed model\ntraining by combining locally optimized models from various clients into a\nunified global model. While FL preserves data privacy by eliminating\ncentralized storage, it encounters significant challenges such as performance\ndegradation, slower convergence, and reduced robustness of the global model due\nto the heterogeneity in client data distributions. Among the various forms of\ndata heterogeneity, label skew emerges as a particularly formidable and\nprevalent issue, especially in domains such as image classification. To address\nthese challenges, we begin with comprehensive experiments to pinpoint the\nunderlying issues in the FL training process. 
Based on our findings, we then\nintroduce an innovative dual-strategy approach designed to effectively resolve\nthese issues. First, we introduce an adaptive loss function for client-side\ntraining, meticulously crafted to preserve previously acquired knowledge while\nmaintaining an optimal equilibrium between local optimization and global model\ncoherence. Secondly, we develop a dynamic aggregation strategy for aggregating\nclient models at the server. This approach adapts to each client's unique\nlearning patterns, effectively addressing the challenges of diverse data across\nthe network. Our comprehensive evaluation, conducted across three diverse\nreal-world datasets, coupled with theoretical convergence guarantees,\ndemonstrates the superior efficacy of our method compared to several\nestablished state-of-the-art approaches.\n","authors":["Pranab Sahoo","Ashutosh Tripathi","Sriparna Saha","Samrat Mondal"],"pdf_url":"https://arxiv.org/pdf/2412.04416v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.10162v3","updated":"2024-12-05T18:16:10Z","published":"2023-11-16T19:34:18Z","title":"Learning to Reconstruct Accelerated MRI Through K-space Cold Diffusion\n without Noise","summary":" Deep learning-based MRI reconstruction models have achieved superior\nperformance these days. Most recently, diffusion models have shown remarkable\nperformance in image generation, in-painting, super-resolution, image editing\nand more. As a generalized diffusion model, cold diffusion further broadens the\nscope and considers models built around arbitrary image transformations such as\nblurring, down-sampling, etc. In this paper, we propose a k-space cold\ndiffusion model that performs image degradation and restoration in k-space\nwithout the need for Gaussian noise. We provide comparisons with multiple deep\nlearning-based MRI reconstruction models and perform tests on a well-known\nlarge open-source MRI dataset. 
Our results show that this novel way of\nperforming degradation can generate high-quality reconstruction images for\naccelerated MRI.\n","authors":["Guoyao Shen","Mengyu Li","Chad W. Farris","Stephan Anderson","Xin Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.10162v3.pdf","comment":"21 pages, 5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2308.10968v2","updated":"2024-12-05T18:07:33Z","published":"2023-08-21T18:26:35Z","title":"Regularization by Neural Style Transfer for MRI Field-Transfer\n Reconstruction with Limited Data","summary":" Recent advances in MRI reconstruction have achieved remarkable success with\ndeep learning-based models. However, most methods depend on large-scale,\ntask-specific datasets, leaving reconstruction in data-limited settings as a\ncritical but underexplored challenge. Regularization by denoising (RED) is a\ngeneral pipeline that incorporates a denoiser as a prior for image\nreconstruction, showing promising results in various image processing tasks,\nincluding denoising, deblurring, and super-resolution. In this work, we propose\na regularization by neural style transfer (RNST) method to further leverage the\npriors from the neural transfer and denoising engine. RNST effectively\nreconstructs high-quality images from noisy, low-quality inputs across varying\nimage styles, even with limited data. We validate RNST on clinical MRI scans,\ndemonstrating its ability to significantly improve image quality. These\nfindings underline the potential of RNST for MRI field-transfer reconstruction\nand its promise in addressing reconstruction tasks in data-constrained\nscenarios.\n","authors":["Guoyao Shen","Yancheng Zhu","Mengyu Li","Ryan McNaughton","Hernan Jara","Sean B. Andersson","Chad W. 
Farris","Stephan Anderson","Xin Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.10968v2.pdf","comment":"31 pages, 9 figures, 3 tables, 1 algorithm chart"},{"id":"http://arxiv.org/abs/2412.04384v1","updated":"2024-12-05T17:59:58Z","published":"2024-12-05T17:59:58Z","title":"Probabilistic Gaussian Superposition for Efficient 3D Occupancy\n Prediction","summary":" 3D semantic occupancy prediction is an important task for robust\nvision-centric autonomous driving, which predicts fine-grained geometry and\nsemantics of the surrounding scene. Most existing methods leverage dense\ngrid-based scene representations, overlooking the spatial sparsity of the\ndriving scenes. Although 3D semantic Gaussian serves as an object-centric\nsparse alternative, most of the Gaussians still describe the empty region with\nlow efficiency. To address this, we propose a probabilistic Gaussian\nsuperposition model which interprets each Gaussian as a probability\ndistribution of its neighborhood being occupied and conforms to probabilistic\nmultiplication to derive the overall geometry. Furthermore, we adopt the exact\nGaussian mixture model for semantics calculation to avoid unnecessary\noverlapping of Gaussians. To effectively initialize Gaussians in non-empty\nregion, we design a distribution-based initialization module which learns the\npixel-aligned occupancy distribution instead of the depth of surfaces. 
We\nconduct extensive experiments on nuScenes and KITTI-360 datasets and our\nGaussianFormer-2 achieves state-of-the-art performance with high efficiency.\nCode: https://github.com/huang-yh/GaussianFormer.\n","authors":["Yuanhui Huang","Amonnut Thammatadatrakoon","Wenzhao Zheng","Yunpeng Zhang","Dalong Du","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2412.04384v1.pdf","comment":"Code is available at: https://github.com/huang-yh/GaussianFormer"},{"id":"http://arxiv.org/abs/2412.04383v1","updated":"2024-12-05T17:58:43Z","published":"2024-12-05T17:58:43Z","title":"SeeGround: See and Ground for Zero-Shot Open-Vocabulary 3D Visual\n Grounding","summary":" 3D Visual Grounding (3DVG) aims to locate objects in 3D scenes based on\ntextual descriptions, which is essential for applications like augmented\nreality and robotics. Traditional 3DVG approaches rely on annotated 3D datasets\nand predefined object categories, limiting scalability and adaptability. To\novercome these limitations, we introduce SeeGround, a zero-shot 3DVG framework\nleveraging 2D Vision-Language Models (VLMs) trained on large-scale 2D data. We\npropose to represent 3D scenes as a hybrid of query-aligned rendered images and\nspatially enriched text descriptions, bridging the gap between 3D data and\n2D-VLMs input formats. We propose two modules: the Perspective Adaptation\nModule, which dynamically selects viewpoints for query-relevant image\nrendering, and the Fusion Alignment Module, which integrates 2D images with 3D\nspatial descriptions to enhance object localization. Extensive experiments on\nScanRefer and Nr3D demonstrate that our approach outperforms existing zero-shot\nmethods by large margins. 
Notably, we exceed weakly supervised methods and\nrival some fully supervised ones, outperforming previous SOTA by 7.7% on\nScanRefer and 7.1% on Nr3D, showcasing its effectiveness.\n","authors":["Rong Li","Shijie Li","Lingdong Kong","Xulei Yang","Junwei Liang"],"pdf_url":"https://arxiv.org/pdf/2412.04383v1.pdf","comment":"Preprint; 19 pages, 10 figures, 9 tables; Project Page at\n https://seeground.github.io/"},{"id":"http://arxiv.org/abs/2412.04380v1","updated":"2024-12-05T17:57:09Z","published":"2024-12-05T17:57:09Z","title":"EmbodiedOcc: Embodied 3D Occupancy Prediction for Vision-based Online\n Scene Understanding","summary":" 3D occupancy prediction provides a comprehensive description of the\nsurrounding scenes and has become an essential task for 3D perception. Most\nexisting methods focus on offline perception from one or a few views and cannot\nbe applied to embodied agents which demands to gradually perceive the scene\nthrough progressive embodied exploration. In this paper, we formulate an\nembodied 3D occupancy prediction task to target this practical scenario and\npropose a Gaussian-based EmbodiedOcc framework to accomplish it. We initialize\nthe global scene with uniform 3D semantic Gaussians and progressively update\nlocal regions observed by the embodied agent. For each update, we extract\nsemantic and structural features from the observed image and efficiently\nincorporate them via deformable cross-attention to refine the regional\nGaussians. Finally, we employ Gaussian-to-voxel splatting to obtain the global\n3D occupancy from the updated 3D Gaussians. Our EmbodiedOcc assumes an unknown\n(i.e., uniformly distributed) environment and maintains an explicit global\nmemory of it with 3D Gaussians. It gradually gains knowledge through local\nrefinement of regional Gaussians, which is consistent with how humans\nunderstand new scenes through embodied exploration. 
We reorganize an\nEmbodiedOcc-ScanNet benchmark based on local annotations to facilitate the\nevaluation of the embodied 3D occupancy prediction task. Experiments\ndemonstrate that our EmbodiedOcc outperforms existing local prediction methods\nand accomplishes the embodied occupancy prediction with high accuracy and\nstrong expandability. Our code is available at:\nhttps://github.com/YkiWu/EmbodiedOcc.\n","authors":["Yuqi Wu","Wenzhao Zheng","Sicheng Zuo","Yuanhui Huang","Jie Zhou","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2412.04380v1.pdf","comment":"Code: https://github.com/YkiWu/EmbodiedOcc"},{"id":"http://arxiv.org/abs/2411.17762v2","updated":"2024-12-05T17:54:29Z","published":"2024-11-26T03:33:52Z","title":"MUSE-VL: Modeling Unified VLM through Semantic Discrete Encoding","summary":" We introduce MUSE-VL, a Unified Vision-Language Model through Semantic\ndiscrete Encoding for multimodal understanding and generation. Recently, the\nresearch community has begun exploring unified models for visual generation and\nunderstanding. However, existing vision tokenizers (e.g., VQGAN) only consider\nlow-level information, which makes it difficult to align with texture semantic\nfeatures. This results in high training complexity and necessitates a large\namount of training data to achieve optimal performance. Additionally, their\nperformance is still far from dedicated understanding models. This paper\nproposes Semantic Discrete Encoding (SDE), which effectively aligns the\ninformation of visual tokens and language tokens by adding semantic constraints\nto the visual tokenizer. This greatly reduces training difficulty and improves\nthe performance of the unified model. 
The proposed model significantly\nsurpasses the previous state-of-the-art in various vision-language benchmarks\nand achieves better performance than dedicated understanding models.\n","authors":["Rongchang Xie","Chen Du","Ping Song","Chang Liu"],"pdf_url":"https://arxiv.org/pdf/2411.17762v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04378v1","updated":"2024-12-05T17:54:27Z","published":"2024-12-05T17:54:27Z","title":"Discriminative Fine-tuning of LVLMs","summary":" Contrastively-trained Vision-Language Models (VLMs) like CLIP have become the\nde facto approach for discriminative vision-language representation learning.\nHowever, these models have limited language understanding, often exhibiting a\n\"bag of words\" behavior. At the same time, Large Vision-Language Models\n(LVLMs), which combine vision encoders with LLMs, have been shown capable of\ndetailed vision-language reasoning, yet their autoregressive nature renders\nthem less suitable for discriminative tasks.\n In this work, we propose to combine \"the best of both worlds\": a new training\napproach for discriminative fine-tuning of LVLMs that results in strong\ndiscriminative and compositional capabilities. Essentially, our approach\nconverts a generative LVLM into a discriminative one, unlocking its capability\nfor powerful image-text discrimination combined with enhanced language\nunderstanding.\n Our contributions include: (1) A carefully designed training/optimization\nframework that utilizes image-text pairs of variable length and granularity for\ntraining the model with both contrastive and next-token prediction losses. This\nis accompanied by ablation studies that justify the necessity of our\nframework's components. (2) A parameter-efficient adaptation method using a\ncombination of soft prompting and LoRA adapters. 
(3) Significant improvements\nover state-of-the-art CLIP-like models of similar size, including standard\nimage-text retrieval benchmarks and notable gains in compositionality.\n","authors":["Yassine Ouali","Adrian Bulat","Alexandros Xenos","Anestis Zaganidis","Ioannis Maniadis Metaxas","Georgios Tzimiropoulos","Brais Martinez"],"pdf_url":"https://arxiv.org/pdf/2412.04378v1.pdf","comment":"Preprint. The first two authors contributed equally"},{"id":"http://arxiv.org/abs/2405.08807v2","updated":"2024-12-05T17:52:49Z","published":"2024-05-14T17:54:17Z","title":"SciFIBench: Benchmarking Large Multimodal Models for Scientific Figure\n Interpretation","summary":" Large multimodal models (LMMs) have proven flexible and generalisable across\nmany tasks and fields. Although they have strong potential to aid scientific\nresearch, their capabilities in this domain are not well characterised. A key\naspect of scientific research is the ability to understand and interpret\nfigures, which serve as a rich, compressed source of complex information. In\nthis work, we present SciFIBench, a scientific figure interpretation benchmark\nconsisting of 2000 questions split between two tasks across 8 categories. The\nquestions are curated from arXiv paper figures and captions, using adversarial\nfiltering to find hard negatives and human verification for quality control. We\nevaluate 28 LMMs on SciFIBench, finding it to be a challenging benchmark.\nFinally, we investigate the alignment and reasoning faithfulness of the LMMs on\naugmented question sets from our benchmark. 
We release SciFIBench to encourage\nprogress in this domain.\n","authors":["Jonathan Roberts","Kai Han","Neil Houlsby","Samuel Albanie"],"pdf_url":"https://arxiv.org/pdf/2405.08807v2.pdf","comment":"Accepted at NeurIPS 2024 (Datasets and Benchmarks Track)"},{"id":"http://arxiv.org/abs/2412.04377v1","updated":"2024-12-05T17:52:35Z","published":"2024-12-05T17:52:35Z","title":"A Hitchhiker's Guide to Understanding Performances of Two-Class\n Classifiers","summary":" Properly understanding the performances of classifiers is essential in\nvarious scenarios. However, the literature often relies only on one or two\nstandard scores to compare classifiers, which fails to capture the nuances of\napplication-specific requirements, potentially leading to suboptimal classifier\nselection. Recently, a paper on the foundations of the theory of\nperformance-based ranking introduced a tool, called the Tile, that organizes an\ninfinity of ranking scores into a 2D map. Thanks to the Tile, it is now\npossible to evaluate and compare classifiers efficiently, displaying all\npossible application-specific preferences instead of having to rely on a pair\nof scores. In this paper, we provide a first hitchhiker's guide for\nunderstanding the performances of two-class classifiers by presenting four\nscenarios, each showcasing a different user profile: a theoretical analyst, a\nmethod designer, a benchmarker, and an application developer. Particularly, we\nshow that we can provide different interpretative flavors that are adapted to\nthe user's needs by mapping different values on the Tile. 
As an illustration,\nwe leverage the newly introduced Tile tool and the different flavors to rank\nand analyze the performances of 74 state-of-the-art semantic segmentation\nmodels in two-class classification through the eyes of the four user profiles.\nThrough these user profiles, we demonstrate that the Tile effectively captures\nthe behavior of classifiers in a single visualization, while accommodating an\ninfinite number of ranking scores.\n","authors":["Anaïs Halin","Sébastien Piérard","Anthony Cioppa","Marc Van Droogenbroeck"],"pdf_url":"https://arxiv.org/pdf/2412.04377v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.01951v2","updated":"2024-12-05T17:31:43Z","published":"2024-01-03T19:27:20Z","title":"GeoPos: A Minimal Positional Encoding for Enhanced Fine-Grained Details\n in Image Synthesis Using Convolutional Neural Networks","summary":" The enduring inability of image generative models to recreate intricate\ngeometric features, such as those present in human hands and fingers has been\nan ongoing problem in image generation for nearly a decade. While strides have\nbeen made by increasing model sizes and diversifying training datasets, this\nissue remains prevalent across all models, from denoising diffusion models to\nGenerative Adversarial Networks (GAN), pointing to a fundamental shortcoming in\nthe underlying architectures. In this paper, we demonstrate how this problem\ncan be mitigated by augmenting convolution layers geometric capabilities\nthrough providing them with a single input channel incorporating the relative\nn-dimensional Cartesian coordinate system. We show this drastically improves\nquality of images generated by Diffusion Models, GANs, and Variational\nAutoEncoders (VAE).\n","authors":["Mehran Hosseini","Peyman Hosseini"],"pdf_url":"https://arxiv.org/pdf/2401.01951v2.pdf","comment":"Accepted at WACV 2025. 
Contains 19 pages, 15 figures, and 9 tables"},{"id":"http://arxiv.org/abs/2412.04353v1","updated":"2024-12-05T17:12:35Z","published":"2024-12-05T17:12:35Z","title":"ActFusion: a Unified Diffusion Model for Action Segmentation and\n Anticipation","summary":" Temporal action segmentation and long-term action anticipation are two\npopular vision tasks for the temporal analysis of actions in videos. Despite\napparent relevance and potential complementarity, these two problems have been\ninvestigated as separate and distinct tasks. In this work, we tackle these two\nproblems, action segmentation and action anticipation, jointly using a unified\ndiffusion model dubbed ActFusion. The key idea to unification is to train the\nmodel to effectively handle both visible and invisible parts of the sequence in\nan integrated manner; the visible part is for temporal segmentation, and the\ninvisible part is for future anticipation. To this end, we introduce a new\nanticipative masking strategy during training in which a late part of the video\nframes is masked as invisible, and learnable tokens replace these frames to\nlearn to predict the invisible future. Experimental results demonstrate the\nbi-directional benefits between action segmentation and anticipation. 
ActFusion\nachieves the state-of-the-art performance across the standard benchmarks of 50\nSalads, Breakfast, and GTEA, outperforming task-specific models in both of the\ntwo tasks with a single unified model through joint learning.\n","authors":["Dayoung Gong","Suha Kwak","Minsu Cho"],"pdf_url":"https://arxiv.org/pdf/2412.04353v1.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2312.00112v2","updated":"2024-12-05T17:09:46Z","published":"2023-11-30T18:59:11Z","title":"DynMF: Neural Motion Factorization for Real-time Dynamic View Synthesis\n with 3D Gaussian Splatting","summary":" Accurately and efficiently modeling dynamic scenes and motions is considered\nso challenging a task due to temporal dynamics and motion complexity. To\naddress these challenges, we propose DynMF, a compact and efficient\nrepresentation that decomposes a dynamic scene into a few neural trajectories.\nWe argue that the per-point motions of a dynamic scene can be decomposed into a\nsmall set of explicit or learned trajectories. Our carefully designed neural\nframework consisting of a tiny set of learned basis queried only in time allows\nfor rendering speed similar to 3D Gaussian Splatting, surpassing 120 FPS, while\nat the same time, requiring only double the storage compared to static scenes.\nOur neural representation adequately constrains the inherently underconstrained\nmotion field of a dynamic scene leading to effective and fast optimization.\nThis is done by biding each point to motion coefficients that enforce the\nper-point sharing of basis trajectories. By carefully applying a sparsity loss\nto the motion coefficients, we are able to disentangle the motions that\ncomprise the scene, independently control them, and generate novel motion\ncombinations that have never been seen before. 
We can reach state-of-the-art\nrender quality within just 5 minutes of training and in less than half an hour,\nwe can synthesize novel views of dynamic scenes with superior photorealistic\nquality. Our representation is interpretable, efficient, and expressive enough\nto offer real-time view synthesis of complex dynamic scene motions, in\nmonocular and multi-view scenarios.\n","authors":["Agelos Kratimenos","Jiahui Lei","Kostas Daniilidis"],"pdf_url":"https://arxiv.org/pdf/2312.00112v2.pdf","comment":"Project page: https://agelosk.github.io/dynmf/"},{"id":"http://arxiv.org/abs/2404.01282v3","updated":"2024-12-05T17:07:57Z","published":"2024-04-01T17:54:34Z","title":"LoSA: Long-Short-range Adapter for Scaling End-to-End Temporal Action\n Localization","summary":" Temporal Action Localization (TAL) involves localizing and classifying action\nsnippets in an untrimmed video. The emergence of large video foundation models\nhas led RGB-only video backbones to outperform previous methods needing both\nRGB and optical flow modalities. Leveraging these large models is often limited\nto training only the TAL head due to the prohibitively large GPU memory\nrequired to adapt the video backbone for TAL. To overcome this limitation, we\nintroduce LoSA, the first memory-and-parameter-efficient backbone adapter\ndesigned specifically for TAL to handle untrimmed videos. LoSA specializes for\nTAL by introducing Long-Short-range Adapters that adapt the intermediate layers\nof the video backbone over different temporal ranges. These adapters run\nparallel to the video backbone to significantly reduce memory footprint. LoSA\nalso includes Long-Short-range Gated Fusion that strategically combines the\noutput of these adapters from the video backbone layers to enhance the video\nfeatures provided to the TAL head. 
Experiments show that LoSA significantly\noutperforms all existing methods on standard TAL benchmarks, THUMOS-14 and\nActivityNet-v1.3, by scaling end-to-end backbone adaptation to\nbillion-parameter-plus models like VideoMAEv2~(ViT-g) and leveraging them\nbeyond head-only transfer learning.\n","authors":["Akshita Gupta","Gaurav Mittal","Ahmed Magooda","Ye Yu","Graham W. Taylor","Mei Chen"],"pdf_url":"https://arxiv.org/pdf/2404.01282v3.pdf","comment":"WACV 2025 Accepted"},{"id":"http://arxiv.org/abs/2310.07887v3","updated":"2024-12-05T17:04:59Z","published":"2023-10-11T20:48:20Z","title":"Unsupervised Denoising for Signal-Dependent and Row-Correlated Imaging\n Noise","summary":" Accurate analysis of microscopy images is hindered by the presence of noise.\nThis noise is usually signal-dependent and often additionally correlated along\nrows or columns of pixels. Current self- and unsupervised denoisers can address\nsignal-dependent noise, but none can reliably remove noise that is also row- or\ncolumn-correlated. Here, we present the first fully unsupervised deep\nlearning-based denoiser capable of handling imaging noise that is\nrow-correlated as well as signal-dependent. Our approach uses a Variational\nAutoencoder (VAE) with a specially designed autoregressive decoder. This\ndecoder is capable of modeling row-correlated and signal-dependent noise but is\nincapable of independently modeling underlying clean signal. The VAE therefore\nproduces latent variables containing only clean signal information, and these\nare mapped back into image space using a proposed second decoder network. Our\nmethod does not require a pre-trained noise model and can be trained from\nscratch using unpaired noisy data. 
We benchmark our approach on microscopy\ndatatsets from a range of imaging modalities and sensor types, each with row-\nor column-correlated, signal-dependent noise, and show that it outperforms\nexisting self- and unsupervised denoisers.\n","authors":["Benjamin Salmon","Alexander Krull"],"pdf_url":"https://arxiv.org/pdf/2310.07887v3.pdf","comment":"Accepted in IEEE/CVF Winter Conference on Applications of Computer\n Vision (WACV) 2025"},{"id":"http://arxiv.org/abs/2403.07369v2","updated":"2024-12-05T17:03:36Z","published":"2024-03-12T07:06:50Z","title":"Textual Knowledge Matters: Cross-Modality Co-Teaching for Generalized\n Visual Class Discovery","summary":" In this paper, we study the problem of Generalized Category Discovery (GCD),\nwhich aims to cluster unlabeled data from both known and unknown categories\nusing the knowledge of labeled data from known categories. Current GCD methods\nrely on only visual cues, which however neglect the multi-modality perceptive\nnature of human cognitive processes in discovering novel visual categories. To\naddress this, we propose a two-phase TextGCD framework to accomplish\nmulti-modality GCD by exploiting powerful Visual-Language Models. TextGCD\nmainly includes a retrieval-based text generation (RTG) phase and a\ncross-modality co-teaching (CCT) phase. First, RTG constructs a visual lexicon\nusing category tags from diverse datasets and attributes from Large Language\nModels, generating descriptive texts for images in a retrieval manner. Second,\nCCT leverages disparities between textual and visual modalities to foster\nmutual learning, thereby enhancing visual GCD. In addition, we design an\nadaptive class aligning strategy to ensure the alignment of category\nperceptions between modalities as well as a soft-voting mechanism to integrate\nmulti-modality cues. Experiments on eight datasets show the large superiority\nof our approach over state-of-the-art methods. 
Notably, our approach\noutperforms the best competitor, by 7.7% and 10.8% in All accuracy on\nImageNet-1k and CUB, respectively.\n","authors":["Haiyang Zheng","Nan Pu","Wenjing Li","Nicu Sebe","Zhun Zhong"],"pdf_url":"https://arxiv.org/pdf/2403.07369v2.pdf","comment":"Accepted by ECCV2024"},{"id":"http://arxiv.org/abs/2412.04343v1","updated":"2024-12-05T17:01:09Z","published":"2024-12-05T17:01:09Z","title":"RMD: A Simple Baseline for More General Human Motion Generation via\n Training-free Retrieval-Augmented Motion Diffuse","summary":" While motion generation has made substantial progress, its practical\napplication remains constrained by dataset diversity and scale, limiting its\nability to handle out-of-distribution scenarios. To address this, we propose a\nsimple and effective baseline, RMD, which enhances the generalization of motion\ngeneration through retrieval-augmented techniques. Unlike previous\nretrieval-based methods, RMD requires no additional training and offers three\nkey advantages: (1) the external retrieval database can be flexibly replaced;\n(2) body parts from the motion database can be reused, with an LLM facilitating\nsplitting and recombination; and (3) a pre-trained motion diffusion model\nserves as a prior to improve the quality of motions obtained through retrieval\nand direct combination. 
Without any training, RMD achieves state-of-the-art\nperformance, with notable advantages on out-of-distribution data.\n","authors":["Zhouyingcheng Liao","Mingyuan Zhang","Wenjia Wang","Lei Yang","Taku Komura"],"pdf_url":"https://arxiv.org/pdf/2412.04343v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04339v1","updated":"2024-12-05T16:58:45Z","published":"2024-12-05T16:58:45Z","title":"Likelihood-Scheduled Score-Based Generative Modeling for Fully 3D PET\n Image Reconstruction","summary":" Medical image reconstruction with pre-trained score-based generative models\n(SGMs) has advantages over other existing state-of-the-art deep-learned\nreconstruction methods, including improved resilience to different scanner\nsetups and advanced image distribution modeling. SGM-based reconstruction has\nrecently been applied to simulated positron emission tomography (PET) datasets,\nshowing improved contrast recovery for out-of-distribution lesions relative to\nthe state-of-the-art. However, existing methods for SGM-based reconstruction\nfrom PET data suffer from slow reconstruction, burdensome hyperparameter tuning\nand slice inconsistency effects (in 3D). In this work, we propose a practical\nmethodology for fully 3D reconstruction that accelerates reconstruction and\nreduces the number of critical hyperparameters by matching the likelihood of an\nSGM's reverse diffusion process to a current iterate of the maximum-likelihood\nexpectation maximization algorithm. Using the example of low-count\nreconstruction from simulated $[^{18}$F]DPA-714 datasets, we show our\nmethodology can match or improve on the NRMSE and SSIM of existing\nstate-of-the-art SGM-based PET reconstruction while reducing reconstruction\ntime and the need for hyperparameter tuning. 
We evaluate our methodology\nagainst state-of-the-art supervised and conventional reconstruction algorithms.\nFinally, we demonstrate a first-ever implementation of SGM-based reconstruction\nfor real 3D PET data, specifically $[^{18}$F]DPA-714 data, where we integrate\nperpendicular pre-trained SGMs to eliminate slice inconsistency issues.\n","authors":["George Webber","Yuya Mizuno","Oliver D. Howes","Alexander Hammers","Andrew P. King","Andrew J. Reader"],"pdf_url":"https://arxiv.org/pdf/2412.04339v1.pdf","comment":"11 pages, 12 figures. Submitted to Transactions on Medical Imaging"},{"id":"http://arxiv.org/abs/2412.04337v1","updated":"2024-12-05T16:54:39Z","published":"2024-12-05T16:54:39Z","title":"Reflective Teacher: Semi-Supervised Multimodal 3D Object Detection in\n Bird's-Eye-View via Uncertainty Measure","summary":" Applying pseudo labeling techniques has been found to be advantageous in\nsemi-supervised 3D object detection (SSOD) in Bird's-Eye-View (BEV) for\nautonomous driving, particularly where labeled data is limited. In the\nliterature, Exponential Moving Average (EMA) has been used for adjustments of\nthe weights of teacher network by the student network. However, the same\ninduces catastrophic forgetting in the teacher network. In this work, we\naddress this issue by introducing a novel concept of Reflective Teacher where\nthe student is trained by both labeled and pseudo labeled data while its\nknowledge is progressively passed to the teacher through a regularizer to\nensure retention of previous knowledge. Additionally, we propose Geometry Aware\nBEV Fusion (GA-BEVFusion) for efficient alignment of multi-modal BEV features,\nthus reducing the disparity between the modalities - camera and LiDAR. This\nhelps to map the precise geometric information embedded among LiDAR points\nreliably with the spatial priors for extraction of semantic information from\ncamera images. 
Our experiments on the nuScenes and Waymo datasets demonstrate:\n1) improved performance over state-of-the-art methods in both fully supervised\nand semi-supervised settings; 2) Reflective Teacher achieves equivalent\nperformance with only 25% and 22% of labeled data for nuScenes and Waymo\ndatasets respectively, in contrast to other fully supervised methods that\nutilize the full labeled dataset.\n","authors":["Saheli Hazra","Sudip Das","Rohit Choudhary","Arindam Das","Ganesh Sistu","Ciaran Eising","Ujjwal Bhattacharya"],"pdf_url":"https://arxiv.org/pdf/2412.04337v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04332v1","updated":"2024-12-05T16:48:16Z","published":"2024-12-05T16:48:16Z","title":"Liquid: Language Models are Scalable Multi-modal Generators","summary":" We present Liquid, an auto-regressive generation paradigm that seamlessly\nintegrates visual comprehension and generation by tokenizing images into\ndiscrete codes and learning these code embeddings alongside text tokens within\na shared feature space for both vision and language. Unlike previous multimodal\nlarge language model (MLLM), Liquid achieves this integration using a single\nlarge language model (LLM), eliminating the need for external pretrained visual\nembeddings such as CLIP. For the first time, Liquid uncovers a scaling law that\nperformance drop unavoidably brought by the unified training of visual and\nlanguage tasks diminishes as the model size increases. Furthermore, the unified\ntoken space enables visual generation and comprehension tasks to mutually\nenhance each other, effectively removing the typical interference seen in\nearlier models. We show that existing LLMs can serve as strong foundations for\nLiquid, saving 100x in training costs while outperforming Chameleon in\nmultimodal capabilities and maintaining language performance comparable to\nmainstream LLMs like LLAMA2. 
Liquid also outperforms models like SD v2.1 and\nSD-XL (FID of 5.47 on MJHQ-30K), excelling in both vision-language and\ntext-only tasks. This work demonstrates that LLMs such as LLAMA3.2 and GEMMA2\nare powerful multimodal generators, offering a scalable solution for enhancing\nboth vision-language understanding and generation. The code and models will be\nreleased.\n","authors":["Junfeng Wu","Yi Jiang","Chuofan Ma","Yuliang Liu","Hengshuang Zhao","Zehuan Yuan","Song Bai","Xiang Bai"],"pdf_url":"https://arxiv.org/pdf/2412.04332v1.pdf","comment":"Technical report. Will be updated soon"},{"id":"http://arxiv.org/abs/2412.04324v1","updated":"2024-12-05T16:40:33Z","published":"2024-12-05T16:40:33Z","title":"Multi-Subject Image Synthesis as a Generative Prior for Single-Subject\n PET Image Reconstruction","summary":" Large high-quality medical image datasets are difficult to acquire but\nnecessary for many deep learning applications. For positron emission tomography\n(PET), reconstructed image quality is limited by inherent Poisson noise. We\npropose a novel method for synthesising diverse and realistic pseudo-PET images\nwith improved signal-to-noise ratio. We also show how our pseudo-PET images may\nbe exploited as a generative prior for single-subject PET image reconstruction.\nFirstly, we perform deep-learned deformable registration of multi-subject\nmagnetic resonance (MR) images paired to multi-subject PET images. We then use\nthe anatomically-learned deformation fields to transform multiple PET images to\nthe same reference space, before averaging random subsets of the transformed\nmulti-subject data to form a large number of varying pseudo-PET images. We\nobserve that using MR information for registration imbues the resulting\npseudo-PET images with improved anatomical detail compared to the originals. 
We\nconsider applications to PET image reconstruction, by generating pseudo-PET\nimages in the same space as the intended single-subject reconstruction and\nusing them as training data for a diffusion model-based reconstruction method.\nWe show visual improvement and reduced background noise in our 2D\nreconstructions as compared to OSEM, MAP-EM and an existing state-of-the-art\ndiffusion model-based approach. Our method shows the potential for utilising\nhighly subject-specific prior information within a generative reconstruction\nframework. Future work may compare the benefits of our approach to explicitly\nMR-guided reconstruction methodologies.\n","authors":["George Webber","Yuya Mizuno","Oliver D. Howes","Alexander Hammers","Andrew P. King","Andrew J. Reader"],"pdf_url":"https://arxiv.org/pdf/2412.04324v1.pdf","comment":"2 pages, 3 figures. Accepted as a poster presentation at IEEE NSS MIC\n RTSD 2024 (submitted May 2024; accepted July 2024; presented Nov 2024)"},{"id":"http://arxiv.org/abs/2412.04319v1","updated":"2024-12-05T16:35:43Z","published":"2024-12-05T16:35:43Z","title":"Generative-Model-Based Fully 3D PET Image Reconstruction by Conditional\n Diffusion Sampling","summary":" Score-based generative models (SGMs) have recently shown promising results\nfor image reconstruction on simulated positron emission tomography (PET)\ndatasets. In this work we have developed and implemented practical methodology\nfor 3D image reconstruction with SGMs, and perform (to our knowledge) the first\nSGM-based reconstruction of real fully 3D PET data. We train an SGM on\nfull-count reference brain images, and extend methodology to allow SGM-based\nreconstructions at very low counts (1% of original, to simulate low-dose or\nshort-duration scanning). We then perform reconstructions for multiple\nindependent realisations of 1% count data, allowing us to analyse the bias and\nvariance characteristics of the method. 
We sample from the learned posterior\ndistribution of the generative algorithm to calculate uncertainty images for\nour reconstructions. We evaluate the method's performance on real full- and\nlow-count PET data and compare with conventional OSEM and MAP-EM baselines,\nshowing that our SGM-based low-count reconstructions match full-dose\nreconstructions more closely and in a bias-variance trade-off comparison, our\nSGM-reconstructed images have lower variance than existing baselines. Future\nwork will compare to supervised deep-learned methods, with other avenues for\ninvestigation including how data conditioning affects the SGM's posterior\ndistribution and the algorithm's performance with different tracers.\n","authors":["George Webber","Yuya Mizuno","Oliver D. Howes","Alexander Hammers","Andrew P. King","Andrew J. Reader"],"pdf_url":"https://arxiv.org/pdf/2412.04319v1.pdf","comment":"2 pages, 2 figures. Accepted for oral presentation at IEEE NSS MIC\n RTSD 2024 (submitted May 2024; accepted July 2024; presented Nov 2024)"},{"id":"http://arxiv.org/abs/2311.12068v3","updated":"2024-12-05T16:34:21Z","published":"2023-11-19T17:28:28Z","title":"Enhancing Novel Object Detection via Cooperative Foundational Models","summary":" In this work, we address the challenging and emergent problem of novel object\ndetection (NOD), focusing on the accurate detection of both known and novel\nobject categories during inference. Traditional object detection algorithms are\ninherently closed-set, limiting their capability to handle NOD. We present a\nnovel approach to transform existing closed-set detectors into open-set\ndetectors. This transformation is achieved by leveraging the complementary\nstrengths of pre-trained foundational models, specifically CLIP and SAM,\nthrough our cooperative mechanism. Furthermore, by integrating this mechanism\nwith state-of-the-art open-set detectors such as GDINO, we establish new\nbenchmarks in object detection performance. 
Our method achieves 17.42 mAP in\nnovel object detection and 42.08 mAP for known objects on the challenging LVIS\ndataset. Adapting our approach to the COCO OVD split, we surpass the current\nstate-of-the-art by a margin of 7.2 $ \\text{AP}_{50} $ for novel classes. Our\ncode is available at https://rohit901.github.io/coop-foundation-models/ .\n","authors":["Rohit Bharadwaj","Muzammal Naseer","Salman Khan","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2311.12068v3.pdf","comment":"Accepted at WACV 2025"},{"id":"http://arxiv.org/abs/2412.04317v1","updated":"2024-12-05T16:34:07Z","published":"2024-12-05T16:34:07Z","title":"FlashSloth: Lightning Multimodal Large Language Models via Embedded\n Visual Compression","summary":" Despite a big leap forward in capability, multimodal large language models\n(MLLMs) tend to behave like a sloth in practical use, i.e., slow response and\nlarge latency. Recent efforts are devoted to building tiny MLLMs for better\nefficiency, but the plethora of visual tokens still used limit their actual\nspeedup. In this paper, we propose a powerful and fast tiny MLLM called\nFlashSloth. Different from previous efforts, FlashSloth focuses on improving\nthe descriptive power of visual tokens in the process of compressing their\nredundant semantics. In particular, FlashSloth introduces embedded visual\ncompression designs to capture both visually salient and instruction-related\nimage information, so as to achieving superior multimodal performance with\nfewer visual tokens. Extensive experiments are conducted to validate the\nproposed FlashSloth, and a bunch of tiny but strong MLLMs are also\ncomprehensively compared, e.g., InternVL2, MiniCPM-V2 and Qwen2-VL. 
The\nexperimental results show that compared with these advanced tiny MLLMs, our\nFlashSloth can greatly reduce the number of visual tokens, training memory and\ncomputation complexity while retaining high performance on various VL tasks.\n","authors":["Bo Tong","Bokai Lai","Yiyi Zhou","Gen Luo","Yunhang Shen","Ke Li","Xiaoshuai Sun","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2412.04317v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04314v1","updated":"2024-12-05T16:30:54Z","published":"2024-12-05T16:30:54Z","title":"LocalSR: Image Super-Resolution in Local Region","summary":" Standard single-image super-resolution (SR) upsamples and restores entire\nimages. Yet several real-world applications require higher resolutions only in\nspecific regions, such as license plates or faces, making the super-resolution\nof the entire image, along with the associated memory and computational cost,\nunnecessary. We propose a novel task, called LocalSR, to restore only local\nregions of the low-resolution image. For this problem setting, we propose a\ncontext-based local super-resolution (CLSR) to super-resolve only specified\nregions of interest (ROI) while leveraging the entire image as context. Our\nmethod uses three parallel processing modules: a base module for\nsuper-resolving the ROI, a global context module for gathering helpful features\nfrom across the image, and a proximity integration module for concentrating on\nareas surrounding the ROI, progressively propagating features from distant\npixels to the target region. 
Experimental results indicate that our approach,\nwith its reduced low complexity, outperforms variants that focus exclusively on\nthe ROI.\n","authors":["Bo Ji","Angela Yao"],"pdf_url":"https://arxiv.org/pdf/2412.04314v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04309v1","updated":"2024-12-05T16:27:59Z","published":"2024-12-05T16:27:59Z","title":"The Tile: A 2D Map of Ranking Scores for Two-Class Classification","summary":" In the computer vision and machine learning communities, as well as in many\nother research domains, rigorous evaluation of any new method, including\nclassifiers, is essential. One key component of the evaluation process is the\nability to compare and rank methods. However, ranking classifiers and\naccurately comparing their performances, especially when taking\napplication-specific preferences into account, remains challenging. For\ninstance, commonly used evaluation tools like Receiver Operating Characteristic\n(ROC) and Precision/Recall (PR) spaces display performances based on two\nscores. Hence, they are inherently limited in their ability to compare\nclassifiers across a broader range of scores and lack the capability to\nestablish a clear ranking among classifiers. In this paper, we present a novel\nversatile tool, named the Tile, that organizes an infinity of ranking scores in\na single 2D map for two-class classifiers, including common evaluation scores\nsuch as the accuracy, the true positive rate, the positive predictive value,\nJaccard's coefficient, and all F-beta scores. Furthermore, we study the\nproperties of the underlying ranking scores, such as the influence of the\npriors or the correspondences with the ROC space, and depict how to\ncharacterize any other score by comparing them to the Tile. 
Overall, we\ndemonstrate that the Tile is a powerful tool that effectively captures all the\nrankings in a single visualization and allows interpreting them.\n","authors":["Sébastien Piérard","Anaïs Halin","Anthony Cioppa","Adrien Deliège","Marc Van Droogenbroeck"],"pdf_url":"https://arxiv.org/pdf/2412.04309v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04304v1","updated":"2024-12-05T16:25:27Z","published":"2024-12-05T16:25:27Z","title":"Towards Zero-shot 3D Anomaly Localization","summary":" 3D anomaly detection and localization is of great significance for industrial\ninspection. Prior 3D anomaly detection and localization methods focus on the\nsetting that the testing data share the same category as the training data\nwhich is normal. However, in real-world applications, the normal training data\nfor the target 3D objects can be unavailable due to issues like data privacy or\nexport control regulation. To tackle these challenges, we identify a new task\n-- zero-shot 3D anomaly detection and localization, where the training and\ntesting classes do not overlap. To this end, we design 3DzAL, a novel\npatch-level contrastive learning framework based on pseudo anomalies generated\nusing the inductive bias from task-irrelevant 3D xyz data to learn more\nrepresentative feature representations. Furthermore, we train a normalcy\nclassifier network to classify the normal patches and pseudo anomalies and\nutilize the classification result jointly with feature distance to design\nanomaly scores. Instead of directly using the patch point clouds, we introduce\nadversarial perturbations to the input patch xyz data before feeding into the\n3D normalcy classifier for the classification-based anomaly score. 
We show that\n3DzAL outperforms the state-of-the-art anomaly detection and localization\nperformance.\n","authors":["Yizhou Wang","Kuan-Chuan Peng","Yun Fu"],"pdf_url":"https://arxiv.org/pdf/2412.04304v1.pdf","comment":"This paper is accepted to WACV 2025"},{"id":"http://arxiv.org/abs/2409.17978v2","updated":"2024-12-05T16:24:15Z","published":"2024-09-26T15:52:36Z","title":"HydraViT: Stacking Heads for a Scalable ViT","summary":" The architecture of Vision Transformers (ViTs), particularly the Multi-head\nAttention (MHA) mechanism, imposes substantial hardware demands. Deploying ViTs\non devices with varying constraints, such as mobile phones, requires multiple\nmodels of different sizes. However, this approach has limitations, such as\ntraining and storing each required model separately. This paper introduces\nHydraViT, a novel approach that addresses these limitations by stacking\nattention heads to achieve a scalable ViT. By repeatedly changing the size of\nthe embedded dimensions throughout each layer and their corresponding number of\nattention heads in MHA during training, HydraViT induces multiple subnetworks.\nThereby, HydraViT achieves adaptability across a wide spectrum of hardware\nenvironments while maintaining performance. Our experimental results\ndemonstrate the efficacy of HydraViT in achieving a scalable ViT with up to 10\nsubnetworks, covering a wide range of resource constraints. HydraViT achieves\nup to 5 p.p. more accuracy with the same GMACs and up to 7 p.p. more accuracy\nwith the same throughput on ImageNet-1K compared to the baselines, making it an\neffective solution for scenarios where hardware availability is diverse or\nvaries over time. 
Source code available at https://github.com/ds-kiel/HydraViT.\n","authors":["Janek Haberer","Ali Hojjat","Olaf Landsiedel"],"pdf_url":"https://arxiv.org/pdf/2409.17978v2.pdf","comment":"Accepted at NeurIPS'24, please cite the conference version"},{"id":"http://arxiv.org/abs/2412.04301v1","updated":"2024-12-05T16:23:11Z","published":"2024-12-05T16:23:11Z","title":"SwiftEdit: Lightning Fast Text-Guided Image Editing via One-Step\n Diffusion","summary":" Recent advances in text-guided image editing enable users to perform image\nedits through simple text inputs, leveraging the extensive priors of multi-step\ndiffusion-based text-to-image models. However, these methods often fall short\nof the speed demands required for real-world and on-device applications due to\nthe costly multi-step inversion and sampling process involved. In response to\nthis, we introduce SwiftEdit, a simple yet highly efficient editing tool that\nachieve instant text-guided image editing (in 0.23s). The advancement of\nSwiftEdit lies in its two novel contributions: a one-step inversion framework\nthat enables one-step image reconstruction via inversion and a mask-guided\nediting technique with our proposed attention rescaling mechanism to perform\nlocalized image editing. Extensive experiments are provided to demonstrate the\neffectiveness and efficiency of SwiftEdit. In particular, SwiftEdit enables\ninstant text-guided image editing, which is extremely faster than previous\nmulti-step methods (at least 50 times faster) while maintain a competitive\nperformance in editing results. 
Our project page is at:\nhttps://swift-edit.github.io/\n","authors":["Trong-Tung Nguyen","Quang Nguyen","Khoi Nguyen","Anh Tran","Cuong Pham"],"pdf_url":"https://arxiv.org/pdf/2412.04301v1.pdf","comment":"16 pages, 15 figures"},{"id":"http://arxiv.org/abs/2412.04300v1","updated":"2024-12-05T16:21:01Z","published":"2024-12-05T16:21:01Z","title":"T2I-FactualBench: Benchmarking the Factuality of Text-to-Image Models\n with Knowledge-Intensive Concepts","summary":" Evaluating the quality of synthesized images remains a significant challenge\nin the development of text-to-image (T2I) generation. Most existing studies in\nthis area primarily focus on evaluating text-image alignment, image quality,\nand object composition capabilities, with comparatively fewer studies\naddressing the evaluation of the factuality of T2I models, particularly when\nthe concepts involved are knowledge-intensive. To mitigate this gap, we present\nT2I-FactualBench in this work - the largest benchmark to date in terms of the\nnumber of concepts and prompts specifically designed to evaluate the factuality\nof knowledge-intensive concept generation. T2I-FactualBench consists of a\nthree-tiered knowledge-intensive text-to-image generation framework, ranging\nfrom the basic memorization of individual knowledge concepts to the more\ncomplex composition of multiple knowledge concepts. We further introduce a\nmulti-round visual question answering (VQA) based evaluation framework to\nassess the factuality of three-tiered knowledge-intensive text-to-image\ngeneration tasks. 
Experiments on T2I-FactualBench indicate that current\nstate-of-the-art (SOTA) T2I models still leave significant room for\nimprovement.\n","authors":["Ziwei Huang","Wanggui He","Quanyu Long","Yandi Wang","Haoyuan Li","Zhelun Yu","Fangxun Shu","Long Chen","Hao Jiang","Leilei Gan"],"pdf_url":"https://arxiv.org/pdf/2412.04300v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04296v1","updated":"2024-12-05T16:15:32Z","published":"2024-12-05T16:15:32Z","title":"Structure-Aware Stylized Image Synthesis for Robust Medical Image\n Segmentation","summary":" Accurate medical image segmentation is essential for effective diagnosis and\ntreatment planning but is often challenged by domain shifts caused by\nvariations in imaging devices, acquisition conditions, and patient-specific\nattributes. Traditional domain generalization methods typically require\ninclusion of parts of the test domain within the training set, which is not\nalways feasible in clinical settings with limited diverse data. Additionally,\nalthough diffusion models have demonstrated strong capabilities in image\ngeneration and style transfer, they often fail to preserve the critical\nstructural information necessary for precise medical analysis. To address these\nissues, we propose a novel medical image segmentation method that combines\ndiffusion models and Structure-Preserving Network for structure-aware one-shot\nimage stylization. Our approach effectively mitigates domain shifts by\ntransforming images from various sources into a consistent style while\nmaintaining the location, size, and shape of lesions. This ensures robust and\naccurate segmentation even when the target domain is absent from the training\ndata. Experimental evaluations on colonoscopy polyp segmentation and skin\nlesion segmentation datasets show that our method enhances the robustness and\naccuracy of segmentation models, achieving superior performance metrics\ncompared to baseline models without style transfer. 
This structure-aware\nstylization framework offers a practical solution for improving medical image\nsegmentation across diverse domains, facilitating more reliable clinical\ndiagnoses.\n","authors":["Jie Bao","Zhixin Zhou","Wen Jung Li","Rui Luo"],"pdf_url":"https://arxiv.org/pdf/2412.04296v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04292v1","updated":"2024-12-05T16:12:25Z","published":"2024-12-05T16:12:25Z","title":"SIDA: Social Media Image Deepfake Detection, Localization and\n Explanation with Large Multimodal Model","summary":" The rapid advancement of generative models in creating highly realistic\nimages poses substantial risks for misinformation dissemination. For instance,\na synthetic image, when shared on social media, can mislead extensive audiences\nand erode trust in digital content, resulting in severe repercussions. Despite\nsome progress, academia has not yet created a large and diversified deepfake\ndetection dataset for social media, nor has it devised an effective solution to\naddress this issue. In this paper, we introduce the Social media Image\nDetection dataSet (SID-Set), which offers three key advantages: (1) extensive\nvolume, featuring 300K AI-generated/tampered and authentic images with\ncomprehensive annotations, (2) broad diversity, encompassing fully synthetic\nand tampered images across various classes, and (3) elevated realism, with\nimages that are predominantly indistinguishable from genuine ones through mere\nvisual inspection. Furthermore, leveraging the exceptional capabilities of\nlarge multimodal models, we propose a new image deepfake detection,\nlocalization, and explanation framework, named SIDA (Social media Image\nDetection, localization, and explanation Assistant). 
SIDA not only discerns the\nauthenticity of images, but also delineates tampered regions through mask\nprediction and provides textual explanations of the model's judgment criteria.\nCompared with state-of-the-art deepfake detection models on SID-Set and other\nbenchmarks, extensive experiments demonstrate that SIDA achieves superior\nperformance among diversified settings. The code, model, and dataset will be\nreleased.\n","authors":["Zhenglin Huang","Jinwei Hu","Xiangtai Li","Yiwei He","Xingyu Zhao","Bei Peng","Baoyuan Wu","Xiaowei Huang","Guangliang Cheng"],"pdf_url":"https://arxiv.org/pdf/2412.04292v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04282v1","updated":"2024-12-05T16:03:37Z","published":"2024-12-05T16:03:37Z","title":"Learnable Infinite Taylor Gaussian for Dynamic View Rendering","summary":" Capturing the temporal evolution of Gaussian properties such as position,\nrotation, and scale is a challenging task due to the vast number of\ntime-varying parameters and the limited photometric data available, which\ngenerally results in convergence issues, making it difficult to find an optimal\nsolution. While feeding all inputs into an end-to-end neural network can\neffectively model complex temporal dynamics, this approach lacks explicit\nsupervision and struggles to generate high-quality transformation fields. On\nthe other hand, using time-conditioned polynomial functions to model Gaussian\ntrajectories and orientations provides a more explicit and interpretable\nsolution, but requires significant handcrafted effort and lacks\ngeneralizability across diverse scenes. To overcome these limitations, this\npaper introduces a novel approach based on a learnable infinite Taylor Formula\nto model the temporal evolution of Gaussians. 
This method offers both the\nflexibility of an implicit network-based approach and the interpretability of\nexplicit polynomial functions, allowing for more robust and generalizable\nmodeling of Gaussian dynamics across various dynamic scenes. Extensive\nexperiments on dynamic novel view rendering tasks are conducted on public\ndatasets, demonstrating that the proposed method achieves state-of-the-art\nperformance in this domain. More information is available on our project\npage(https://ellisonking.github.io/TaylorGaussian).\n","authors":["Bingbing Hu","Yanyan Li","Rui Xie","Bo Xu","Haoye Dong","Junfeng Yao","Gim Hee Lee"],"pdf_url":"https://arxiv.org/pdf/2412.04282v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03200v2","updated":"2024-12-05T16:02:22Z","published":"2024-12-04T10:40:17Z","title":"Fab-ME: A Vision State-Space and Attention-Enhanced Framework for Fabric\n Defect Detection","summary":" Effective defect detection is critical for ensuring the quality,\nfunctionality, and economic value of textile products. However, existing\nmethods face challenges in achieving high accuracy, real-time performance, and\nefficient global information extraction. To address these issues, we propose\nFab-ME, an advanced framework based on YOLOv8s, specifically designed for the\naccurate detection of 20 fabric defect types. Our contributions include the\nintroduction of the cross-stage partial bottleneck with two convolutions (C2F)\nvision state-space (C2F-VMamba) module, which integrates visual state-space\n(VSS) blocks into the YOLOv8s feature fusion network neck, enhancing the\ncapture of intricate details and global context while maintaining high\nprocessing speeds. Additionally, we incorporate an enhanced multi-scale channel\nattention (EMCA) module into the final layer of the feature extraction network,\nsignificantly improving sensitivity to small targets. 
Experimental results on\nthe Tianchi fabric defect detection dataset demonstrate that Fab-ME achieves a\n3.5% improvement in mAP@0.5 compared to the original YOLOv8s, validating its\neffectiveness for precise and efficient fabric defect detection.\n","authors":["Shuai Wang","Huiyan Kong","Baotian Li","Fa Zheng"],"pdf_url":"https://arxiv.org/pdf/2412.03200v2.pdf","comment":"6 pages, 5 figures"},{"id":"http://arxiv.org/abs/2412.04280v1","updated":"2024-12-05T16:00:59Z","published":"2024-12-05T16:00:59Z","title":"HumanEdit: A High-Quality Human-Rewarded Dataset for Instruction-based\n Image Editing","summary":" We present HumanEdit, a high-quality, human-rewarded dataset specifically\ndesigned for instruction-guided image editing, enabling precise and diverse\nimage manipulations through open-form language instructions. Previous\nlarge-scale editing datasets often incorporate minimal human feedback, leading\nto challenges in aligning datasets with human preferences. HumanEdit bridges\nthis gap by employing human annotators to construct data pairs and\nadministrators to provide feedback. With meticulously curation, HumanEdit\ncomprises 5,751 images and requires more than 2,500 hours of human effort\nacross four stages, ensuring both accuracy and reliability for a wide range of\nimage editing tasks. The dataset includes six distinct types of editing\ninstructions: Action, Add, Counting, Relation, Remove, and Replace,\nencompassing a broad spectrum of real-world scenarios. All images in the\ndataset are accompanied by masks, and for a subset of the data, we ensure that\nthe instructions are sufficiently detailed to support mask-free editing.\nFurthermore, HumanEdit offers comprehensive diversity and high-resolution $1024\n\\times 1024$ content sourced from various domains, setting a new versatile\nbenchmark for instructional image editing datasets. 
With the aim of advancing\nfuture research and establishing evaluation benchmarks in the field of image\nediting, we release HumanEdit at\n\\url{https://huggingface.co/datasets/BryanW/HumanEdit}.\n","authors":["Jinbin Bai","Wei Chow","Ling Yang","Xiangtai Li","Juncheng Li","Hanwang Zhang","Shuicheng Yan"],"pdf_url":"https://arxiv.org/pdf/2412.04280v1.pdf","comment":"Codes and Supplementary Material: https://github.com/viiika/HumanEdit"},{"id":"http://arxiv.org/abs/2412.04279v1","updated":"2024-12-05T16:00:55Z","published":"2024-12-05T16:00:55Z","title":"Targeted Hard Sample Synthesis Based on Estimated Pose and Occlusion\n Error for Improved Object Pose Estimation","summary":" 6D Object pose estimation is a fundamental component in robotics enabling\nefficient interaction with the environment. It is particularly challenging in\nbin-picking applications, where objects may be textureless and in difficult\nposes, and occlusion between objects of the same type may cause confusion even\nin well-trained models. We propose a novel method of hard example synthesis\nthat is model-agnostic, using existing simulators and the modeling of pose\nerror in both the camera-to-object viewsphere and occlusion space. Through\nevaluation of the model performance with respect to the distribution of object\nposes and occlusions, we discover regions of high error and generate realistic\ntraining samples to specifically target these regions. With our training\napproach, we demonstrate an improvement in correct detection rate of up to 20%\nacross several ROBI-dataset objects using state-of-the-art pose estimation\nmodels.\n","authors":["Alan Li","Angela P. 
Schoellig"],"pdf_url":"https://arxiv.org/pdf/2412.04279v1.pdf","comment":"To be published in IEEE Robotics and Automation Letters (RA-L)"},{"id":"http://arxiv.org/abs/2408.02993v3","updated":"2024-12-05T15:59:27Z","published":"2024-08-06T06:59:15Z","title":"DreamLCM: Towards High-Quality Text-to-3D Generation via Latent\n Consistency Model","summary":" Recently, the text-to-3D task has developed rapidly due to the appearance of\nthe SDS method. However, the SDS method always generates 3D objects with poor\nquality due to the over-smooth issue. This issue is attributed to two factors:\n1) the DDPM single-step inference produces poor guidance gradients; 2) the\nrandomness from the input noises and timesteps averages the details of the 3D\ncontents. In this paper, to address the issue, we propose DreamLCM which\nincorporates the Latent Consistency Model (LCM). DreamLCM leverages the\npowerful image generation capabilities inherent in LCM, enabling generating\nconsistent and high-quality guidance, i.e., predicted noises or images. Powered\nby the improved guidance, the proposed method can provide accurate and detailed\ngradients to optimize the target 3D models. In addition, we propose two\nstrategies to enhance the generation quality further. Firstly, we propose a\nguidance calibration strategy, utilizing Euler Solver to calibrate the guidance\ndistribution to accelerate 3D models to converge. Secondly, we propose a dual\ntimestep strategy, increasing the consistency of guidance and optimizing 3D\nmodels from geometry to appearance in DreamLCM. Experiments show that DreamLCM\nachieves state-of-the-art results in both generation quality and training\nefficiency. 
The code is available at https://github.com/1YimingZhong/DreamLCM.\n","authors":["Yiming Zhong","Xiaolin Zhang","Yao Zhao","Yunchao Wei"],"pdf_url":"https://arxiv.org/pdf/2408.02993v3.pdf","comment":"15 pages, 9 figures, ACM MM 2024"},{"id":"http://arxiv.org/abs/2412.04273v1","updated":"2024-12-05T15:55:23Z","published":"2024-12-05T15:55:23Z","title":"Reinforcement Learning from Wild Animal Videos","summary":" We propose to learn legged robot locomotion skills by watching thousands of\nwild animal videos from the internet, such as those featured in nature\ndocumentaries. Indeed, such videos offer a rich and diverse collection of\nplausible motion examples, which could inform how robots should move. To\nachieve this, we introduce Reinforcement Learning from Wild Animal Videos\n(RLWAV), a method to ground these motions into physical robots. We first train\na video classifier on a large-scale animal video dataset to recognize actions\nfrom RGB clips of animals in their natural habitats. We then train a\nmulti-skill policy to control a robot in a physics simulator, using the\nclassification score of a third-person camera capturing videos of the robot's\nmovements as a reward for reinforcement learning. Finally, we directly transfer\nthe learned policy to a real quadruped Solo. 
Remarkably, despite the extreme\ngap in both domain and embodiment between animals in the wild and robots, our\napproach enables the policy to learn diverse skills such as walking, jumping,\nand keeping still, without relying on reference trajectories nor skill-specific\nrewards.\n","authors":["Elliot Chane-Sane","Constant Roux","Olivier Stasse","Nicolas Mansard"],"pdf_url":"https://arxiv.org/pdf/2412.04273v1.pdf","comment":"Project website: https://elliotchanesane31.github.io/RLWAV/"},{"id":"http://arxiv.org/abs/2411.17440v2","updated":"2024-12-05T15:54:00Z","published":"2024-11-26T13:58:24Z","title":"Identity-Preserving Text-to-Video Generation by Frequency Decomposition","summary":" Identity-preserving text-to-video (IPT2V) generation aims to create\nhigh-fidelity videos with consistent human identity. It is an important task in\nvideo generation but remains an open problem for generative models. This paper\npushes the technical frontier of IPT2V in two directions that have not been\nresolved in literature: (1) A tuning-free pipeline without tedious case-by-case\nfinetuning, and (2) A frequency-aware heuristic identity-preserving DiT-based\ncontrol scheme. We propose ConsisID, a tuning-free DiT-based controllable IPT2V\nmodel to keep human identity consistent in the generated video. Inspired by\nprior findings in frequency analysis of diffusion transformers, it employs\nidentity-control signals in the frequency domain, where facial features can be\ndecomposed into low-frequency global features and high-frequency intrinsic\nfeatures. First, from a low-frequency perspective, we introduce a global facial\nextractor, which encodes reference images and facial key points into a latent\nspace, generating features enriched with low-frequency information. These\nfeatures are then integrated into shallow layers of the network to alleviate\ntraining challenges associated with DiT. 
Second, from a high-frequency\nperspective, we design a local facial extractor to capture high-frequency\ndetails and inject them into transformer blocks, enhancing the model's ability\nto preserve fine-grained features. We propose a hierarchical training strategy\nto leverage frequency information for identity preservation, transforming a\nvanilla pre-trained video generation model into an IPT2V model. Extensive\nexperiments demonstrate that our frequency-aware heuristic scheme provides an\noptimal control solution for DiT-based models. Thanks to this scheme, our\nConsisID generates high-quality, identity-preserving videos, making strides\ntowards more effective IPT2V.\n","authors":["Shenghai Yuan","Jinfa Huang","Xianyi He","Yunyuan Ge","Yujun Shi","Liuhan Chen","Jiebo Luo","Li Yuan"],"pdf_url":"https://arxiv.org/pdf/2411.17440v2.pdf","comment":"12 pages, 8 figures, Code: https://github.com/PKU-YuanGroup/ConsisID"},{"id":"http://arxiv.org/abs/2412.04260v1","updated":"2024-12-05T15:39:54Z","published":"2024-12-05T15:39:54Z","title":"Enhancing Whole Slide Image Classification through Supervised\n Contrastive Domain Adaptation","summary":" Domain shift in the field of histopathological imaging is a common phenomenon\ndue to the intra- and inter-hospital variability of staining and digitization\nprotocols. The implementation of robust models, capable of creating generalized\ndomains, represents a need to be solved. In this work, a new domain adaptation\nmethod to deal with the variability between histopathological images from\nmultiple centers is presented. In particular, our method adds a training\nconstraint to the supervised contrastive learning approach to achieve domain\nadaptation and improve inter-class separability. Experiments performed on\ndomain adaptation and classification of whole-slide images of six skin cancer\nsubtypes from two centers demonstrate the method's usefulness. 
The results\nreflect superior performance compared to not using domain adaptation after\nfeature extraction or staining normalization.\n","authors":["Ilán Carretero","Pablo Meseguer","Rocío del Amor","Valery Naranjo"],"pdf_url":"https://arxiv.org/pdf/2412.04260v1.pdf","comment":"Accepted in CASEIB 2024"},{"id":"http://arxiv.org/abs/2408.13877v2","updated":"2024-12-05T15:38:08Z","published":"2024-08-25T15:56:33Z","title":"Camouflaged Object Tracking: A Benchmark","summary":" Visual tracking has seen remarkable advancements, largely driven by the\navailability of large-scale training datasets that have enabled the development\nof highly accurate and robust algorithms. While significant progress has been\nmade in tracking general objects, research on more challenging scenarios, such\nas tracking camouflaged objects, remains limited. Camouflaged objects, which\nblend seamlessly with their surroundings or other objects, present unique\nchallenges for detection and tracking in complex environments. This challenge\nis particularly critical in applications such as military, security,\nagriculture, and marine monitoring, where precise tracking of camouflaged\nobjects is essential. To address this gap, we introduce the Camouflaged Object\nTracking Dataset (COTD), a specialized benchmark designed specifically for\nevaluating camouflaged object tracking methods. The COTD dataset comprises 200\nsequences and approximately 80,000 frames, each annotated with detailed\nbounding boxes. Our evaluation of 20 existing tracking algorithms reveals\nsignificant deficiencies in their performance with camouflaged objects. To\naddress these issues, we propose a novel tracking framework, HiPTrack-MLS,\nwhich demonstrates promising results in improving tracking performance for\ncamouflaged objects. 
COTD and code are avialable at\nhttps://github.com/openat25/HIPTrack-MLS.\n","authors":["Xiaoyu Guo","Pengzhi Zhong","Hao Zhang","Defeng Huang","Huikai Shao","Qijun Zhao","Shuiwang Li"],"pdf_url":"https://arxiv.org/pdf/2408.13877v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17010v2","updated":"2024-12-05T15:33:29Z","published":"2024-03-25T17:59:59Z","title":"Calib3D: Calibrating Model Preferences for Reliable 3D Scene\n Understanding","summary":" Safety-critical 3D scene understanding tasks necessitate not only accurate\nbut also confident predictions from 3D perception models. This study introduces\nCalib3D, a pioneering effort to benchmark and scrutinize the reliability of 3D\nscene understanding models from an uncertainty estimation viewpoint. We\ncomprehensively evaluate 28 state-of-the-art models across 10 diverse 3D\ndatasets, uncovering insightful phenomena that cope with both the aleatoric and\nepistemic uncertainties in 3D scene understanding. We discover that despite\nachieving impressive levels of accuracy, existing models frequently fail to\nprovide reliable uncertainty estimates -- a pitfall that critically undermines\ntheir applicability in safety-sensitive contexts. Through extensive analysis of\nkey factors such as network capacity, LiDAR representations, rasterization\nresolutions, and 3D data augmentation techniques, we correlate these aspects\ndirectly with the model calibration efficacy. Furthermore, we introduce DeptS,\na novel depth-aware scaling approach aimed at enhancing 3D model calibration.\nExtensive experiments across a wide range of configurations validate the\nsuperiority of our method. We hope this work could serve as a cornerstone for\nfostering reliable 3D scene understanding. 
Code and benchmark toolkit are\npublicly available.\n","authors":["Lingdong Kong","Xiang Xu","Jun Cen","Wenwei Zhang","Liang Pan","Kai Chen","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2403.17010v2.pdf","comment":"WACV 2025; 26 pages, 8 figures, 12 tables; Code at\n https://github.com/ldkong1205/Calib3D"},{"id":"http://arxiv.org/abs/2412.04247v1","updated":"2024-12-05T15:27:58Z","published":"2024-12-05T15:27:58Z","title":"3D Part Segmentation via Geometric Aggregation of 2D Visual Features","summary":" Supervised 3D part segmentation models are tailored for a fixed set of\nobjects and parts, limiting their transferability to open-set, real-world\nscenarios. Recent works have explored vision-language models (VLMs) as a\npromising alternative, using multi-view rendering and textual prompting to\nidentify object parts. However, naively applying VLMs in this context\nintroduces several drawbacks, such as the need for meticulous prompt\nengineering, and fails to leverage the 3D geometric structure of objects. To\naddress these limitations, we propose COPS, a COmprehensive model for Parts\nSegmentation that blends the semantics extracted from visual concepts and 3D\ngeometry to effectively identify object parts. COPS renders a point cloud from\nmultiple viewpoints, extracts 2D features, projects them back to 3D, and uses a\nnovel geometric-aware feature aggregation procedure to ensure spatial and\nsemantic consistency. Finally, it clusters points into parts and labels them.\nWe demonstrate that COPS is efficient, scalable, and achieves zero-shot\nstate-of-the-art performance across five datasets, covering synthetic and\nreal-world data, texture-less and coloured objects, as well as rigid and\nnon-rigid shapes. 
The code is available at https://3d-cops.github.io.\n","authors":["Marco Garosi","Riccardo Tedoldi","Davide Boscaini","Massimiliano Mancini","Nicu Sebe","Fabio Poiesi"],"pdf_url":"https://arxiv.org/pdf/2412.04247v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04245v1","updated":"2024-12-05T15:27:39Z","published":"2024-12-05T15:27:39Z","title":"Intriguing Properties of Robust Classification","summary":" Despite extensive research since the community learned about adversarial\nexamples 10 years ago, we still do not know how to train high-accuracy\nclassifiers that are guaranteed to be robust to small perturbations of their\ninputs. Previous works often argued that this might be because no classifier\nexists that is robust and accurate at the same time. However, in computer\nvision this assumption does not match reality where humans are usually accurate\nand robust on most tasks of interest. We offer an alternative explanation and\nshow that in certain settings robust generalization is only possible with\nunrealistically large amounts of data. More precisely we find a setting where a\nrobust classifier exists, it is easy to learn an accurate classifier, yet it\nrequires an exponential amount of data to learn a robust classifier. Based on\nthis theoretical result, we explore how well robust classifiers generalize on\ndatasets such as CIFAR-10. We come to the conclusion that on this datasets, the\nlimitation of current robust models also lies in the generalization, and that\nthey require a lot of data to do well on the test set. We also show that the\nproblem is not in the expressiveness or generalization capabilities of current\narchitectures, and that there are low magnitude features in the data which are\nuseful for non-robust generalization but are not available for robust\nclassifiers.\n","authors":["Bernd Prach","Christoph H. 
Lampert"],"pdf_url":"https://arxiv.org/pdf/2412.04245v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04244v1","updated":"2024-12-05T15:26:51Z","published":"2024-12-05T15:26:51Z","title":"GigaHands: A Massive Annotated Dataset of Bimanual Hand Activities","summary":" Understanding bimanual human hand activities is a critical problem in AI and\nrobotics. We cannot build large models of bimanual activities because existing\ndatasets lack the scale, coverage of diverse hand activities, and detailed\nannotations. We introduce GigaHands, a massive annotated dataset capturing 34\nhours of bimanual hand activities from 56 subjects and 417 objects, totaling\n14k motion clips derived from 183 million frames paired with 84k text\nannotations. Our markerless capture setup and data acquisition protocol enable\nfully automatic 3D hand and object estimation while minimizing the effort\nrequired for text annotation. The scale and diversity of GigaHands enable broad\napplications, including text-driven action synthesis, hand motion captioning,\nand dynamic radiance field reconstruction.\n","authors":["Rao Fu","Dingxi Zhang","Alex Jiang","Wanjia Fu","Austin Funk","Daniel Ritchie","Srinath Sridhar"],"pdf_url":"https://arxiv.org/pdf/2412.04244v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04243v1","updated":"2024-12-05T15:25:51Z","published":"2024-12-05T15:25:51Z","title":"Quantifying the Limits of Segment Anything Model: Analyzing Challenges\n in Segmenting Tree-Like and Low-Contrast Structures","summary":" Segment Anything Model (SAM) has shown impressive performance in interactive\nand zero-shot segmentation across diverse domains, suggesting that they have\nlearned a general concept of \"objects\" from their large-scale training.\nHowever, we observed that SAM struggles with certain types of objects,\nparticularly those featuring dense, tree-like structures and low textural\ncontrast from their surroundings. 
These failure modes are critical for\nunderstanding its limitations in real-world use. In order to systematically\nexamine this issue, we propose metrics to quantify two key object\ncharacteristics: tree-likeness and textural separability. Through extensive\ncontrolled synthetic experiments and testing on real datasets, we demonstrate\nthat SAM's performance is noticeably correlated with these factors. We link\nthese behaviors under the concept of \"textural confusion\", where SAM\nmisinterprets local structure as global texture, leading to over-segmentation,\nor struggles to differentiate objects from similarly textured backgrounds.\nThese findings offer the first quantitative framework to model SAM's\nchallenges, providing valuable insights into its limitations and guiding future\nimprovements for vision foundation models.\n","authors":["Yixin Zhang","Nicholas Konz","Kevin Kramer","Maciej A. Mazurowski"],"pdf_url":"https://arxiv.org/pdf/2412.04243v1.pdf","comment":"Code: https://github.com/mazurowski-lab/SAM-TexturalConfusion-Metrics"},{"id":"http://arxiv.org/abs/2403.06601v2","updated":"2024-12-05T15:19:47Z","published":"2024-03-11T10:48:56Z","title":"Cross-domain and Cross-dimension Learning for Image-to-Graph\n Transformers","summary":" Direct image-to-graph transformation is a challenging task that involves\nsolving object detection and relationship prediction in a single model. Due to\nthis task's complexity, large training datasets are rare in many domains,\nmaking the training of deep-learning methods challenging. This data sparsity\nnecessitates transfer learning strategies akin to the state-of-the-art in\ngeneral computer vision. In this work, we introduce a set of methods enabling\ncross-domain and cross-dimension learning for image-to-graph transformers. 
We\npropose (1) a regularized edge sampling loss to effectively learn object\nrelations in multiple domains with different numbers of edges, (2) a domain\nadaptation framework for image-to-graph transformers aligning image- and\ngraph-level features from different domains, and (3) a projection function that\nallows using 2D data for training 3D transformers. We demonstrate our method's\nutility in cross-domain and cross-dimension experiments, where we utilize\nlabeled data from 2D road networks for simultaneous learning in vastly\ndifferent target domains. Our method consistently outperforms standard transfer\nlearning and self-supervised pretraining on challenging benchmarks, such as\nretinal or whole-brain vessel graph extraction.\n","authors":["Alexander H. Berger","Laurin Lux","Suprosanna Shit","Ivan Ezhov","Georgios Kaissis","Martin J. Menten","Daniel Rueckert","Johannes C. Paetzold"],"pdf_url":"https://arxiv.org/pdf/2403.06601v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04237v1","updated":"2024-12-05T15:17:06Z","published":"2024-12-05T15:17:06Z","title":"VASCAR: Content-Aware Layout Generation via Visual-Aware Self-Correction","summary":" Large language models (LLMs) have proven effective for layout generation due\nto their ability to produce structure-description languages, such as HTML or\nJSON, even without access to visual information. Recently, LLM providers have\nevolved these models into large vision-language models (LVLM), which shows\nprominent multi-modal understanding capabilities. Then, how can we leverage\nthis multi-modal power for layout generation? To answer this, we propose\nVisual-Aware Self-Correction LAyout GeneRation (VASCAR) for LVLM-based\ncontent-aware layout generation. In our method, LVLMs iteratively refine their\noutputs with reference to rendered layout images, which are visualized as\ncolored bounding boxes on poster backgrounds. In experiments, we demonstrate\nthat our method combined with the Gemini. 
Without any additional training,\nVASCAR achieves state-of-the-art (SOTA) layout generation quality outperforming\nboth existing layout-specific generative models and other LLM-based methods.\n","authors":["Jiahao Zhang","Ryota Yoshihashi","Shunsuke Kitada","Atsuki Osanai","Yuta Nakashima"],"pdf_url":"https://arxiv.org/pdf/2412.04237v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04234v1","updated":"2024-12-05T15:10:13Z","published":"2024-12-05T15:10:13Z","title":"DEIM: DETR with Improved Matching for Fast Convergence","summary":" We introduce DEIM, an innovative and efficient training framework designed to\naccelerate convergence in real-time object detection with Transformer-based\narchitectures (DETR). To mitigate the sparse supervision inherent in one-to-one\n(O2O) matching in DETR models, DEIM employs a Dense O2O matching strategy. This\napproach increases the number of positive samples per image by incorporating\nadditional targets, using standard data augmentation techniques. While Dense\nO2O matching speeds up convergence, it also introduces numerous low-quality\nmatches that could affect performance. To address this, we propose the\nMatchability-Aware Loss (MAL), a novel loss function that optimizes matches\nacross various quality levels, enhancing the effectiveness of Dense O2O.\nExtensive experiments on the COCO dataset validate the efficacy of DEIM. When\nintegrated with RT-DETR and D-FINE, it consistently boosts performance while\nreducing training time by 50%. Notably, paired with RT-DETRv2, DEIM achieves\n53.2% AP in a single day of training on an NVIDIA 4090 GPU. Additionally,\nDEIM-trained real-time models outperform leading real-time object detectors,\nwith DEIM-D-FINE-L and DEIM-D-FINE-X achieving 54.7% and 56.5% AP at 124 and 78\nFPS on an NVIDIA T4 GPU, respectively, without the need for additional data. We\nbelieve DEIM sets a new baseline for advancements in real-time object\ndetection. 
Our code and pre-trained models are available at\nhttps://github.com/ShihuaHuang95/DEIM.\n","authors":["Shihua Huang","Zhichao Lu","Xiaodong Cun","Yongjun Yu","Xiao Zhou","Xi Shen"],"pdf_url":"https://arxiv.org/pdf/2412.04234v1.pdf","comment":"Exceeding all existing real-time object detectors, including YOLOv11\n and D-FINE"},{"id":"http://arxiv.org/abs/2312.09256v2","updated":"2024-12-05T15:05:55Z","published":"2023-12-14T18:59:59Z","title":"LIME: Localized Image Editing via Attention Regularization in Diffusion\n Models","summary":" Diffusion models (DMs) have gained prominence due to their ability to\ngenerate high-quality varied images with recent advancements in text-to-image\ngeneration. The research focus is now shifting towards the controllability of\nDMs. A significant challenge within this domain is localized editing, where\nspecific areas of an image are modified without affecting the rest of the\ncontent. This paper introduces LIME for localized image editing in diffusion\nmodels. LIME does not require user-specified regions of interest (RoI) or\nadditional text input, but rather employs features from pre-trained methods and\na straightforward clustering method to obtain precise editing mask. Then, by\nleveraging cross-attention maps, it refines these segments for finding regions\nto obtain localized edits. Finally, we propose a novel cross-attention\nregularization technique that penalizes unrelated cross-attention scores in the\nRoI during the denoising steps, ensuring localized edits. Our approach, without\nre-training, fine-tuning and additional user inputs, consistently improves the\nperformance of existing methods in various editing benchmarks. 
The project page\ncan be found at https://enisimsar.github.io/LIME/.\n","authors":["Enis Simsar","Alessio Tonioni","Yongqin Xian","Thomas Hofmann","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2312.09256v2.pdf","comment":"WACV'25"},{"id":"http://arxiv.org/abs/2412.04227v1","updated":"2024-12-05T15:05:25Z","published":"2024-12-05T15:05:25Z","title":"Foundations of the Theory of Performance-Based Ranking","summary":" Ranking entities such as algorithms, devices, methods, or models based on\ntheir performances, while accounting for application-specific preferences, is a\nchallenge. To address this challenge, we establish the foundations of a\nuniversal theory for performance-based ranking. First, we introduce a rigorous\nframework built on top of both the probability and order theories. Our new\nframework encompasses the elements necessary to (1) manipulate performances as\nmathematical objects, (2) express which performances are worse than or\nequivalent to others, (3) model tasks through a variable called satisfaction,\n(4) consider properties of the evaluation, (5) define scores, and (6) specify\napplication-specific preferences through a variable called importance. On top\nof this framework, we propose the first axiomatic definition of performance\norderings and performance-based rankings. Then, we introduce a universal\nparametric family of scores, called ranking scores, that can be used to\nestablish rankings satisfying our axioms, while considering\napplication-specific preferences. Finally, we show, in the case of two-class\nclassification, that the family of ranking scores encompasses well-known\nperformance scores, including the accuracy, the true positive rate (recall,\nsensitivity), the true negative rate (specificity), the positive predictive\nvalue (precision), and F1. However, we also show that some other scores\ncommonly used to compare classifiers are unsuitable to derive performance\norderings satisfying the axioms. 
Therefore, this paper provides the computer\nvision and machine learning communities with a rigorous framework for\nevaluating and ranking entities.\n","authors":["Sébastien Piérard","Anaïs Halin","Anthony Cioppa","Adrien Deliège","Marc Van Droogenbroeck"],"pdf_url":"https://arxiv.org/pdf/2412.04227v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04220v1","updated":"2024-12-05T14:54:31Z","published":"2024-12-05T14:54:31Z","title":"Customize Segment Anything Model for Multi-Modal Semantic Segmentation\n with Mixture of LoRA Experts","summary":" The recent Segment Anything Model (SAM) represents a significant breakthrough\nin scaling segmentation models, delivering strong performance across various\ndownstream applications in the RGB modality. However, directly applying SAM to\nemerging visual modalities, such as depth and event data results in suboptimal\nperformance in multi-modal segmentation tasks. In this paper, we make the first\nattempt to adapt SAM for multi-modal semantic segmentation by proposing a\nMixture of Low-Rank Adaptation Experts (MoE-LoRA) tailored for different input\nvisual modalities. By training only the MoE-LoRA layers while keeping SAM's\nweights frozen, SAM's strong generalization and segmentation capabilities can\nbe preserved for downstream tasks. Specifically, to address cross-modal\ninconsistencies, we propose a novel MoE routing strategy that adaptively\ngenerates weighted features across modalities, enhancing multi-modal feature\nintegration. Additionally, we incorporate multi-scale feature extraction and\nfusion by adapting SAM's segmentation head and introducing an auxiliary\nsegmentation head to combine multi-scale features for improved segmentation\nperformance effectively. Extensive experiments were conducted on three\nmulti-modal benchmarks: DELIVER, MUSES, and MCubeS. The results consistently\ndemonstrate that the proposed method significantly outperforms state-of-the-art\napproaches across diverse scenarios. 
Notably, under the particularly\nchallenging condition of missing modalities, our approach exhibits a\nsubstantial performance gain, achieving an improvement of 32.15% compared to\nexisting methods.\n","authors":["Chenyang Zhu","Bin Xiao","Lin Shi","Shoukun Xu","Xu Zheng"],"pdf_url":"https://arxiv.org/pdf/2412.04220v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10886v2","updated":"2024-12-05T14:51:55Z","published":"2024-11-16T20:59:01Z","title":"MetricGold: Leveraging Text-To-Image Latent Diffusion Models for Metric\n Depth Estimation","summary":" Recovering metric depth from a single image remains a fundamental challenge\nin computer vision, requiring both scene understanding and accurate scaling.\nWhile deep learning has advanced monocular depth estimation, current models\noften struggle with unfamiliar scenes and layouts, particularly in zero-shot\nscenarios and when predicting scale-ergodic metric depth. We present\nMetricGold, a novel approach that harnesses generative diffusion model's rich\npriors to improve metric depth estimation. Building upon recent advances in\nMariGold, DDVM and Depth Anything V2 respectively, our method combines latent\ndiffusion, log-scaled metric depth representation, and synthetic data training.\nMetricGold achieves efficient training on a single RTX 3090 within two days\nusing photo-realistic synthetic data from HyperSIM, VirtualKitti, and\nTartanAir. 
Our experiments demonstrate robust generalization across diverse\ndatasets, producing sharper and higher quality metric depth estimates compared\nto existing approaches.\n","authors":["Ansh Shah","K Madhava Krishna"],"pdf_url":"https://arxiv.org/pdf/2411.10886v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04217v1","updated":"2024-12-05T14:50:11Z","published":"2024-12-05T14:50:11Z","title":"Aligned Music Notation and Lyrics Transcription","summary":" The digitization of vocal music scores presents unique challenges that go\nbeyond traditional Optical Music Recognition (OMR) and Optical Character\nRecognition (OCR), as it necessitates preserving the critical alignment between\nmusic notation and lyrics. This alignment is essential for proper\ninterpretation and processing in practical applications. This paper introduces\nand formalizes, for the first time, the Aligned Music Notation and Lyrics\nTranscription (AMNLT) challenge, which addresses the complete transcription of\nvocal scores by jointly considering music symbols, lyrics, and their\nsynchronization. We analyze different approaches to address this challenge,\nranging from traditional divide-and-conquer methods that handle music and\nlyrics separately, to novel end-to-end solutions including direct\ntranscription, unfolding mechanisms, and language modeling. To evaluate these\nmethods, we introduce four datasets of Gregorian chants, comprising both real\nand synthetic sources, along with custom metrics specifically designed to\nassess both transcription and alignment accuracy. Our experimental results\ndemonstrate that end-to-end approaches generally outperform heuristic methods\nin the alignment challenge, with language models showing particular promise in\nscenarios where sufficient training data is available. 
This work establishes\nthe first comprehensive framework for AMNLT, providing both theoretical\nfoundations and practical solutions for preserving and digitizing vocal music\nheritage.\n","authors":["Eliseo Fuentes-Martínez","Antonio Ríos-Vila","Juan C. Martinez-Sevilla","David Rizo","Jorge Calvo-Zaragoza"],"pdf_url":"https://arxiv.org/pdf/2412.04217v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04204v1","updated":"2024-12-05T14:40:41Z","published":"2024-12-05T14:40:41Z","title":"PANGAEA: A Global and Inclusive Benchmark for Geospatial Foundation\n Models","summary":" Geospatial Foundation Models (GFMs) have emerged as powerful tools for\nextracting representations from Earth observation data, but their evaluation\nremains inconsistent and narrow. Existing works often evaluate on suboptimal\ndownstream datasets and tasks, that are often too easy or too narrow, limiting\nthe usefulness of the evaluations to assess the real-world applicability of\nGFMs. Additionally, there is a distinct lack of diversity in current evaluation\nprotocols, which fail to account for the multiplicity of image resolutions,\nsensor types, and temporalities, which further complicates the assessment of\nGFM performance. In particular, most existing benchmarks are geographically\nbiased towards North America and Europe, questioning the global applicability\nof GFMs. To overcome these challenges, we introduce PANGAEA, a standardized\nevaluation protocol that covers a diverse set of datasets, tasks, resolutions,\nsensor modalities, and temporalities. It establishes a robust and widely\napplicable benchmark for GFMs. We evaluate the most popular GFMs openly\navailable on this benchmark and analyze their performance across several\ndomains. In particular, we compare these models to supervised baselines (e.g.\nUNet and vanilla ViT), and assess their effectiveness when faced with limited\nlabeled data. 
Our findings highlight the limitations of GFMs, under different\nscenarios, showing that they do not consistently outperform supervised models.\nPANGAEA is designed to be highly extensible, allowing for the seamless\ninclusion of new datasets, models, and tasks in future research. By releasing\nthe evaluation code and benchmark, we aim to enable other researchers to\nreplicate our experiments and build upon our work, fostering a more principled\nevaluation protocol for large pre-trained geospatial models. The code is\navailable at https://github.com/VMarsocci/pangaea-bench.\n","authors":["Valerio Marsocci","Yuru Jia","Georges Le Bellier","David Kerekes","Liang Zeng","Sebastian Hafner","Sebastian Gerard","Eric Brune","Ritu Yadav","Ali Shibli","Heng Fang","Yifang Ban","Maarten Vergauwen","Nicolas Audebert","Andrea Nascetti"],"pdf_url":"https://arxiv.org/pdf/2412.04204v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04201v1","updated":"2024-12-05T14:39:29Z","published":"2024-12-05T14:39:29Z","title":"Hipandas: Hyperspectral Image Joint Denoising and Super-Resolution by\n Image Fusion with the Panchromatic Image","summary":" Hyperspectral images (HSIs) are frequently noisy and of low resolution due to\nthe constraints of imaging devices. Recently launched satellites can\nconcurrently acquire HSIs and panchromatic (PAN) images, enabling the\nrestoration of HSIs to generate clean and high-resolution imagery through\nfusing PAN images for denoising and super-resolution. However, previous studies\ntreated these two tasks as independent processes, resulting in accumulated\nerrors. This paper introduces \\textbf{H}yperspectral \\textbf{I}mage Joint\n\\textbf{Pand}enoising \\textbf{a}nd Pan\\textbf{s}harpening (Hipandas), a novel\nlearning paradigm that reconstructs HRHS images from noisy low-resolution HSIs\n(LRHS) and high-resolution PAN images. 
The proposed zero-shot Hipandas\nframework consists of a guided denoising network, a guided super-resolution\nnetwork, and a PAN reconstruction network, utilizing an HSI low-rank prior and\na newly introduced detail-oriented low-rank prior. The interconnection of these\nnetworks complicates the training process, necessitating a two-stage training\nstrategy to ensure effective training. Experimental results on both simulated\nand real-world datasets indicate that the proposed method surpasses\nstate-of-the-art algorithms, yielding more accurate and visually pleasing HRHS\nimages.\n","authors":["Shuang Xu","Zixiang Zhao","Haowen Bai","Chang Yu","Jiangjun Peng","Xiangyong Cao","Deyu Meng"],"pdf_url":"https://arxiv.org/pdf/2412.04201v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15996v3","updated":"2024-12-05T14:38:12Z","published":"2024-08-28T17:59:05Z","title":"Spatio-Temporal Context Prompting for Zero-Shot Action Detection","summary":" Spatio-temporal action detection encompasses the tasks of localizing and\nclassifying individual actions within a video. Recent works aim to enhance this\nprocess by incorporating interaction modeling, which captures the relationship\nbetween people and their surrounding context. However, these approaches have\nprimarily focused on fully-supervised learning, and the current limitation lies\nin the lack of generalization capability to recognize unseen action categories.\nIn this paper, we aim to adapt the pretrained image-language models to detect\nunseen actions. To this end, we propose a method which can effectively leverage\nthe rich knowledge of visual-language models to perform Person-Context\nInteraction. Meanwhile, our Context Prompting module will utilize contextual\ninformation to prompt labels, thereby enhancing the generation of more\nrepresentative text features. 
Moreover, to address the challenge of recognizing\ndistinct actions by multiple people at the same timestamp, we design the\nInterest Token Spotting mechanism which employs pretrained visual knowledge to\nfind each person's interest context tokens, and then these tokens will be used\nfor prompting to generate text features tailored to each individual. To\nevaluate the ability to detect unseen actions, we propose a comprehensive\nbenchmark on J-HMDB, UCF101-24, and AVA datasets. The experiments show that our\nmethod achieves superior results compared to previous approaches and can be\nfurther extended to multi-action videos, bringing it closer to real-world\napplications. The code and data can be found in\nhttps://webber2933.github.io/ST-CLIP-project-page.\n","authors":["Wei-Jhe Huang","Min-Hung Chen","Shang-Hong Lai"],"pdf_url":"https://arxiv.org/pdf/2408.15996v3.pdf","comment":"Accepted by WACV2025. Project page:\n https://webber2933.github.io/ST-CLIP-project-page"},{"id":"http://arxiv.org/abs/2412.04189v1","updated":"2024-12-05T14:29:10Z","published":"2024-12-05T14:29:10Z","title":"Instructional Video Generation","summary":" Despite the recent strides in video generation, state-of-the-art methods\nstill struggle with elements of visual detail. One particularly challenging\ncase is the class of egocentric instructional videos in which the intricate\nmotion of the hand coupled with a mostly stable and non-distracting environment\nis necessary to convey the appropriate visual action instruction. To address\nthese challenges, we introduce a new method for instructional video generation.\nOur diffusion-based method incorporates two distinct innovations. First, we\npropose an automatic method to generate the expected region of motion, guided\nby both the visual context and the action text. Second, we introduce a critical\nhand structure loss to guide the diffusion model to focus on smooth and\nconsistent hand poses. 
We evaluate our method on augmented instructional\ndatasets based on EpicKitchens and Ego4D, demonstrating significant\nimprovements over state-of-the-art methods in terms of instructional clarity,\nespecially of the hand motion in the target region, across diverse environments\nand actions.Video results can be found on the project webpage:\nhttps://excitedbutter.github.io/Instructional-Video-Generation/\n","authors":["Yayuan Li","Zhi Cao","Jason J. Corso"],"pdf_url":"https://arxiv.org/pdf/2412.04189v1.pdf","comment":"14 pages, 5 figures and 4 tables"},{"id":"http://arxiv.org/abs/2409.17146v2","updated":"2024-12-05T14:28:40Z","published":"2024-09-25T17:59:51Z","title":"Molmo and PixMo: Open Weights and Open Data for State-of-the-Art\n Vision-Language Models","summary":" Today's most advanced vision-language models (VLMs) remain proprietary. The\nstrongest open-weight models rely heavily on synthetic data from proprietary\nVLMs to achieve good performance, effectively distilling these closed VLMs into\nopen ones. As a result, the community has been missing foundational knowledge\nabout how to build performant VLMs from scratch. We present Molmo, a new family\nof VLMs that are state-of-the-art in their class of openness. Our key\ncontribution is a collection of new datasets called PixMo, including a dataset\nof highly detailed image captions for pre-training, a free-form image Q&A\ndataset for fine-tuning, and an innovative 2D pointing dataset, all collected\nwithout the use of external VLMs. The success of our approach relies on careful\nmodeling choices, a well-tuned training pipeline, and, most critically, the\nquality of our newly collected datasets. Our best-in-class 72B model not only\noutperforms others in the class of open weight and data models, but also\noutperforms larger proprietary models including Claude 3.5 Sonnet, and Gemini\n1.5 Pro and Flash, second only to GPT-4o based on both academic benchmarks and\non a large human evaluation. 
Our model weights, new datasets, and source code\nare available at https://molmo.allenai.org/blog.\n","authors":["Matt Deitke","Christopher Clark","Sangho Lee","Rohun Tripathi","Yue Yang","Jae Sung Park","Mohammadreza Salehi","Niklas Muennighoff","Kyle Lo","Luca Soldaini","Jiasen Lu","Taira Anderson","Erin Bransom","Kiana Ehsani","Huong Ngo","YenSung Chen","Ajay Patel","Mark Yatskar","Chris Callison-Burch","Andrew Head","Rose Hendrix","Favyen Bastani","Eli VanderBilt","Nathan Lambert","Yvonne Chou","Arnavi Chheda","Jenna Sparks","Sam Skjonsberg","Michael Schmitz","Aaron Sarnat","Byron Bischoff","Pete Walsh","Chris Newell","Piper Wolters","Tanmay Gupta","Kuo-Hao Zeng","Jon Borchardt","Dirk Groeneveld","Crystal Nam","Sophie Lebrecht","Caitlin Wittlif","Carissa Schoenick","Oscar Michel","Ranjay Krishna","Luca Weihs","Noah A. Smith","Hannaneh Hajishirzi","Ross Girshick","Ali Farhadi","Aniruddha Kembhavi"],"pdf_url":"https://arxiv.org/pdf/2409.17146v2.pdf","comment":"Updated with ablations and more technical details"},{"id":"http://arxiv.org/abs/2406.01551v2","updated":"2024-12-05T14:25:56Z","published":"2024-06-03T17:32:23Z","title":"ELSA: Evaluating Localization of Social Activities in Urban Streets\n using Open-Vocabulary Detection","summary":" Existing Open Vocabulary Detection (OVD) models exhibit a number of\nchallenges. They often struggle with semantic consistency across diverse\ninputs, and are often sensitive to slight variations in input phrasing, leading\nto inconsistent performance. The calibration of their predictive confidence,\nespecially in complex multi-label scenarios, remains suboptimal, frequently\nresulting in overconfident predictions that do not accurately reflect their\ncontext understanding. To understand these limitations, multi-label detection\nbenchmarks are needed. A particularly challenging domain for such benchmarking\nis social activities. 
Due to the lack of multi-label benchmarks for social\ninteractions, in this work we present ELSA: Evaluating Localization of Social\nActivities. ELSA draws on theoretical frameworks in urban sociology and design\nand uses in-the-wild street-level imagery, where the size of groups and the\ntypes of activities vary significantly. ELSA includes more than 900 manually\nannotated images with more than 4,300 multi-labeled bounding boxes for\nindividual and group activities. We introduce a novel confidence score\ncomputation method NLSE and a novel Dynamic Box Aggregation (DBA) algorithm to\nassess semantic consistency in overlapping predictions. We report our results\non the widely-used SOTA models Grounding DINO, Detic, OWL, and MDETR. Our\nevaluation protocol considers semantic stability and localization accuracy and\nfurther exposes the limitations of existing approaches.\n","authors":["Maryam Hosseini","Marco Cipriano","Sedigheh Eslami","Daniel Hodczak","Liu Liu","Andres Sevtsuk","Gerard de Melo"],"pdf_url":"https://arxiv.org/pdf/2406.01551v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.00256v2","updated":"2024-12-05T14:24:39Z","published":"2024-11-29T21:00:08Z","title":"Excretion Detection in Pigsties Using Convolutional and Transformerbased\n Deep Neural Networks","summary":" Animal excretions in form of urine puddles and feces are a significant source\nof emissions in livestock farming. Automated detection of soiled floor in barns\ncan contribute to improved management processes but also the derived\ninformation can be used to model emission dynamics. Previous research\napproaches to determine the puddle area require manual detection of the puddle\nin the barn. While humans can detect animal excretions on thermal images of a\nlivestock barn, automated approaches using thresholds fail due to other objects\nof the same temperature, such as the animals themselves. 
In addition, various\nparameters such as the type of housing, animal species, age, sex, weather and\nunknown factors can influence the type and shape of excretions. Due to this\nheterogeneity, a method for automated detection of excretions must therefore be\nnot only be accurate but also robust to varying conditions. These requirements\ncan be met by using contemporary deep learning models from the field of\nartificial intelligence. This work is the first to investigate the suitability\nof different deep learning models for the detection of excretions in pigsties,\nthereby comparing established convolutional architectures with recent\ntransformer-based approaches. The detection models Faster R-CNN, YOLOv8, DETR\nand DAB-DETR are compared and statistically assessed on two created training\ndatasets representing two pig houses. We apply a method derived from nested\ncross-validation and report on the results in terms of eight common detection\nmetrics. Our work demonstrates that all investigated deep learning models are\ngenerally suitable for reliably detecting excretions with an average precision\nof over 90%. 
The models also show robustness on out of distribution data that\npossesses differences from the conditions in the training data, however, with\nexpected slight decreases in the overall detection performance.\n","authors":["Simon Mielke","Anthony Stein"],"pdf_url":"https://arxiv.org/pdf/2412.00256v2.pdf","comment":"Keywords: Artificial Intelligence, Objected detection, Pig, Urine\n puddle, Thermal IR data, CNN vs Transformer, Precision Livestock Farming;\n Stats: 54 pages, 13 figures, 1 graphical abstract"},{"id":"http://arxiv.org/abs/2312.02772v3","updated":"2024-12-05T14:19:26Z","published":"2023-12-05T14:01:43Z","title":"FG-MDM: Towards Zero-Shot Human Motion Generation via ChatGPT-Refined\n Descriptions","summary":" Recently, significant progress has been made in text-based motion generation,\nenabling the generation of diverse and high-quality human motions that conform\nto textual descriptions. However, generating motions beyond the distribution of\noriginal datasets remains challenging, i.e., zero-shot generation. By adopting\na divide-and-conquer strategy, we propose a new framework named Fine-Grained\nHuman Motion Diffusion Model (FG-MDM) for zero-shot human motion generation.\nSpecifically, we first parse previous vague textual annotations into\nfine-grained descriptions of different body parts by leveraging a large\nlanguage model. We then use these fine-grained descriptions to guide a\ntransformer-based diffusion model, which further adopts a design of part\ntokens. FG-MDM can generate human motions beyond the scope of original datasets\nowing to descriptions that are closer to motion essence. Our experimental\nresults demonstrate the superiority of FG-MDM over previous methods in\nzero-shot settings. 
We will release our fine-grained textual annotations for\nHumanML3D and KIT.\n","authors":["Xu Shi","Wei Yao","Chuanchen Luo","Junran Peng","Hongwen Zhang","Yunlian Sun"],"pdf_url":"https://arxiv.org/pdf/2312.02772v3.pdf","comment":"Project Page: https://sx0207.github.io/fg-mdm/"},{"id":"http://arxiv.org/abs/2412.03079v2","updated":"2024-12-05T14:16:07Z","published":"2024-12-04T07:09:59Z","title":"Align3R: Aligned Monocular Depth Estimation for Dynamic Videos","summary":" Recent developments in monocular depth estimation methods enable high-quality\ndepth estimation of single-view images but fail to estimate consistent video\ndepth across different frames. Recent works address this problem by applying a\nvideo diffusion model to generate video depth conditioned on the input video,\nwhich is training-expensive and can only produce scale-invariant depth values\nwithout camera poses. In this paper, we propose a novel video-depth estimation\nmethod called Align3R to estimate temporal consistent depth maps for a dynamic\nvideo. Our key idea is to utilize the recent DUSt3R model to align estimated\nmonocular depth maps of different timesteps. First, we fine-tune the DUSt3R\nmodel with additional estimated monocular depth as inputs for the dynamic\nscenes. Then, we apply optimization to reconstruct both depth maps and camera\nposes. 
Extensive experiments demonstrate that Align3R estimates consistent\nvideo depth and camera poses for a monocular video with superior performance\nthan baseline methods.\n","authors":["Jiahao Lu","Tianyu Huang","Peng Li","Zhiyang Dou","Cheng Lin","Zhiming Cui","Zhen Dong","Sai-Kit Yeung","Wenping Wang","Yuan Liu"],"pdf_url":"https://arxiv.org/pdf/2412.03079v2.pdf","comment":"Project Page: https://igl-hkust.github.io/Align3R.github.io/"},{"id":"http://arxiv.org/abs/2408.09647v2","updated":"2024-12-05T14:06:14Z","published":"2024-08-19T02:14:25Z","title":"C2P-CLIP: Injecting Category Common Prompt in CLIP to Enhance\n Generalization in Deepfake Detection","summary":" This work focuses on AIGC detection to develop universal detectors capable of\nidentifying various types of forgery images. Recent studies have found large\npre-trained models, such as CLIP, are effective for generalizable deepfake\ndetection along with linear classifiers. However, two critical issues remain\nunresolved: 1) understanding why CLIP features are effective on deepfake\ndetection through a linear classifier; and 2) exploring the detection potential\nof CLIP. In this study, we delve into the underlying mechanisms of CLIP's\ndetection capabilities by decoding its detection features into text and\nperforming word frequency analysis. Our finding indicates that CLIP detects\ndeepfakes by recognizing similar concepts (Fig. \\ref{fig:fig1} a). Building on\nthis insight, we introduce Category Common Prompt CLIP, called C2P-CLIP, which\nintegrates the category common prompt into the text encoder to inject\ncategory-related concepts into the image encoder, thereby enhancing detection\nperformance (Fig. \\ref{fig:fig1} b). Our method achieves a 12.41\\% improvement\nin detection accuracy compared to the original CLIP, without introducing\nadditional parameters during testing. 
Comprehensive experiments conducted on\ntwo widely-used datasets, encompassing 20 generation models, validate the\nefficacy of the proposed method, demonstrating state-of-the-art performance.\nThe code is available at\n\\url{https://github.com/chuangchuangtan/C2P-CLIP-DeepfakeDetection}\n","authors":["Chuangchuang Tan","Renshuai Tao","Huan Liu","Guanghua Gu","Baoyuan Wu","Yao Zhao","Yunchao Wei"],"pdf_url":"https://arxiv.org/pdf/2408.09647v2.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2412.02210v2","updated":"2024-12-05T13:51:57Z","published":"2024-12-03T07:03:25Z","title":"CC-OCR: A Comprehensive and Challenging OCR Benchmark for Evaluating\n Large Multimodal Models in Literacy","summary":" Large Multimodal Models (LMMs) have demonstrated impressive performance on\nrecognizing document images with natural language instructions. However, it\nremains unclear to what extent capabilities in literacy with rich structure and\nfine-grained visual challenges. The current landscape lacks a comprehensive\nbenchmark to effectively measure the literate capabilities of LMMs. Existing\nbenchmarks are often limited by narrow scenarios and specified tasks. To this\nend, we introduce CC-OCR, a comprehensive benchmark that possess a diverse\nrange of scenarios, tasks, and challenges. CC-OCR comprises four OCR-centric\ntracks: multi-scene text reading, multilingual text reading, document parsing,\nand key information extraction. It includes 39 subsets with 7,058 full\nannotated images, of which 41% are sourced from real applications, being\nreleased for the first time. Furthermore, we evaluate nine prominent LMMs and\nreveal both the strengths and weaknesses of these models, particularly in text\ngrounding, multi-orientation, and hallucination of repetition. 
CC-OCR aims to\ncomprehensively evaluate the capabilities of LMMs on OCR-centered tasks,\ndriving advancement in LMMs.\n","authors":["Zhibo Yang","Jun Tang","Zhaohai Li","Pengfei Wang","Jianqiang Wan","Humen Zhong","Xuejing Liu","Mingkun Yang","Peng Wang","Yuliang Liu","LianWen Jin","Xiang Bai","Shuai Bai","Junyang Lin"],"pdf_url":"https://arxiv.org/pdf/2412.02210v2.pdf","comment":"23 pages, 14 figures; The code will released at\n https://github.com/QwenLM/CC-OCR"},{"id":"http://arxiv.org/abs/2406.18809v2","updated":"2024-12-05T13:47:16Z","published":"2024-06-27T00:54:11Z","title":"Divide, Ensemble and Conquer: The Last Mile on Unsupervised Domain\n Adaptation for Semantic Segmentation","summary":" The last mile of unsupervised domain adaptation (UDA) for semantic\nsegmentation is the challenge of solving the syn-to-real domain gap. Recent UDA\nmethods have progressed significantly, yet they often rely on strategies\ncustomized for synthetic single-source datasets (e.g., GTA5), which limits\ntheir generalisation to multi-source datasets. Conversely, synthetic\nmulti-source datasets hold promise for advancing the last mile of UDA but\nremain underutilized in current research. Thus, we propose DEC, a flexible UDA\nframework for multi-source datasets. Following a divide-and-conquer strategy,\nDEC simplifies the task by categorizing semantic classes, training models for\neach category, and fusing their outputs by an ensemble model trained\nexclusively on synthetic datasets to obtain the final segmentation mask. DEC\ncan integrate with existing UDA methods, achieving state-of-the-art performance\non Cityscapes, BDD100K, and Mapillary Vistas, significantly narrowing the\nsyn-to-real domain gap.\n","authors":["Tao Lian","Jose L. Gómez","Antonio M. 
López"],"pdf_url":"https://arxiv.org/pdf/2406.18809v2.pdf","comment":"Accepted by TIV"},{"id":"http://arxiv.org/abs/2412.01798v2","updated":"2024-12-05T13:39:06Z","published":"2024-12-02T18:46:12Z","title":"SEAL: Semantic Attention Learning for Long Video Representation","summary":" Long video understanding presents challenges due to the inherent high\ncomputational complexity and redundant temporal information. An effective\nrepresentation for long videos must process such redundancy efficiently while\npreserving essential contents for downstream tasks. This paper introduces\nSEmantic Attention Learning (SEAL), a novel unified representation for long\nvideos. To reduce computational complexity, long videos are decomposed into\nthree distinct types of semantic entities: scenes, objects, and actions,\nallowing models to operate on a handful of entities rather than a large number\nof frames or pixels. To further address redundancy, we propose an attention\nlearning module that balances token relevance with diversity formulated as a\nsubset selection optimization problem. Our representation is versatile,\nenabling applications across various long video understanding tasks. Extensive\nexperiments show that SEAL significantly outperforms state-of-the-art methods\nin video question answering and temporal grounding tasks and benchmarks\nincluding LVBench, MovieChat-1K, and Ego4D.\n","authors":["Lan Wang","Yujia Chen","Du Tran","Vishnu Naresh Boddeti","Wen-Sheng Chu"],"pdf_url":"https://arxiv.org/pdf/2412.01798v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.17449v3","updated":"2024-12-05T13:37:51Z","published":"2024-07-24T17:30:21Z","title":"Looking at Model Debiasing through the Lens of Anomaly Detection","summary":" It is widely recognized that deep neural networks are sensitive to bias in\nthe data. 
This means that during training these models are likely to learn\nspurious correlations between data and labels, resulting in limited\ngeneralization abilities and low performance. In this context, model debiasing\napproaches can be devised aiming at reducing the model's dependency on such\nunwanted correlations, either leveraging the knowledge of bias information or\nnot. In this work, we focus on the latter and more realistic scenario, showing\nthe importance of accurately predicting the bias-conflicting and bias-aligned\nsamples to obtain compelling performance in bias mitigation. On this ground, we\npropose to conceive the problem of model bias from an out-of-distribution\nperspective, introducing a new bias identification method based on anomaly\ndetection. We claim that when data is mostly biased, bias-conflicting samples\ncan be regarded as outliers with respect to the bias-aligned distribution in\nthe feature space of a biased model, thus allowing for precisely detecting them\nwith an anomaly detection method. Coupling the proposed bias identification\napproach with bias-conflicting data upsampling and augmentation in a two-step\nstrategy, we reach state-of-the-art performance on synthetic and real benchmark\ndatasets. Ultimately, our proposed approach shows that the data bias issue does\nnot necessarily require complex debiasing methods, given that an accurate bias\nidentification procedure is defined. 
Source code is available at\nhttps://github.com/Malga-Vision/MoDAD\n","authors":["Vito Paolo Pastore","Massimiliano Ciranni","Davide Marinelli","Francesca Odone","Vittorio Murino"],"pdf_url":"https://arxiv.org/pdf/2407.17449v3.pdf","comment":"13 pages, 8 figures; Accepted at IEEE/CVF Winter Conference on\n Applications of Computer Vision (WACV) 2025"},{"id":"http://arxiv.org/abs/2411.11706v2","updated":"2024-12-05T13:27:22Z","published":"2024-11-18T16:33:52Z","title":"MC-LLaVA: Multi-Concept Personalized Vision-Language Model","summary":" Current vision-language models (VLMs) show exceptional abilities across\ndiverse tasks including visual question answering. To enhance user experience\nin practical applications, recent studies investigate VLM personalization to\nunderstand user-provided concepts. However, existing studies mainly focus on\nsingle-concept personalization, neglecting the existence and interplay of\nmultiple concepts, which limits the real-world applicability of personalized\nVLMs. In this paper, we propose the first multi-concept personalization method\nnamed MC-LLaVA along with a high-quality multi-concept personalization dataset.\nSpecifically, MC-LLaVA uses a joint training strategy incorporating multiple\nconcepts in a single training step, allowing VLMs to perform accurately in\nmulti-concept personalization. To reduce the cost of joint training, MC-LLaVA\nleverages visual token information for concept token initialization, yielding\nimproved concept representation and accelerating joint training. To advance\nmulti-concept personalization research, we further contribute a high-quality\ndataset. We carefully collect images from various movies that contain multiple\ncharacters and manually generate the multi-concept question-answer samples. Our\ndataset features diverse movie types and question-answer types. 
We conduct\ncomprehensive qualitative and quantitative experiments to demonstrate that\nMC-LLaVA can achieve impressive multi-concept personalized responses, paving\nthe way for VLMs to become better user-specific assistants. The code and\ndataset will be publicly available at https://github.com/arctanxarc/MC-LLaVA.\n","authors":["Ruichuan An","Sihan Yang","Ming Lu","Kai Zeng","Yulin Luo","Ying Chen","Jiajun Cao","Hao Liang","Qi She","Shanghang Zhang","Wentao Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.11706v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04149v1","updated":"2024-12-05T13:23:06Z","published":"2024-12-05T13:23:06Z","title":"Frequency-Adaptive Low-Latency Object Detection Using Events and Frames","summary":" Fusing Events and RGB images for object detection leverages the robustness of\nEvent cameras in adverse environments and the rich semantic information\nprovided by RGB cameras. However, two critical mismatches: low-latency Events\n\\textit{vs.}~high-latency RGB frames; temporally sparse labels in training\n\\textit{vs.}~continuous flow in inference, significantly hinder the\nhigh-frequency fusion-based object detection. To address these challenges, we\npropose the \\textbf{F}requency-\\textbf{A}daptive Low-Latency \\textbf{O}bject\n\\textbf{D}etector (FAOD). FAOD aligns low-frequency RGB frames with\nhigh-frequency Events through an Align Module, which reinforces cross-modal\nstyle and spatial proximity to address the Event-RGB Mismatch. We further\npropose a training strategy, Time Shift, which enforces the module to align the\nprediction from temporally shifted Event-RGB pairs and their original\nrepresentation, that is, consistent with Event-aligned annotations. This\nstrategy enables the network to use high-frequency Event data as the primary\nreference while treating low-frequency RGB images as supplementary information,\nretaining the low-latency nature of the Event stream toward high-frequency\ndetection. 
Furthermore, we observe that these corrected Event-RGB pairs\ndemonstrate better generalization from low training frequency to higher\ninference frequencies compared to using Event data alone. Extensive experiments\non the PKU-DAVIS-SOD and DSEC-Detection datasets demonstrate that our FAOD\nachieves SOTA performance. Specifically, in the PKU-DAVIS-SOD Dataset, FAOD\nachieves 9.8 points improvement in terms of the mAP in fully paired Event-RGB\ndata with only a quarter of the parameters compared to SODFormer, and even\nmaintains robust performance (only a 3 points drop in mAP) under 80$\\times$\nEvent-RGB frequency mismatch.\n","authors":["Haitian Zhang","Xiangyuan Wang","Chang Xu","Xinya Wang","Fang Xu","Huai Yu","Lei Yu","Wen Yang"],"pdf_url":"https://arxiv.org/pdf/2412.04149v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04146v1","updated":"2024-12-05T13:16:47Z","published":"2024-12-05T13:16:47Z","title":"AnyDressing: Customizable Multi-Garment Virtual Dressing via Latent\n Diffusion Models","summary":" Recent advances in garment-centric image generation from text and image\nprompts based on diffusion models are impressive. However, existing methods\nlack support for various combinations of attire, and struggle to preserve the\ngarment details while maintaining faithfulness to the text prompts, limiting\ntheir performance across diverse scenarios. In this paper, we focus on a new\ntask, i.e., Multi-Garment Virtual Dressing, and we propose a novel AnyDressing\nmethod for customizing characters conditioned on any combination of garments\nand any personalized text prompts. AnyDressing comprises two primary networks\nnamed GarmentsNet and DressingNet, which are respectively dedicated to\nextracting detailed clothing features and generating customized images.\nSpecifically, we propose an efficient and scalable module called\nGarment-Specific Feature Extractor in GarmentsNet to individually encode\ngarment textures in parallel. 
This design prevents garment confusion while\nensuring network efficiency. Meanwhile, we design an adaptive\nDressing-Attention mechanism and a novel Instance-Level Garment Localization\nLearning strategy in DressingNet to accurately inject multi-garment features\ninto their corresponding regions. This approach efficiently integrates\nmulti-garment texture cues into generated images and further enhances\ntext-image consistency. Additionally, we introduce a Garment-Enhanced Texture\nLearning strategy to improve the fine-grained texture details of garments.\nThanks to our well-craft design, AnyDressing can serve as a plug-in module to\neasily integrate with any community control extensions for diffusion models,\nimproving the diversity and controllability of synthesized images. Extensive\nexperiments show that AnyDressing achieves state-of-the-art results.\n","authors":["Xinghui Li","Qichao Sun","Pengze Zhang","Fulong Ye","Zhichao Liao","Wanquan Feng","Songtao Zhao","Qian He"],"pdf_url":"https://arxiv.org/pdf/2412.04146v1.pdf","comment":"Project page: https://crayon-shinchan.github.io/AnyDressing/"},{"id":"http://arxiv.org/abs/2402.14123v2","updated":"2024-12-05T13:15:34Z","published":"2024-02-21T20:43:49Z","title":"DeiSAM: Segment Anything with Deictic Prompting","summary":" Large-scale, pre-trained neural networks have demonstrated strong\ncapabilities in various tasks, including zero-shot image segmentation. To\nidentify concrete objects in complex scenes, humans instinctively rely on\ndeictic descriptions in natural language, i.e., referring to something\ndepending on the context such as \"The object that is on the desk and behind the\ncup.\". However, deep learning approaches cannot reliably interpret such deictic\nrepresentations due to their lack of reasoning capabilities in complex\nscenarios. 
To remedy this issue, we propose DeiSAM -- a combination of large\npre-trained neural networks with differentiable logic reasoners -- for deictic\npromptable segmentation. Given a complex, textual segmentation description,\nDeiSAM leverages Large Language Models (LLMs) to generate first-order logic\nrules and performs differentiable forward reasoning on generated scene graphs.\nSubsequently, DeiSAM segments objects by matching them to the logically\ninferred image regions. As part of our evaluation, we propose the Deictic\nVisual Genome (DeiVG) dataset, containing paired visual input and complex,\ndeictic textual prompts. Our empirical results demonstrate that DeiSAM is a\nsubstantial improvement over purely data-driven baselines for deictic\npromptable segmentation.\n","authors":["Hikaru Shindo","Manuel Brack","Gopika Sudhakaran","Devendra Singh Dhami","Patrick Schramowski","Kristian Kersting"],"pdf_url":"https://arxiv.org/pdf/2402.14123v2.pdf","comment":"Published as a conference paper at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2412.04137v1","updated":"2024-12-05T13:04:10Z","published":"2024-12-05T13:04:10Z","title":"Text Change Detection in Multilingual Documents Using Image Comparison","summary":" Document comparison typically relies on optical character recognition (OCR)\nas its core technology. However, OCR requires the selection of appropriate\nlanguage models for each document and the performance of multilingual or hybrid\nmodels remains limited. To overcome these challenges, we propose text change\ndetection (TCD) using an image comparison model tailored for multilingual\ndocuments. Unlike OCR-based approaches, our method employs word-level text\nimage-to-image comparison to detect changes. Our model generates bidirectional\nchange segmentation maps between the source and target documents. To enhance\nperformance without requiring explicit text alignment or scaling preprocessing,\nwe employ correlations among multi-scale attention features. 
We also construct\na benchmark dataset comprising actual printed and scanned word pairs in various\nlanguages to evaluate our model. We validate our approach using our benchmark\ndataset and public benchmarks Distorted Document Images and the LRDE Document\nBinarization Dataset. We compare our model against state-of-the-art semantic\nsegmentation and change detection models, as well as to conventional OCR-based\nmodels.\n","authors":["Doyoung Park","Naresh Reddy Yarram","Sunjin Kim","Minkyu Kim","Seongho Cho","Taehee Lee"],"pdf_url":"https://arxiv.org/pdf/2412.04137v1.pdf","comment":"15pages, 11figures 6tables, wacv2025 accepted"},{"id":"http://arxiv.org/abs/2412.04130v1","updated":"2024-12-05T12:56:03Z","published":"2024-12-05T12:56:03Z","title":"Deep priors for satellite image restoration with accurate uncertainties","summary":" Satellite optical images, upon their on-ground receipt, offer a distorted\nview of the observed scene. Their restoration, classically including denoising,\ndeblurring, and sometimes super-resolution, is required before their\nexploitation. Moreover, quantifying the uncertainty related to this restoration\ncould be valuable by lowering the risk of hallucination and avoiding\npropagating these biases in downstream applications. Deep learning methods are\nnow state-of-the-art for satellite image restoration. However, they require to\ntrain a specific network for each sensor and they do not provide the associated\nuncertainties. This paper proposes a generic method involving a single network\nto restore images from several sensors and a scalable way to derive the\nuncertainties. We focus on deep regularization (DR) methods, which learn a deep\nprior on target images before plugging it into a model-based optimization\nscheme. First, we introduce VBLE-xz, which solves the inverse problem in the\nlatent space of a variational compressive autoencoder, estimating the\nuncertainty jointly in the latent and in the image spaces. 
It enables scalable\nposterior sampling with relevant and calibrated uncertainties. Second, we\npropose the denoiser-based method SatDPIR, adapted from DPIR, which efficiently\ncomputes accurate point estimates. We conduct a comprehensive set of\nexperiments on very high resolution simulated and real Pleiades images,\nasserting both the performance and robustness of the proposed methods. VBLE-xz\nand SatDPIR achieve state-of-the-art results compared to direct inversion\nmethods. In particular, VBLE-xz is a scalable method to get realistic posterior\nsamples and accurate uncertainties, while SatDPIR represents a compelling\nalternative to direct inversion methods when uncertainty quantification is not\nrequired.\n","authors":["Biquard Maud","Marie Chabert","Florence Genin","Christophe Latry","Thomas Oberlin"],"pdf_url":"https://arxiv.org/pdf/2412.04130v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03324v2","updated":"2024-12-05T12:52:31Z","published":"2024-12-04T13:56:44Z","title":"A Stitch in Time Saves Nine: Small VLM is a Precise Guidance for\n Accelerating Large VLMs","summary":" Vision-language models (VLMs) have shown remarkable success across various\nmulti-modal tasks, yet large VLMs encounter significant efficiency challenges\ndue to processing numerous visual tokens. A promising approach to accelerating\nlarge VLM inference is using partial information, such as attention maps from\nspecific layers, to assess token importance and prune less essential tokens.\nHowever, our study reveals three key insights: (i) Partial attention\ninformation is insufficient for accurately identifying critical visual tokens,\nresulting in suboptimal performance, especially at low token retention ratios;\n(ii) Global attention information, such as the attention map aggregated across\nall layers, more effectively preserves essential tokens and maintains\ncomparable performance under aggressive pruning. 
However, the attention maps\nfrom all layers requires a full inference pass, which increases computational\nload and is therefore impractical in existing methods; and (iii) The global\nattention map aggregated from a small VLM closely resembles that of a large\nVLM, suggesting an efficient alternative. Based on these findings, we introduce\na \\textbf{training-free} method, \\underline{\\textbf{S}}mall VLM\n\\underline{\\textbf{G}}uidance for accelerating \\underline{\\textbf{L}}arge VLMs\n(\\textbf{SGL}). Specifically, we employ the attention map aggregated from a\nsmall VLM to guide visual token pruning in a large VLM. Additionally, an early\nexiting mechanism is developed to fully use the small VLM's predictions,\ndynamically invoking the larger VLM only when necessary, yielding a superior\ntrade-off between accuracy and computation. Extensive evaluations across 11\nbenchmarks demonstrate the effectiveness and generalizability of SGL, achieving\nup to 91\\% pruning ratio for visual tokens while retaining competitive\nperformance.\n","authors":["Wangbo Zhao","Yizeng Han","Jiasheng Tang","Zhikai Li","Yibing Song","Kai Wang","Zhangyang Wang","Yang You"],"pdf_url":"https://arxiv.org/pdf/2412.03324v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02865v2","updated":"2024-12-05T12:38:58Z","published":"2024-12-03T22:00:12Z","title":"Memory-efficient Continual Learning with Neural Collapse Contrastive","summary":" Contrastive learning has significantly improved representation quality,\nenhancing knowledge transfer across tasks in continual learning (CL). However,\ncatastrophic forgetting remains a key challenge, as contrastive based methods\nprimarily focus on \"soft relationships\" or \"softness\" between samples, which\nshift with changing data distributions and lead to representation overlap\nacross tasks. 
Recently, the newly identified Neural Collapse phenomenon has\nshown promise in CL by focusing on \"hard relationships\" or \"hardness\" between\nsamples and fixed prototypes. However, this approach overlooks \"softness\",\ncrucial for capturing intra-class variability, and this rigid focus can also\npull old class representations toward current ones, increasing forgetting.\nBuilding on these insights, we propose Focal Neural Collapse Contrastive\n(FNC2), a novel representation learning loss that effectively balances both\nsoft and hard relationships. Additionally, we introduce the Hardness-Softness\nDistillation (HSD) loss to progressively preserve the knowledge gained from\nthese relationships across tasks. Our method outperforms state-of-the-art\napproaches, particularly in minimizing memory reliance. Remarkably, even\nwithout the use of memory, our approach rivals rehearsal-based methods,\noffering a compelling solution for data privacy concerns.\n","authors":["Trung-Anh Dang","Vincent Nguyen","Ngoc-Son Vu","Christel Vrain"],"pdf_url":"https://arxiv.org/pdf/2412.02865v2.pdf","comment":"Accepted at WACV 2025"},{"id":"http://arxiv.org/abs/2412.04120v1","updated":"2024-12-05T12:38:18Z","published":"2024-12-05T12:38:18Z","title":"CrossSDF: 3D Reconstruction of Thin Structures From Cross-Sections","summary":" Reconstructing complex structures from planar cross-sections is a challenging\nproblem, with wide-reaching applications in medical imaging, manufacturing, and\ntopography. Out-of-the-box point cloud reconstruction methods can often fail\ndue to the data sparsity between slicing planes, while current bespoke methods\nstruggle to reconstruct thin geometric structures and preserve topological\ncontinuity. This is important for medical applications where thin vessel\nstructures are present in CT and MRI scans. This paper introduces \\method, a\nnovel approach for extracting a 3D signed distance field from 2D signed\ndistances generated from planar contours. 
Our approach makes the training of\nneural SDFs contour-aware by using losses designed for the case where geometry\nis known within 2D slices. Our results demonstrate a significant improvement\nover existing methods, effectively reconstructing thin structures and producing\naccurate 3D models without the interpolation artifacts or over-smoothing of\nprior approaches.\n","authors":["Thomas Walker","Salvatore Esposito","Daniel Rebain","Amir Vaxman","Arno Onken","Changjian Li","Oisin Mac Aodha"],"pdf_url":"https://arxiv.org/pdf/2412.04120v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04117v1","updated":"2024-12-05T12:36:12Z","published":"2024-12-05T12:36:12Z","title":"MVUDA: Unsupervised Domain Adaptation for Multi-view Pedestrian\n Detection","summary":" We address multi-view pedestrian detection in a setting where labeled data is\ncollected using a multi-camera setup different from the one used for testing.\nWhile recent multi-view pedestrian detectors perform well on the camera rig\nused for training, their performance declines when applied to a different\nsetup. To facilitate seamless deployment across varied camera rigs, we propose\nan unsupervised domain adaptation (UDA) method that adapts the model to new\nrigs without requiring additional labeled data. Specifically, we leverage the\nmean teacher self-training framework with a novel pseudo-labeling technique\ntailored to multi-view pedestrian detection. This method achieves\nstate-of-the-art performance on multiple benchmarks, including\nMultiviewX$\\rightarrow$Wildtrack. Unlike previous methods, our approach\neliminates the need for external labeled monocular datasets, thereby reducing\nreliance on labeled data. Extensive evaluations demonstrate the effectiveness\nof our method and validate key design choices. 
By enabling robust adaptation\nacross camera setups, our work enhances the practicality of multi-view\npedestrian detectors and establishes a strong UDA baseline for future research.\n","authors":["Erik Brorsson","Lennart Svensson","Kristofer Bengtsson","Knut Åkesson"],"pdf_url":"https://arxiv.org/pdf/2412.04117v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04114v1","updated":"2024-12-05T12:32:45Z","published":"2024-12-05T12:32:45Z","title":"Thermal and RGB Images Work Better Together in Wind Turbine Damage\n Detection","summary":" The inspection of wind turbine blades (WTBs) is crucial for ensuring their\nstructural integrity and operational efficiency. Traditional inspection methods\ncan be dangerous and inefficient, prompting the use of unmanned aerial vehicles\n(UAVs) that access hard-to-reach areas and capture high-resolution imagery. In\nthis study, we address the challenge of enhancing defect detection on WTBs by\nintegrating thermal and RGB images obtained from UAVs. We propose a\nmultispectral image composition method that combines thermal and RGB imagery\nthrough spatial coordinate transformation, key point detection, binary\ndescriptor creation, and weighted image overlay. Using a benchmark dataset of\nWTB images annotated for defects, we evaluated several state-of-the-art object\ndetection models. Our results show that composite images significantly improve\ndefect detection efficiency. Specifically, the YOLOv8 model's accuracy\nincreased from 91% to 95%, precision from 89% to 94%, recall from 85% to 92%,\nand F1-score from 87% to 93%. The number of false positives decreased from 6 to\n3, and missed defects reduced from 5 to 2. 
These findings demonstrate that\nintegrating thermal and RGB imagery enhances defect detection on WTBs,\ncontributing to improved maintenance and reliability.\n","authors":["Serhii Svystun","Oleksandr Melnychenko","Pavlo Radiuk","Oleg Savenko","Anatoliy Sachenko","Andrii Lysyi"],"pdf_url":"https://arxiv.org/pdf/2412.04114v1.pdf","comment":"Unmanned aerial vehicle, image composition, multispectral images,\n green energy, data quality management, weighted overlay"},{"id":"http://arxiv.org/abs/2412.04111v1","updated":"2024-12-05T12:29:12Z","published":"2024-12-05T12:29:12Z","title":"Adult Glioma Segmentation in Sub-Saharan Africa using Transfer Learning\n on Stratified Finetuning Data","summary":" Gliomas, a kind of brain tumor characterized by high mortality, present\nsubstantial diagnostic challenges in low- and middle-income countries,\nparticularly in Sub-Saharan Africa. This paper introduces a novel approach to\nglioma segmentation using transfer learning to address challenges in\nresource-limited regions with minimal and low-quality MRI data. We leverage\npre-trained deep learning models, nnU-Net and MedNeXt, and apply a stratified\nfine-tuning strategy using the BraTS2023-Adult-Glioma and BraTS-Africa\ndatasets. Our method exploits radiomic analysis to create stratified training\nfolds, model training on a large brain tumor dataset, and transfer learning to\nthe Sub-Saharan context. A weighted model ensembling strategy and adaptive\npost-processing are employed to enhance segmentation accuracy. The evaluation\nof our proposed method on unseen validation cases on the BraTS-Africa 2024 task\nresulted in lesion-wise mean Dice scores of 0.870, 0.865, and 0.926, for\nenhancing tumor, tumor core, and whole tumor regions and was ranked first for\nthe challenge. 
Our approach highlights the ability of integrated\nmachine-learning techniques to bridge the gap between the medical imaging\ncapabilities of resource-limited countries and established developed regions.\nBy tailoring our methods to a target population's specific needs and\nconstraints, we aim to enhance diagnostic capabilities in isolated\nenvironments. Our findings underscore the importance of approaches like local\ndata integration and stratification refinement to address healthcare\ndisparities, ensure practical applicability, and enhance impact.\n","authors":["Abhijeet Parida","Daniel Capellán-Martín","Zhifan Jiang","Austin Tapp","Xinyang Liu","Syed Muhammad Anwar","María J. Ledesma-Carbayo","Marius George Linguraru"],"pdf_url":"https://arxiv.org/pdf/2412.04111v1.pdf","comment":"10 pages, 3 figures, 3 tables. This paper was accepted at\n MICCAI-BraTS 2024"},{"id":"http://arxiv.org/abs/2411.19824v2","updated":"2024-12-05T12:18:04Z","published":"2024-11-29T16:34:46Z","title":"SAT-HMR: Real-Time Multi-Person 3D Mesh Estimation via Scale-Adaptive\n Tokens","summary":" We propose a one-stage framework for real-time multi-person 3D human mesh\nestimation from a single RGB image. While current one-stage methods, which\nfollow a DETR-style pipeline, achieve state-of-the-art (SOTA) performance with\nhigh-resolution inputs, we observe that this particularly benefits the\nestimation of individuals in smaller scales of the image (e.g., those far from\nthe camera), but at the cost of significantly increased computation overhead.\nTo address this, we introduce scale-adaptive tokens that are dynamically\nadjusted based on the relative scale of each individual in the image within the\nDETR framework. Specifically, individuals in smaller scales are processed at\nhigher resolutions, larger ones at lower resolutions, and background regions\nare further distilled. 
These scale-adaptive tokens more efficiently encode the\nimage features, facilitating subsequent decoding to regress the human mesh,\nwhile allowing the model to allocate computational resources more effectively\nand focus on more challenging cases. Experiments show that our method preserves\nthe accuracy benefits of high-resolution processing while substantially\nreducing computational cost, achieving real-time inference with performance\ncomparable to SOTA methods.\n","authors":["Chi Su","Xiaoxuan Ma","Jiajun Su","Yizhou Wang"],"pdf_url":"https://arxiv.org/pdf/2411.19824v2.pdf","comment":"16 pages, 12 figures"},{"id":"http://arxiv.org/abs/2412.04097v1","updated":"2024-12-05T12:03:02Z","published":"2024-12-05T12:03:02Z","title":"D-LORD for Motion Stylization","summary":" This paper introduces a novel framework named D-LORD (Double Latent\nOptimization for Representation Disentanglement), which is designed for motion\nstylization (motion style transfer and motion retargeting). The primary\nobjective of this framework is to separate the class and content information\nfrom a given motion sequence using a data-driven latent optimization approach.\nHere, class refers to person-specific style, such as a particular emotion or an\nindividual's identity, while content relates to the style-agnostic aspect of an\naction, such as walking or jumping, as universally understood concepts. The key\nadvantage of D-LORD is its ability to perform style transfer without needing\npaired motion data. Instead, it utilizes class and content labels during the\nlatent optimization process. By disentangling the representation, the framework\nenables the transformation of one motion sequences style to another's style\nusing Adaptive Instance Normalization. The proposed D-LORD framework is\ndesigned with a focus on generalization, allowing it to handle different class\nand content labels for various applications. 
Additionally, it can generate\ndiverse motion sequences when specific class and content labels are provided.\nThe framework's efficacy is demonstrated through experimentation on three\ndatasets: the CMU XIA dataset for motion style transfer, the MHAD dataset, and\nthe RRIS Ability dataset for motion retargeting. Notably, this paper presents\nthe first generalized framework for motion style transfer and motion\nretargeting, showcasing its potential contributions in this area.\n","authors":["Meenakshi Gupta","Mingyuan Lei","Tat-Jen Cham","Hwee Kuan Lee"],"pdf_url":"https://arxiv.org/pdf/2412.04097v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04095v1","updated":"2024-12-05T12:01:20Z","published":"2024-12-05T12:01:20Z","title":"HyperFLINT: Hypernetwork-based Flow Estimation and Temporal\n Interpolation for Scientific Ensemble Visualization","summary":" We present HyperFLINT (Hypernetwork-based FLow estimation and temporal\nINTerpolation), a novel deep learning-based approach for estimating flow\nfields, temporally interpolating scalar fields, and facilitating parameter\nspace exploration in spatio-temporal scientific ensemble data. This work\naddresses the critical need to explicitly incorporate ensemble parameters into\nthe learning process, as traditional methods often neglect these, limiting\ntheir ability to adapt to diverse simulation settings and provide meaningful\ninsights into the data dynamics. HyperFLINT introduces a hypernetwork to\naccount for simulation parameters, enabling it to generate accurate\ninterpolations and flow fields for each timestep by dynamically adapting to\nvarying conditions, thereby outperforming existing parameter-agnostic\napproaches. The architecture features modular neural blocks with convolutional\nand deconvolutional layers, supported by a hypernetwork that generates weights\nfor the main network, allowing the model to better capture intricate simulation\ndynamics. 
A series of experiments demonstrates HyperFLINT's significantly\nimproved performance in flow field estimation and temporal interpolation, as\nwell as its potential in enabling parameter space exploration, offering\nvaluable insights into complex scientific ensembles.\n","authors":["Hamid Gadirov","Qi Wu","David Bauer","Kwan-Liu Ma","Jos Roerdink","Steffen Frey"],"pdf_url":"https://arxiv.org/pdf/2412.04095v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04094v1","updated":"2024-12-05T12:00:00Z","published":"2024-12-05T12:00:00Z","title":"Magnetic Resonance Imaging Feature-Based Subtyping and Model Ensemble\n for Enhanced Brain Tumor Segmentation","summary":" Accurate and automatic segmentation of brain tumors in multi-parametric\nmagnetic resonance imaging (mpMRI) is essential for quantitative measurements,\nwhich play an increasingly important role in clinical diagnosis and prognosis.\nThe International Brain Tumor Segmentation (BraTS) Challenge 2024 offers a\nunique benchmarking opportunity, including various types of brain tumors in\nboth adult and pediatric populations, such as pediatric brain tumors (PED),\nmeningiomas (MEN-RT) and brain metastases (MET), among others. Compared to\nprevious editions, BraTS 2024 has implemented changes to substantially increase\nclinical relevance, such as refined tumor regions for evaluation. We propose a\ndeep learning-based ensemble approach that integrates state-of-the-art\nsegmentation models. Additionally, we introduce innovative, adaptive pre- and\npost-processing techniques that employ MRI-based radiomic analyses to\ndifferentiate tumor subtypes. Given the heterogeneous nature of the tumors\npresent in the BraTS datasets, this approach enhances the precision and\ngeneralizability of segmentation models. On the final testing sets, our method\nachieved mean lesion-wise Dice similarity coefficients of 0.926, 0.801, and\n0.688 for the whole tumor in PED, MEN-RT, and MET, respectively. 
These results\ndemonstrate the effectiveness of our approach in improving segmentation\nperformance and generalizability for various brain tumor types.\n","authors":["Zhifan Jiang","Daniel Capellán-Martín","Abhijeet Parida","Austin Tapp","Xinyang Liu","María J. Ledesma-Carbayo","Syed Muhammad Anwar","Marius George Linguraru"],"pdf_url":"https://arxiv.org/pdf/2412.04094v1.pdf","comment":"11 pages, 4 figures, 3 tables. This paper was accepted at\n MICCAI-BraTS 2024"},{"id":"http://arxiv.org/abs/2412.04090v1","updated":"2024-12-05T11:52:20Z","published":"2024-12-05T11:52:20Z","title":"LossAgent: Towards Any Optimization Objectives for Image Processing with\n LLM Agents","summary":" We present the first loss agent, dubbed LossAgent, for low-level image\nprocessing tasks, e.g., image super-resolution and restoration, intending to\nachieve any customized optimization objectives of low-level image processing in\ndifferent practical applications. Notably, not all optimization objectives,\nsuch as complex hand-crafted perceptual metrics, text description, and\nintricate human feedback, can be instantiated with existing low-level losses,\ne.g., MSE loss. which presents a crucial challenge in optimizing image\nprocessing networks in an end-to-end manner. To eliminate this, our LossAgent\nintroduces the powerful large language model (LLM) as the loss agent, where the\nrich textual understanding of prior knowledge empowers the loss agent with the\npotential to understand complex optimization objectives, trajectory, and state\nfeedback from external environments in the optimization process of the\nlow-level image processing networks. In particular, we establish the loss\nrepository by incorporating existing loss functions that support the end-to-end\noptimization for low-level image processing. 
Then, we design the\noptimization-oriented prompt engineering for the loss agent to actively and\nintelligently decide the compositional weights for each loss in the repository\nat each optimization interaction, thereby achieving the required optimization\ntrajectory for any customized optimization objectives. Extensive experiments on\nthree typical low-level image processing tasks and multiple optimization\nobjectives have shown the effectiveness and applicability of our proposed\nLossAgent. Code and pre-trained models will be available at\nhttps://github.com/lbc12345/LossAgent.\n","authors":["Bingchen Li","Xin Li","Yiting Lu","Zhibo Chen"],"pdf_url":"https://arxiv.org/pdf/2412.04090v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.13569v2","updated":"2024-12-05T11:50:24Z","published":"2024-10-17T17:17:09Z","title":"Learning on Model Weights using Tree Experts","summary":" The increasing availability of public models begs the question: can we train\nneural networks that use other networks as input? Such models allow us to study\ndifferent aspects of a given neural network, for example, determining the\ncategories in a model's training dataset. However, machine learning on model\nweights is challenging as they often exhibit significant variation unrelated to\nthe models' semantic properties (nuisance variation). Here, we identify a key\nproperty of real-world models: most public models belong to a small set of\nModel Trees, where all models within a tree are fine-tuned from a common\nancestor (e.g., a foundation model). Importantly, we find that within each tree\nthere is less nuisance variation between models. Concretely, while learning\nacross Model Trees requires complex architectures, even a linear classifier\ntrained on a single model layer often works within trees. While effective,\nthese linear classifiers are computationally expensive, especially when dealing\nwith larger models that have many parameters. 
To address this, we introduce\nProbing Experts (ProbeX), a theoretically motivated and lightweight method.\nNotably, ProbeX is the first probing method specifically designed to learn from\nthe weights of a single hidden model layer. We demonstrate the effectiveness of\nProbeX by predicting the categories in a model's training dataset based only on\nits weights. Excitingly, ProbeX can also map the weights of Stable Diffusion\ninto a shared weight-language embedding space, enabling zero-shot model\nclassification.\n","authors":["Eliahu Horwitz","Bar Cavia","Jonathan Kahana","Yedid Hoshen"],"pdf_url":"https://arxiv.org/pdf/2410.13569v2.pdf","comment":"Project page: https://horwitz.ai/probex/"},{"id":"http://arxiv.org/abs/2412.04086v1","updated":"2024-12-05T11:48:54Z","published":"2024-12-05T11:48:54Z","title":"BodyMetric: Evaluating the Realism of HumanBodies in Text-to-Image\n Generation","summary":" Accurately generating images of human bodies from text remains a challenging\nproblem for state of the art text-to-image models. Commonly observed\nbody-related artifacts include extra or missing limbs, unrealistic poses,\nblurred body parts, etc. Currently, evaluation of such artifacts relies heavily\non time-consuming human judgments, limiting the ability to benchmark models at\nscale. We address this by proposing BodyMetric, a learnable metric that\npredicts body realism in images. BodyMetric is trained on realism labels and\nmulti-modal signals including 3D body representations inferred from the input\nimage, and textual descriptions. In order to facilitate this approach, we\ndesign an annotation pipeline to collect expert ratings on human body realism\nleading to a new dataset for this task, namely, BodyRealism. Ablation studies\nsupport our architectural choices for BodyMetric and the importance of\nleveraging a 3D human body prior in capturing body-related artifacts in 2D\nimages. 
In comparison to concurrent metrics which evaluate general user\npreference in images, BodyMetric specifically reflects body-related artifacts.\nWe demonstrate the utility of BodyMetric through applications that were\npreviously infeasible at scale. In particular, we use BodyMetric to benchmark\nthe generation ability of text-to-image models to produce realistic human\nbodies. We also demonstrate the effectiveness of BodyMetric in ranking\ngenerated images based on the predicted realism scores.\n","authors":["Nefeli Andreou","Varsha Vivek","Ying Wang","Alex Vorobiov","Tiffany Deng","Raja Bala","Larry Davis","Betty Mohler Tesch"],"pdf_url":"https://arxiv.org/pdf/2412.04086v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.11624v3","updated":"2024-12-05T11:47:49Z","published":"2024-06-17T15:07:55Z","title":"Words in Motion: Extracting Interpretable Control Vectors for Motion\n Transformers","summary":" Transformer-based models generate hidden states that are difficult to\ninterpret. In this work, we aim to interpret these hidden states and control\nthem at inference, with a focus on motion forecasting. We use linear probes to\nmeasure neural collapse towards interpretable motion features in hidden states.\nHigh probing accuracy implies meaningful directions and distances between\nhidden states of opposing features, which we use to fit interpretable control\nvectors for activation steering at inference. To optimize our control vectors,\nwe use sparse autoencoders with fully-connected, convolutional, MLPMixer layers\nand various activation functions. Notably, we show that enforcing sparsity in\nhidden states leads to a more linear relationship between control vector\ntemperatures and forecasts. Our approach enables mechanistic interpretability\nand zero-shot generalization to unseen dataset characteristics with negligible\ncomputational overhead. 
Our implementation is available at\nhttps://github.com/kit-mrt/future-motion\n","authors":["Omer Sahin Tas","Royden Wagner"],"pdf_url":"https://arxiv.org/pdf/2406.11624v3.pdf","comment":"Add autoencoders with convolutional, MLPMixer layers, and JumpReLU\n activations"},{"id":"http://arxiv.org/abs/2411.15390v2","updated":"2024-12-05T11:46:40Z","published":"2024-11-23T00:09:42Z","title":"The Hatching-Box: A Novel System for Automated Monitoring and\n Quantification of \\textit{Drosophila melanogaster} Developmental Behavior","summary":" In this paper we propose the Hatching-Box, a novel imaging and analysis\nsystem to automatically monitor and quantify the developmental behavior of\nDrosophila in standard rearing vials and during regular rearing routines,\nrendering explicit experiments obsolete. This is achieved by combining custom\ntailored imaging hardware with dedicated detection and tracking algorithms,\nenabling the quantification of larvae, filled/empty pupae and flies over\nmultiple days. Given the affordable and reproducible design of the Hatching-Box\nin combination with our generic client/server-based software, the system can\neasily be scaled to monitor an arbitrary amount of rearing vials\nsimultaneously. We evaluated our system on a curated image dataset comprising\nnearly 470,000 annotated objects and performed several studies on real world\nexperiments. We successfully reproduced results from well-established circadian\nexperiments by comparing the eclosion periods of wild type flies to the clock\nmutants $\\textit{per}^{short}$, $\\textit{per}^{long}$ and $\\textit{per}^0$\nwithout involvement of any manual labor. Furthermore we show, that the\nHatching-Box is able to extract additional information about group behavior as\nwell as to reconstruct the whole life-cycle of the individual specimens. 
These\nresults not only demonstrate the applicability of our system for long-term\nexperiments but also indicate its benefits for automated monitoring in the\ngeneral cultivation process.\n","authors":["Julian Bigge","Maite Ogueta","Luis Garcia","Benjamin Risse"],"pdf_url":"https://arxiv.org/pdf/2411.15390v2.pdf","comment":"17 pages, 6 figures"},{"id":"http://arxiv.org/abs/2412.04083v1","updated":"2024-12-05T11:36:37Z","published":"2024-12-05T11:36:37Z","title":"Unified Framework for Open-World Compositional Zero-shot Learning","summary":" Open-World Compositional Zero-Shot Learning (OW-CZSL) addresses the challenge\nof recognizing novel compositions of known primitives and entities. Even though\nprior works utilize language knowledge for recognition, such approaches exhibit\nlimited interactions between language-image modalities. Our approach primarily\nfocuses on enhancing the inter-modality interactions through fostering richer\ninteractions between image and textual data. Additionally, we introduce a novel\nmodule aimed at alleviating the computational burden associated with exhaustive\nexploration of all possible compositions during the inference stage. While\nprevious methods exclusively learn compositions jointly or independently, we\nintroduce an advanced hybrid procedure that leverages both learning mechanisms\nto generate final predictions. Our proposed model, achieves state-of-the-art in\nOW-CZSL in three datasets, while surpassing Large Vision Language Models (LLVM)\nin two datasets.\n","authors":["Hirunima Jayasekara","Khoi Pham","Nirat Saini","Abhinav Shrivastava"],"pdf_url":"https://arxiv.org/pdf/2412.04083v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18245v2","updated":"2024-12-05T11:29:56Z","published":"2024-07-25T17:58:17Z","title":"VGGHeads: 3D Multi Head Alignment with a Large-Scale Synthetic Dataset","summary":" Human head detection, keypoint estimation, and 3D head model fitting are\nessential tasks with many applications. 
However, traditional real-world\ndatasets often suffer from bias, privacy, and ethical concerns, and they have\nbeen recorded in laboratory environments, which makes it difficult for trained\nmodels to generalize. Here, we introduce \\method -- a large-scale synthetic\ndataset generated with diffusion models for human head detection and 3D mesh\nestimation. Our dataset comprises over 1 million high-resolution images, each\nannotated with detailed 3D head meshes, facial landmarks, and bounding boxes.\nUsing this dataset, we introduce a new model architecture capable of\nsimultaneous head detection and head mesh reconstruction from a single image in\na single step. Through extensive experimental evaluations, we demonstrate that\nmodels trained on our synthetic data achieve strong performance on real images.\nFurthermore, the versatility of our dataset makes it applicable across a broad\nspectrum of tasks, offering a general and comprehensive representation of human\nheads.\n","authors":["Orest Kupyn","Eugene Khvedchenia","Christian Rupprecht"],"pdf_url":"https://arxiv.org/pdf/2407.18245v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18017v2","updated":"2024-12-05T11:21:16Z","published":"2024-09-26T16:25:48Z","title":"Transferring disentangled representations: bridging the gap between\n synthetic and real images","summary":" Developing meaningful and efficient representations that separate the\nfundamental structure of the data generation mechanism is crucial in\nrepresentation learning. However, Disentangled Representation Learning has not\nfully shown its potential on real images, because of correlated generative\nfactors, their resolution and limited access to ground truth labels.\nSpecifically on the latter, we investigate the possibility of leveraging\nsynthetic data to learn general-purpose disentangled representations applicable\nto real data, discussing the effect of fine-tuning and what properties of\ndisentanglement are preserved after the transfer. 
We provide an extensive\nempirical study to address these issues. In addition, we propose a new\ninterpretable intervention-based metric, to measure the quality of factors\nencoding in the representation. Our results indicate that some level of\ndisentanglement, transferring a representation from synthetic to real data, is\npossible and effective.\n","authors":["Jacopo Dapueto","Nicoletta Noceti","Francesca Odone"],"pdf_url":"https://arxiv.org/pdf/2409.18017v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04077v1","updated":"2024-12-05T11:17:57Z","published":"2024-12-05T11:17:57Z","title":"SoRA: Singular Value Decomposed Low-Rank Adaptation for Domain\n Generalizable Representation Learning","summary":" Domain generalization (DG) aims to adapt a model using one or multiple source\ndomains to ensure robust performance in unseen target domains. Recently,\nParameter-Efficient Fine-Tuning (PEFT) of foundation models has shown promising\nresults in the context of DG problem. Nevertheless, existing PEFT methods still\nstruggle to strike a balance between preserving generalizable components of the\npre-trained model and learning task-specific features. To gain insights into\nthe distribution of generalizable components, we begin by analyzing the\npre-trained weights through the lens of singular value decomposition. Building\non these insights, we introduce Singular Value Decomposed Low-Rank Adaptation\n(SoRA), an approach that selectively tunes minor singular components while\nkeeping the residual parts frozen. SoRA effectively retains the generalization\nability of the pre-trained model while efficiently acquiring task-specific\nskills. Furthermore, we freeze domain-generalizable blocks and employ an\nannealing weight decay strategy, thereby achieving an optimal balance in the\ndelicate trade-off between generalizability and discriminability. 
SoRA attains\nstate-of-the-art results on multiple benchmarks that span both domain\ngeneralized semantic segmentation to domain generalized object detection. In\naddition, our methods introduce no additional inference overhead or\nregularization loss, maintain compatibility with any backbone or head, and are\ndesigned to be versatile, allowing easy integration into a wide range of tasks.\n","authors":["Seokju Yun","Seunghye Chae","Dongheon Lee","Youngmin Ro"],"pdf_url":"https://arxiv.org/pdf/2412.04077v1.pdf","comment":"Project page: https://ysj9909.github.io/SoRA.github.io/"},{"id":"http://arxiv.org/abs/2412.04073v1","updated":"2024-12-05T11:11:39Z","published":"2024-12-05T11:11:39Z","title":"TransAdapter: Vision Transformer for Feature-Centric Unsupervised Domain\n Adaptation","summary":" Unsupervised Domain Adaptation (UDA) aims to utilize labeled data from a\nsource domain to solve tasks in an unlabeled target domain, often hindered by\nsignificant domain gaps. Traditional CNN-based methods struggle to fully\ncapture complex domain relationships, motivating the shift to vision\ntransformers like the Swin Transformer, which excel in modeling both local and\nglobal dependencies. In this work, we propose a novel UDA approach leveraging\nthe Swin Transformer with three key modules. A Graph Domain Discriminator\nenhances domain alignment by capturing inter-pixel correlations through graph\nconvolutions and entropy-based attention differentiation. An Adaptive Double\nAttention module combines Windows and Shifted Windows attention with dynamic\nreweighting to align long-range and local features effectively. Finally, a\nCross-Feature Transform modifies Swin Transformer blocks to improve\ngeneralization across domains. Extensive benchmarks confirm the\nstate-of-the-art performance of our versatile method, which requires no\ntask-specific alignment modules, establishing its adaptability to diverse\napplications.\n","authors":["A. Enes Doruk","Erhan Oztop","Hasan F. 
Ates"],"pdf_url":"https://arxiv.org/pdf/2412.04073v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.07330v3","updated":"2024-12-05T11:03:41Z","published":"2023-01-18T06:37:24Z","title":"FPANet: Frequency-based Video Demoireing using Frame-level Post\n Alignment","summary":" Moire patterns, created by the interference between overlapping grid patterns\nin the pixel space, degrade the visual quality of images and videos. Therefore,\nremoving such patterns~(demoireing) is crucial, yet remains a challenge due to\ntheir complexities in sizes and distortions. Conventional methods mainly tackle\nthis task by only exploiting the spatial domain of the input images, limiting\ntheir capabilities in removing large-scale moire patterns. Therefore, this work\nproposes FPANet, an image-video demoireing network that learns filters in both\nfrequency and spatial domains, improving the restoration quality by removing\nvarious sizes of moire patterns. To further enhance, our model takes multiple\nconsecutive frames, learning to extract frame-invariant content features and\noutputting better quality temporally consistent images. We demonstrate the\neffectiveness of our proposed method with a publicly available large-scale\ndataset, observing that ours outperforms the state-of-the-art approaches in\nterms of image and video quality metrics and visual experience.\n","authors":["Gyeongrok Oh","Sungjune Kim","Heon Gu","Sang Ho Yoon","Jinkyu Kim","Sangpil Kim"],"pdf_url":"https://arxiv.org/pdf/2301.07330v3.pdf","comment":"Accepted version, to appear in Neural Networks"},{"id":"http://arxiv.org/abs/2412.04062v1","updated":"2024-12-05T10:57:08Z","published":"2024-12-05T10:57:08Z","title":"ZipAR: Accelerating Autoregressive Image Generation through Spatial\n Locality","summary":" In this paper, we propose ZipAR, a training-free, plug-and-play parallel\ndecoding framework for accelerating auto-regressive (AR) visual generation. 
The\nmotivation stems from the observation that images exhibit local structures, and\nspatially distant regions tend to have minimal interdependence. Given a\npartially decoded set of visual tokens, in addition to the original next-token\nprediction scheme in the row dimension, the tokens corresponding to spatially\nadjacent regions in the column dimension can be decoded in parallel, enabling\nthe ``next-set prediction'' paradigm. By decoding multiple tokens\nsimultaneously in a single forward pass, the number of forward passes required\nto generate an image is significantly reduced, resulting in a substantial\nimprovement in generation efficiency. Experiments demonstrate that ZipAR can\nreduce the number of model forward passes by up to 91% on the Emu3-Gen model\nwithout requiring any additional retraining.\n","authors":["Yefei He","Feng Chen","Yuanyu He","Shaoxuan He","Hong Zhou","Kaipeng Zhang","Bohan Zhuang"],"pdf_url":"https://arxiv.org/pdf/2412.04062v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2212.05005v4","updated":"2024-12-05T10:52:25Z","published":"2022-12-09T17:45:36Z","title":"Memories are One-to-Many Mapping Alleviators in Talking Face Generation","summary":" Talking face generation aims at generating photo-realistic video portraits of\na target person driven by input audio. Due to its nature of one-to-many mapping\nfrom the input audio to the output video (e.g., one speech content may have\nmultiple feasible visual appearances), learning a deterministic mapping like\nprevious works brings ambiguity during training, and thus causes inferior\nvisual results. Although this one-to-many mapping could be alleviated in part\nby a two-stage framework (i.e., an audio-to-expression model followed by a\nneural-rendering model), it is still insufficient since the prediction is\nproduced without enough information (e.g., emotions, wrinkles, etc.). 
In this\npaper, we propose MemFace to complement the missing information with an\nimplicit memory and an explicit memory that follow the sense of the two stages\nrespectively. More specifically, the implicit memory is employed in the\naudio-to-expression model to capture high-level semantics in the\naudio-expression shared space, while the explicit memory is employed in the\nneural-rendering model to help synthesize pixel-level details. Our experimental\nresults show that our proposed MemFace surpasses all the state-of-the-art\nresults across multiple scenarios consistently and significantly.\n","authors":["Anni Tang","Tianyu He","Xu Tan","Jun Ling","Li Song"],"pdf_url":"https://arxiv.org/pdf/2212.05005v4.pdf","comment":"IEEE Transactions on Pattern Analysis and Machine Intelligence\n (2024). Project page: see https://memoryface.github.io"},{"id":"http://arxiv.org/abs/2410.14462v2","updated":"2024-12-05T10:34:11Z","published":"2024-10-18T13:44:29Z","title":"LUDVIG: Learning-free Uplifting of 2D Visual features to Gaussian\n Splatting scenes","summary":" We address the problem of extending the capabilities of vision foundation\nmodels such as DINO, SAM, and CLIP, to 3D tasks. Specifically, we introduce a\nnovel method to uplift 2D image features into 3D Gaussian Splatting scenes.\nUnlike traditional approaches that rely on minimizing a reconstruction loss,\nour method employs a simpler and more efficient feature aggregation technique,\naugmented by a graph diffusion mechanism. Graph diffusion enriches features\nfrom a given model, such as CLIP, by leveraging pairwise similarities that\nencode 3D geometry or similarities induced by another embedding like DINOv2.\nOur approach achieves performance comparable to the state of the art on\nmultiple downstream tasks while delivering significant speed-ups. 
Notably, we\nobtain competitive segmentation results using generic DINOv2 features, despite\nDINOv2 not being trained on millions of annotated segmentation masks like SAM.\nWhen applied to CLIP features, our method demonstrates strong performance in\nopen-vocabulary, language-based object detection tasks, highlighting the\nversatility of our approach.\n","authors":["Juliette Marrie","Romain Menegaux","Michael Arbel","Diane Larlus","Julien Mairal"],"pdf_url":"https://arxiv.org/pdf/2410.14462v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.07500v3","updated":"2024-12-05T10:23:27Z","published":"2024-06-11T17:35:39Z","title":"SPIN: Spacecraft Imagery for Navigation","summary":" The scarcity of data acquired under actual space operational conditions poses\na significant challenge for developing learning-based visual navigation\nalgorithms crucial for autonomous spacecraft navigation. This data shortage is\nprimarily due to the prohibitive costs and inherent complexities of space\noperations. While existing datasets, predominantly relying on\ncomputer-simulated data, have partially addressed this gap, they present\nnotable limitations. Firstly, these datasets often utilize proprietary image\ngeneration tools, restricting the evaluation of navigation methods in novel,\nunseen scenarios. Secondly, they provide limited ground-truth data, typically\nfocusing solely on the spacecraft's translation and rotation relative to the\ncamera. To address these limitations, we present SPIN (SPacecraft Imagery for\nNavigation), an open-source spacecraft image generation tool designed to\nsupport a wide range of visual navigation scenarios in space, with a particular\nfocus on relative navigation tasks. SPIN provides multiple modalities of\nground-truth data and allows researchers to employ custom 3D models of\nsatellites, define specific camera-relative poses, and adjust settings such as\ncamera parameters or environmental illumination conditions. 
We also propose a\nmethod for exploiting our tool as a data augmentation module. We validate our\ntool on the spacecraft pose estimation task by training with a SPIN-generated\nreplica of SPEED+, reaching a 47% average error reduction on SPEED+ testbed\ndata (that simulates realistic space conditions), further reducing it to a 60%\nerror reduction when using SPIN as a data augmentation method. Both the SPIN\ntool (and source code) and our SPIN-generated version of SPEED+ will be\npublicly released upon paper acceptance on GitHub.\nhttps://github.com/vpulab/SPIN\n","authors":["Javier Montalvo","Juan Ignacio Bravo Pérez-Villar","Álvaro García-Martín","Pablo Carballeira","Jesús Bescós"],"pdf_url":"https://arxiv.org/pdf/2406.07500v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04039v1","updated":"2024-12-05T10:23:16Z","published":"2024-12-05T10:23:16Z","title":"Benchmarking and Enhancing Surgical Phase Recognition Models for\n Robotic-Assisted Esophagectomy","summary":" Robotic-assisted minimally invasive esophagectomy (RAMIE) is a recognized\ntreatment for esophageal cancer, offering better patient outcomes compared to\nopen surgery and traditional minimally invasive surgery. RAMIE is highly\ncomplex, spanning multiple anatomical areas and involving repetitive phases and\nnon-sequential phase transitions. Our goal is to leverage deep learning for\nsurgical phase recognition in RAMIE to provide intraoperative support to\nsurgeons. To achieve this, we have developed a new surgical phase recognition\ndataset comprising 27 videos. Using this dataset, we conducted a comparative\nanalysis of state-of-the-art surgical phase recognition models. 
To more\neffectively capture the temporal dynamics of this complex procedure, we\ndeveloped a novel deep learning model featuring an encoder-decoder structure\nwith causal hierarchical attention, which demonstrates superior performance\ncompared to existing models.\n","authors":["Yiping Li","Romy van Jaarsveld","Ronald de Jong","Jasper Bongers","Gino Kuiper","Richard van Hillegersberg","Jelle Ruurda","Marcel Breeuwer","Yasmina Al Khalil"],"pdf_url":"https://arxiv.org/pdf/2412.04039v1.pdf","comment":"Accepted for presentation at the SPIE Medical Imaging Conference,\n 2025"},{"id":"http://arxiv.org/abs/2412.04037v1","updated":"2024-12-05T10:20:34Z","published":"2024-12-05T10:20:34Z","title":"INFP: Audio-Driven Interactive Head Generation in Dyadic Conversations","summary":" Imagine having a conversation with a socially intelligent agent. It can\nattentively listen to your words and offer visual and linguistic feedback\npromptly. This seamless interaction allows for multiple rounds of conversation\nto flow smoothly and naturally. In pursuit of actualizing it, we propose INFP,\na novel audio-driven head generation framework for dyadic interaction. Unlike\nprevious head generation works that only focus on single-sided communication,\nor require manual role assignment and explicit role switching, our model drives\nthe agent portrait dynamically alternates between speaking and listening state,\nguided by the input dyadic audio. Specifically, INFP comprises a Motion-Based\nHead Imitation stage and an Audio-Guided Motion Generation stage. The first\nstage learns to project facial communicative behaviors from real-life\nconversation videos into a low-dimensional motion latent space, and use the\nmotion latent codes to animate a static image. The second stage learns the\nmapping from the input dyadic audio to motion latent codes through denoising,\nleading to the audio-driven head generation in interactive scenarios. 
To\nfacilitate this line of research, we introduce DyConv, a large scale dataset of\nrich dyadic conversations collected from the Internet. Extensive experiments\nand visualizations demonstrate superior performance and effectiveness of our\nmethod. Project Page: https://grisoon.github.io/INFP/.\n","authors":["Yongming Zhu","Longhao Zhang","Zhengkun Rong","Tianshu Hu","Shuang Liang","Zhipeng Ge"],"pdf_url":"https://arxiv.org/pdf/2412.04037v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04030v1","updated":"2024-12-05T10:06:58Z","published":"2024-12-05T10:06:58Z","title":"Mask of truth: model sensitivity to unexpected regions of medical images","summary":" The development of larger models for medical image analysis has led to\nincreased performance. However, it also affected our ability to explain and\nvalidate model decisions. Models can use non-relevant parts of images, also\ncalled spurious correlations or shortcuts, to obtain high performance on\nbenchmark datasets but fail in real-world scenarios. In this work, we challenge\nthe capacity of convolutional neural networks (CNN) to classify chest X-rays\nand eye fundus images while masking out clinically relevant parts of the image.\nWe show that all models trained on the PadChest dataset, irrespective of the\nmasking strategy, are able to obtain an Area Under the Curve (AUC) above\nrandom. Moreover, the models trained on full images obtain good performance on\nimages without the region of interest (ROI), even superior to the one obtained\non images only containing the ROI. We also reveal a possible spurious\ncorrelation in the Chaksu dataset while the performances are more aligned with\nthe expectation of an unbiased model. We go beyond the performance analysis\nwith the usage of the explainability method SHAP and the analysis of\nembeddings. We asked a radiology resident to interpret chest X-rays under\ndifferent masking to complement our findings with clinical knowledge. 
Our code\nis available at https://github.com/TheoSourget/MMC_Masking and\nhttps://github.com/TheoSourget/MMC_Masking_EyeFundus\n","authors":["Théo Sourget","Michelle Hestbek-Møller","Amelia Jiménez-Sánchez","Jack Junchi Xu","Veronika Cheplygina"],"pdf_url":"https://arxiv.org/pdf/2412.04030v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04020v1","updated":"2024-12-05T09:56:24Z","published":"2024-12-05T09:56:24Z","title":"PriorMotion: Generative Class-Agnostic Motion Prediction with\n Raster-Vector Motion Field Priors","summary":" Reliable perception of spatial and motion information is crucial for safe\nautonomous navigation. Traditional approaches typically fall into two\ncategories: object-centric and class-agnostic methods. While object-centric\nmethods often struggle with missed detections, leading to inaccuracies in\nmotion prediction, many class-agnostic methods focus heavily on encoder design,\noften overlooking important priors like rigidity and temporal consistency,\nleading to suboptimal performance, particularly with sparse LiDAR data at\ndistant region. To address these issues, we propose $\\textbf{PriorMotion}$, a\ngenerative framework that extracts rasterized and vectorized scene\nrepresentations to model spatio-temporal priors. Our model comprises a BEV\nencoder, an Raster-Vector prior Encoder, and a Spatio-Temporal prior Generator,\nimproving both spatial and temporal consistency in motion prediction.\nAdditionally, we introduce a standardized evaluation protocol for\nclass-agnostic motion prediction. 
Experiments on the nuScenes dataset show that\nPriorMotion achieves state-of-the-art performance, with further validation on\nadvanced FMCW LiDAR confirming its robustness.\n","authors":["Kangan Qian","Xinyu Jiao","Yining Shi","Yunlong Wang","Ziang Luo","Zheng Fu","Kun Jiang","Diange Yang"],"pdf_url":"https://arxiv.org/pdf/2412.04020v1.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2411.05712v2","updated":"2024-12-05T09:39:07Z","published":"2024-11-08T17:13:53Z","title":"Scaling Laws for Task-Optimized Models of the Primate Visual Ventral\n Stream","summary":" When trained on large-scale object classification datasets, certain\nartificial neural network models begin to approximate core object recognition\n(COR) behaviors and neural response patterns in the primate visual ventral\nstream (VVS). While recent machine learning advances suggest that scaling model\nsize, dataset size, and compute resources improve task performance, the impact\nof scaling on brain alignment remains unclear. In this study, we explore\nscaling laws for modeling the primate VVS by systematically evaluating over 600\nmodels trained under controlled conditions on benchmarks spanning V1, V2, V4,\nIT and COR behaviors. We observe that while behavioral alignment continues to\nscale with larger models, neural alignment saturates. This observation remains\ntrue across model architectures and training datasets, even though models with\nstronger inductive bias and datasets with higher-quality images are more\ncompute-efficient. Increased scaling is especially beneficial for higher-level\nvisual areas, where small models trained on few samples exhibit only poor\nalignment. Finally, we develop a scaling recipe, indicating that a greater\nproportion of compute should be allocated to data samples over model size. 
Our\nresults suggest that while scaling alone might suffice for alignment with human\ncore object recognition behavior, it will not yield improved models of the\nbrain's visual ventral stream with current architectures and datasets,\nhighlighting the need for novel strategies in building brain-like models.\n","authors":["Abdulkadir Gokce","Martin Schrimpf"],"pdf_url":"https://arxiv.org/pdf/2411.05712v2.pdf","comment":"10 pages for the main paper, 23 pages in total. 7 main figures and 7\n supplementary figures. Code, model weights, and benchmark results can be\n accessed at https://github.com/epflneuroailab/scaling-primate-vvs - In\n version 2, Figure 7 and the related discussion are added, and the appendix is\n updated"},{"id":"http://arxiv.org/abs/2412.04000v1","updated":"2024-12-05T09:20:48Z","published":"2024-12-05T09:20:48Z","title":"IF-MDM: Implicit Face Motion Diffusion Model for High-Fidelity Realtime\n Talking Head Generation","summary":" We introduce a novel approach for high-resolution talking head generation\nfrom a single image and audio input. Prior methods using explicit face models,\nlike 3D morphable models (3DMM) and facial landmarks, often fall short in\ngenerating high-fidelity videos due to their lack of appearance-aware motion\nrepresentation. While generative approaches such as video diffusion models\nachieve high video quality, their slow processing speeds limit practical\napplication. Our proposed model, Implicit Face Motion Diffusion Model (IF-MDM),\nemploys implicit motion to encode human faces into appearance-aware compressed\nfacial latents, enhancing video generation. Although implicit motion lacks the\nspatial disentanglement of explicit models, which complicates alignment with\nsubtle lip movements, we introduce motion statistics to help capture\nfine-grained motion information. Additionally, our model provides motion\ncontrollability to optimize the trade-off between motion intensity and visual\nquality during inference. 
IF-MDM supports real-time generation of 512x512\nresolution videos at up to 45 frames per second (fps). Extensive evaluations\ndemonstrate its superior performance over existing diffusion and explicit face\nmodels. The code will be released publicly, available alongside supplementary\nmaterials. The video results can be found on\nhttps://bit.ly/ifmdm_supplementary.\n","authors":["Sejong Yang","Seoung Wug Oh","Yang Zhou","Seon Joo Kim"],"pdf_url":"https://arxiv.org/pdf/2412.04000v1.pdf","comment":"underreview in CVPR 2025"},{"id":"http://arxiv.org/abs/2412.03995v1","updated":"2024-12-05T09:15:21Z","published":"2024-12-05T09:15:21Z","title":"Blind Underwater Image Restoration using Co-Operational Regressor\n Networks","summary":" The exploration of underwater environments is essential for applications such\nas biological research, archaeology, and infrastructure maintenanceHowever,\nunderwater imaging is challenging due to the waters unique properties,\nincluding scattering, absorption, color distortion, and reduced visibility. To\naddress such visual degradations, a variety of approaches have been proposed\ncovering from basic signal processing methods to deep learning models; however,\nnone of them has proven to be consistently successful. In this paper, we\npropose a novel machine learning model, Co-Operational Regressor Networks\n(CoRe-Nets), designed to achieve the best possible underwater image\nrestoration. A CoRe-Net consists of two co-operating networks: the Apprentice\nRegressor (AR), responsible for image transformation, and the Master Regressor\n(MR), which evaluates the Peak Signal-to-Noise Ratio (PSNR) of the images\ngenerated by the AR and feeds it back to AR. CoRe-Nets are built on\nSelf-Organized Operational Neural Networks (Self-ONNs), which offer a superior\nlearning capability by modulating nonlinearity in kernel transformations. The\neffectiveness of the proposed model is demonstrated on the benchmark Large\nScale Underwater Image (LSUI) dataset. 
Leveraging the joint learning\ncapabilities of the two cooperating networks, the proposed model achieves the\nstate-of-art restoration performance with significantly reduced computational\ncomplexity and often presents such results that can even surpass the visual\nquality of the ground truth with a 2-pass application. Our results and the\noptimized PyTorch implementation of the proposed approach are now publicly\nshared on GitHub.\n","authors":["Ozer Can Devecioglu","Serkan Kiranyaz","Turker Ince","Moncef Gabbouj"],"pdf_url":"https://arxiv.org/pdf/2412.03995v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2412.03993v1","updated":"2024-12-05T09:14:50Z","published":"2024-12-05T09:14:50Z","title":"LaserGuider: A Laser Based Physical Backdoor Attack against Deep Neural\n Networks","summary":" Backdoor attacks embed hidden associations between triggers and targets in\ndeep neural networks (DNNs), causing them to predict the target when a trigger\nis present while maintaining normal behavior otherwise. Physical backdoor\nattacks, which use physical objects as triggers, are feasible but lack remote\ncontrol, temporal stealthiness, flexibility, and mobility. To overcome these\nlimitations, in this work, we propose a new type of backdoor triggers utilizing\nlasers that feature long-distance transmission and instant-imaging properties.\nBased on the laser-based backdoor triggers, we present a physical backdoor\nattack, called LaserGuider, which possesses remote control ability and achieves\nhigh temporal stealthiness, flexibility, and mobility. We also introduce a\nsystematic approach to optimize laser parameters for improving attack\neffectiveness. Our evaluation on traffic sign recognition DNNs, critical in\nautonomous vehicles, demonstrates that LaserGuider with three different\nlaser-based triggers achieves over 90% attack success rate with negligible\nimpact on normal inputs. 
Additionally, we release LaserMark, the first dataset\nof real world traffic signs stamped with physical laser spots, to support\nfurther research in backdoor attacks and defenses.\n","authors":["Yongjie Xu","Guangke Chen","Fu Song","Yuqi Chen"],"pdf_url":"https://arxiv.org/pdf/2412.03993v1.pdf","comment":"In Proceedings of the 23rd International Conference on Applied\n Cryptography and Network Security (ACNS), Munich, Germany, 23-26 June, 2025"},{"id":"http://arxiv.org/abs/2403.09401v3","updated":"2024-12-05T09:09:37Z","published":"2024-03-14T13:52:03Z","title":"Unsupervised Modality-Transferable Video Highlight Detection with\n Representation Activation Sequence Learning","summary":" Identifying highlight moments of raw video materials is crucial for improving\nthe efficiency of editing videos that are pervasive on internet platforms.\nHowever, the extensive work of manually labeling footage has created obstacles\nto applying supervised methods to videos of unseen categories. The absence of\nan audio modality that contains valuable cues for highlight detection in many\nvideos also makes it difficult to use multimodal strategies. In this paper, we\npropose a novel model with cross-modal perception for unsupervised highlight\ndetection. The proposed model learns representations with visual-audio level\nsemantics from image-audio pair data via a self-reconstruction task. To achieve\nunsupervised highlight detection, we investigate the latent representations of\nthe network and propose the representation activation sequence learning (RASL)\nmodule with k-point contrastive learning to learn significant representation\nactivations. To connect the visual modality with the audio modality, we use the\nsymmetric contrastive learning (SCL) module to learn the paired visual and\naudio representations. Furthermore, an auxiliary task of masked feature vector\nsequence (FVS) reconstruction is simultaneously conducted during pretraining\nfor representation enhancement. 
During inference, the cross-modal pretrained\nmodel can generate representations with paired visual-audio semantics given\nonly the visual modality. The RASL module is used to output the highlight\nscores. The experimental results show that the proposed framework achieves\nsuperior performance compared to other state-of-the-art approaches.\n","authors":["Tingtian Li","Zixun Sun","Xinyu Xiao"],"pdf_url":"https://arxiv.org/pdf/2403.09401v3.pdf","comment":"Accepted by IEEE Transactions on Image Processing, 2024"},{"id":"http://arxiv.org/abs/2411.15808v3","updated":"2024-12-05T09:08:01Z","published":"2024-11-24T12:30:12Z","title":"LRSAA: Large-scale Remote Sensing Image Target Recognition and Automatic\n Annotation","summary":" This paper presents a method for object recognition and automatic labeling in\nlarge-area remote sensing images called LRSAA. The method integrates YOLOv11\nand MobileNetV3-SSD object detection algorithms through ensemble learning to\nenhance model performance. Furthermore, it employs Poisson disk sampling\nsegmentation techniques and the EIOU metric to optimize the training and\ninference processes of segmented images, followed by the integration of\nresults. This approach not only reduces the demand for computational resources\nbut also achieves a good balance between accuracy and speed. The source code\nfor this project has been made publicly available on\nhttps://github.com/anaerovane/LRSAA.\n","authors":["Wuzheng Dong","Yujuan Zhu"],"pdf_url":"https://arxiv.org/pdf/2411.15808v3.pdf","comment":"arXiv admin note: text overlap with arXiv:2411.07802"},{"id":"http://arxiv.org/abs/2412.03986v1","updated":"2024-12-05T09:04:11Z","published":"2024-12-05T09:04:11Z","title":"UNCOVER: Unknown Class Object Detection for Autonomous Vehicles in\n Real-time","summary":" Autonomous driving (AD) operates in open-world scenarios, where encountering\nunknown objects is inevitable. 
However, standard object detectors trained on a\nlimited number of base classes tend to ignore any unknown objects, posing\npotential risks on the road. To address this, it is important to learn a\ngeneric rather than a class specific objectness from objects seen during\ntraining. We therefore introduce an occupancy prediction together with bounding\nbox regression. It learns to score the objectness by calculating the ratio of\nthe predicted area occupied by actual objects. To enhance its generalizability,\nwe increase the object diversity by exploiting data from other domains via\nMosaic and Mixup augmentation. The objects outside the AD training classes are\nclassified as a newly added out-of-distribution (OOD) class. Our solution\nUNCOVER, for UNknown Class Object detection for autonomous VEhicles in\nReal-time, excels at achieving both real-time detection and high recall of\nunknown objects on challenging AD benchmarks. To further attain very low false\npositive rates, particularly for close objects, we introduce a post-hoc\nfiltering step that utilizes geometric cues extracted from the depth map,\ntypically available within the AD system.\n","authors":["Lars Schmarje","Kaspar Sakman","Reinhard Koch","Dan Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.03986v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03982v1","updated":"2024-12-05T08:58:25Z","published":"2024-12-05T08:58:25Z","title":"Exploring Fully Convolutional Networks for the Segmentation of\n Hyperspectral Imaging Applied to Advanced Driver Assistance Systems","summary":" Advanced Driver Assistance Systems (ADAS) are designed with the main purpose\nof increasing the safety and comfort of vehicle occupants. Most of current\ncomputer vision-based ADAS perform detection and tracking tasks quite\nsuccessfully under regular conditions, but are not completely reliable,\nparticularly under adverse weather and changing lighting conditions, neither in\ncomplex situations with many overlapping objects. 
In this work we explore the\nuse of hyperspectral imaging (HSI) in ADAS on the assumption that the distinct\nnear infrared (NIR) spectral reflectances of different materials can help to\nbetter separate the objects in a driving scene. In particular, this paper\ndescribes some experimental results of the application of fully convolutional\nnetworks (FCN) to the image segmentation of HSI for ADAS applications. More\nspecifically, our aim is to investigate to what extent the spatial features\ncodified by convolutional filters can be helpful to improve the performance of\nHSI segmentation systems. With that aim, we use the HSI-Drive v1.1 dataset,\nwhich provides a set of labelled images recorded in real driving conditions\nwith a small-size snapshot NIR-HSI camera. Finally, we analyze the\nimplementability of such a HSI segmentation system by prototyping the developed\nFCN model together with the necessary hyperspectral cube preprocessing stage\nand characterizing its performance on an MPSoC.\n","authors":["Jon Gutiérrez-Zaballa","Koldo Basterretxea","Javier Echanobe","M. Victoria Martínez","Inés del Campo"],"pdf_url":"https://arxiv.org/pdf/2412.03982v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2411.19274"},{"id":"http://arxiv.org/abs/2304.02278v4","updated":"2024-12-05T08:55:34Z","published":"2023-04-05T07:50:16Z","title":"SCMM: Calibrating Cross-modal Representations for Text-Based Person\n Search","summary":" Text-Based Person Search (TBPS) is a crucial task that enables accurate\nretrieval of target individuals from large-scale galleries with only given\ntextual caption. For cross-modal TBPS tasks, it is critical to obtain\nwell-distributed representation in the common embedding space to reduce the\ninter-modal gap. Furthermore, learning detailed image-text correspondences is\nessential to discriminate similar targets and enable fine-grained search. 
To\naddress these challenges, we present a simple yet effective method named Sew\nCalibration and Masked Modeling (SCMM) that calibrates cross-modal\nrepresentations by learning compact and well-aligned embeddings. SCMM is\ndistinguished by two novel losses to provide fine-grained cross-modal\nrepresentations: 1) a Sew calibration loss that takes the quality of textual\ncaptions as guidance and aligns features between image and text modalities, and\n2) a Masked Caption Modeling (MCM) loss that leverages a masked caption\nprediction task to establish detailed and generic relationships between textual\nand visual parts. The dual-pronged strategy refines feature alignment and\nenriches cross-modal correspondences, enabling the accurate distinction of\nsimilar individuals. Consequently, its streamlined dual-encoder architecture\navoids complex branches and interactions and facilitates high-speed inference\nsuitable for real-time requirements. Consequently, high-speed inference is\nachieved, which is essential for resource-limited applications often demanding\nreal-time processing. Extensive experiments on three popular TBPS benchmarks\ndemonstrate the superiority of SCMM, achieving top results with 73.81%, 64.25%,\nand 57.35% Rank-1 accuracy on CUHK-PEDES, ICFG-PEDES, and RSTPReID,\nrespectively. We hope SCMM's scalable and cost-effective design will serve as a\nstrong baseline and facilitate future research in this field.\n","authors":["Jing Liu","Donglai Wei","Yang Liu","Sipeng Zhang","Tong Yang","Victor C. M. 
Leung"],"pdf_url":"https://arxiv.org/pdf/2304.02278v4.pdf","comment":"10 pages, 7 figures"},{"id":"http://arxiv.org/abs/2412.03214v2","updated":"2024-12-05T08:49:02Z","published":"2024-12-04T11:05:01Z","title":"Continual Low-Rank Scaled Dot-product Attention","summary":" Transformers are widely used for their ability to capture data relations in\nsequence processing, with great success for a wide range of static tasks.\nHowever, the computational and memory footprint of their main component, i.e.,\nthe Scaled Dot-product Attention, is commonly overlooked. This makes their\nadoption in applications involving stream data processing with constraints in\nresponse latency, computational and memory resources infeasible. Some works\nhave proposed methods to lower the computational cost of transformers, i.e.\nlow-rank approximations, sparsity in attention, and efficient formulations for\nContinual Inference. In this paper, we introduce a new formulation of the\nScaled Dot-product Attention based on the Nystr\\\"om approximation that is\nsuitable for Continual Inference. 
In experiments on Online Audio Classification\nand Online Action Detection tasks, the proposed Continual Scaled Dot-product\nAttention can lower the number of operations by up to three orders of magnitude\ncompared to the original Transformers while retaining the predictive\nperformance of competing models.\n","authors":["Ginés Carreto Picón","Illia Oleksiienko","Lukas Hedegaard","Arian Bakhtiarnia","Alexandros Iosifidis"],"pdf_url":"https://arxiv.org/pdf/2412.03214v2.pdf","comment":"11 pages, 7 figures"},{"id":"http://arxiv.org/abs/2410.01262v2","updated":"2024-12-05T08:44:53Z","published":"2024-10-02T06:16:06Z","title":"Improving Fine-Grained Control via Aggregation of Multiple Diffusion\n Models","summary":" While many diffusion models perform well when controlling for particular\naspect among style, character, and interaction, they struggle with fine-grained\ncontrol due to dataset limitations and intricate model architecture design.\nThis paper introduces a novel algorithm, Aggregation of Multiple Diffusion\nModels (AMDM), which synthesizes features from multiple diffusion models into a\nspecified model, activating specific features for fine-grained control.\nExperimental results demonstrate that AMDM significantly improves fine-grained\ncontrol without training, proving its effectiveness. Additionally, it reveals\nthat diffusion models initially focus on features such as position, attributes,\nand style, with later stages improving generation quality and consistency. AMDM\noffers a new perspective for tackling the challenges of fine-grained\nconditional control generation in diffusion models: We can fully utilize\nexisting or develop new conditional diffusion models that control specific\naspects, and then aggregate them using AMDM algorithm. This eliminates the need\nfor constructing complex datasets, designing intricate model architectures, and\nincurring high training costs. 
Code is available at:\nhttps://github.com/Hammour-steak/AMDM.\n","authors":["Conghan Yue","Zhengwei Peng","Shiyan Du","Zhi Ji","Chuangjian Cai","Le Wan","Dongyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.01262v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.19156v2","updated":"2024-12-05T08:39:53Z","published":"2024-11-28T13:55:06Z","title":"LoRA of Change: Learning to Generate LoRA for the Editing Instruction\n from A Single Before-After Image Pair","summary":" In this paper, we propose the LoRA of Change (LoC) framework for image\nediting with visual instructions, i.e., before-after image pairs. Compared to\nthe ambiguities, insufficient specificity, and diverse interpretations of\nnatural language, visual instructions can accurately reflect users' intent.\nBuilding on the success of LoRA in text-based image editing and generation, we\ndynamically learn an instruction-specific LoRA to encode the \"change\" in a\nbefore-after image pair, enhancing the interpretability and reusability of our\nmodel. Furthermore, generalizable models for image editing with visual\ninstructions typically require quad data, i.e., a before-after image pair,\nalong with query and target images. Due to the scarcity of such quad data,\nexisting models are limited to a narrow range of visual instructions. To\novercome this limitation, we introduce the LoRA Reverse optimization technique,\nenabling large-scale training with paired data alone. 
Extensive qualitative and\nquantitative experiments demonstrate that our model produces high-quality\nimages that align with user intent and support a broad spectrum of real-world\nvisual instructions.\n","authors":["Xue Song","Jiequan Cui","Hanwang Zhang","Jiaxin Shi","Jingjing Chen","Chi Zhang","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2411.19156v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03969v1","updated":"2024-12-05T08:38:01Z","published":"2024-12-05T08:38:01Z","title":"HyperDefect-YOLO: Enhance YOLO with HyperGraph Computation for\n Industrial Defect Detection","summary":" In the manufacturing industry, defect detection is an essential but\nchallenging task aiming to detect defects generated in the process of\nproduction. Though traditional YOLO models presents a good performance in\ndefect detection, they still have limitations in capturing high-order feature\ninterrelationships, which hurdles defect detection in the complex scenarios and\nacross the scales. To this end, we introduce hypergraph computation into YOLO\nframework, dubbed HyperDefect-YOLO (HD-YOLO), to improve representative ability\nand semantic exploitation. HD-YOLO consists of Defect Aware Module (DAM) and\nMixed Graph Network (MGNet) in the backbone, which specialize for perception\nand extraction of defect features. To effectively aggregate multi-scale\nfeatures, we propose HyperGraph Aggregation Network (HGANet) which combines\nhypergraph and attention mechanism to aggregate multi-scale features.\nCross-Scale Fusion (CSF) is proposed to adaptively fuse and handle features\ninstead of simple concatenation and convolution. Finally, we propose Semantic\nAware Module (SAM) in the neck to enhance semantic exploitation for accurately\nlocalizing defects with different sizes in the disturbed background. HD-YOLO\nundergoes rigorous evaluation on public HRIPCB and NEU-DET datasets with\nsignificant improvements compared to state-of-the-art methods. 
We also evaluate\nHD-YOLO on self-built MINILED dataset collected in real industrial scenarios to\ndemonstrate the effectiveness of the proposed method. The source codes are at\nhttps://github.com/Jay-zzcoder/HD-YOLO.\n","authors":["Zuo Zuo","Jiahao Dong","Yue Gao","Zongze Wu"],"pdf_url":"https://arxiv.org/pdf/2412.03969v1.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2208.14784v2","updated":"2024-12-05T15:07:54Z","published":"2022-08-31T11:45:21Z","title":"Practical Operator Sketching Framework for Accelerating Iterative\n Data-Driven Solutions in Inverse Problems","summary":" We propose a new operator-sketching paradigm for designing efficient\niterative data-driven reconstruction (IDR) schemes, e.g. Plug-and-Play\nalgorithms and deep unrolling networks. These IDR schemes are currently the\nstate-of-the-art solutions for imaging inverse problems. However, for\nhigh-dimensional imaging tasks, especially X-ray CT and MRI imaging, these IDR\nschemes typically become inefficient both in terms of computation, due to the\nneed of computing multiple times the high-dimensional forward and adjoint\noperators. In this work, we explore and propose a universal dimensionality\nreduction framework for accelerating IDR schemes in solving imaging inverse\nproblems, based on leveraging the sketching techniques from stochastic\noptimization. Using this framework, we derive a number of accelerated IDR\nschemes, such as the plug-and-play multi-stage sketched gradient (PnP-MS2G) and\nsketching-based primal-dual (LSPD and Sk-LSPD) deep unrolling networks.\nMeanwhile, for fully accelerating PnP schemes when the denoisers are\ncomputationally expensive, we provide novel stochastic lazy denoising schemes\n(Lazy-PnP and Lazy-PnP-EQ), leveraging the ProxSkip scheme in optimization and\nequivariant image denoisers, which can massively accelerate the PnP algorithms\nwith improved practicality. 
We provide theoretical analysis for recovery\nguarantees of instances of the proposed framework. Our numerical experiments on\nnatural image processing and tomographic image reconstruction demonstrate the\nremarkable effectiveness of our sketched IDR schemes.\n","authors":["Junqi Tang","Guixian Xu","Subhadip Mukherjee","Carola-Bibiane Schönlieb"],"pdf_url":"https://arxiv.org/pdf/2208.14784v2.pdf","comment":null}],"Artificial Intelligence":[{"id":"http://arxiv.org/abs/2412.04471v1","updated":"2024-12-05T18:59:57Z","published":"2024-12-05T18:59:57Z","title":"PaintScene4D: Consistent 4D Scene Generation from Text Prompts","summary":" Recent advances in diffusion models have revolutionized 2D and 3D content\ncreation, yet generating photorealistic dynamic 4D scenes remains a significant\nchallenge. Existing dynamic 4D generation methods typically rely on distilling\nknowledge from pre-trained 3D generative models, often fine-tuned on synthetic\nobject datasets. Consequently, the resulting scenes tend to be object-centric\nand lack photorealism. While text-to-video models can generate more realistic\nscenes with motion, they often struggle with spatial understanding and provide\nlimited control over camera viewpoints during rendering. To address these\nlimitations, we present PaintScene4D, a novel text-to-4D scene generation\nframework that departs from conventional multi-view generative models in favor\nof a streamlined architecture that harnesses video generative models trained on\ndiverse real-world datasets. Our method first generates a reference video using\na video generation model, and then employs a strategic camera array selection\nfor rendering. We apply a progressive warping and inpainting technique to\nensure both spatial and temporal consistency across multiple viewpoints.\nFinally, we optimize multi-view images using a dynamic renderer, enabling\nflexible camera control based on user preferences. 
Adopting a training-free\narchitecture, our PaintScene4D efficiently produces realistic 4D scenes that\ncan be viewed from arbitrary trajectories. The code will be made publicly\navailable. Our project page is at https://paintscene4d.github.io/\n","authors":["Vinayak Gupta","Yunze Man","Yu-Xiong Wang"],"pdf_url":"https://arxiv.org/pdf/2412.04471v1.pdf","comment":"Project page: https://paintscene4d.github.io/"},{"id":"http://arxiv.org/abs/2412.04469v1","updated":"2024-12-05T18:59:55Z","published":"2024-12-05T18:59:55Z","title":"QUEEN: QUantized Efficient ENcoding of Dynamic Gaussians for Streaming\n Free-viewpoint Videos","summary":" Online free-viewpoint video (FVV) streaming is a challenging problem, which\nis relatively under-explored. It requires incremental on-the-fly updates to a\nvolumetric representation, fast training and rendering to satisfy real-time\nconstraints and a small memory footprint for efficient transmission. If\nachieved, it can enhance user experience by enabling novel applications, e.g.,\n3D video conferencing and live volumetric video broadcast, among others. In\nthis work, we propose a novel framework for QUantized and Efficient ENcoding\n(QUEEN) for streaming FVV using 3D Gaussian Splatting (3D-GS). QUEEN directly\nlearns Gaussian attribute residuals between consecutive frames at each\ntime-step without imposing any structural constraints on them, allowing for\nhigh quality reconstruction and generalizability. To efficiently store the\nresiduals, we further propose a quantization-sparsity framework, which contains\na learned latent-decoder for effectively quantizing attribute residuals other\nthan Gaussian positions and a learned gating module to sparsify position\nresiduals. We propose to use the Gaussian viewspace gradient difference vector\nas a signal to separate the static and dynamic content of the scene. It acts as\na guide for effective sparsity learning and speeds up training. 
On diverse FVV\nbenchmarks, QUEEN outperforms the state-of-the-art online FVV methods on all\nmetrics. Notably, for several highly dynamic scenes, it reduces the model size\nto just 0.7 MB per frame while training in under 5 sec and rendering at 350\nFPS. Project website is at https://research.nvidia.com/labs/amri/projects/queen\n","authors":["Sharath Girish","Tianye Li","Amrita Mazumdar","Abhinav Shrivastava","David Luebke","Shalini De Mello"],"pdf_url":"https://arxiv.org/pdf/2412.04469v1.pdf","comment":"Accepted at NeurIPS 2024, Project website:\n https://research.nvidia.com/labs/amri/projects/queen"},{"id":"http://arxiv.org/abs/2412.04467v1","updated":"2024-12-05T18:59:53Z","published":"2024-12-05T18:59:53Z","title":"VisionZip: Longer is Better but Not Necessary in Vision Language Models","summary":" Recent advancements in vision-language models have enhanced performance by\nincreasing the length of visual tokens, making them much longer than text\ntokens and significantly raising computational costs. However, we observe that\nthe visual tokens generated by popular vision encoders, such as CLIP and\nSigLIP, contain significant redundancy. To address this, we introduce\nVisionZip, a simple yet effective method that selects a set of informative\ntokens for input to the language model, reducing visual token redundancy and\nimproving efficiency while maintaining model performance. The proposed\nVisionZip can be widely applied to image and video understanding tasks and is\nwell-suited for multi-turn dialogues in real-world scenarios, where previous\nmethods tend to underperform. Experimental results show that VisionZip\noutperforms the previous state-of-the-art method by at least 5% performance\ngains across nearly all settings. Moreover, our method significantly enhances\nmodel inference speed, improving the prefilling time by 8x and enabling the\nLLaVA-Next 13B model to infer faster than the LLaVA-Next 7B model while\nachieving better results. 
Furthermore, we analyze the causes of this redundancy\nand encourage the community to focus on extracting better visual features\nrather than merely increasing token length. Our code is available at\nhttps://github.com/dvlab-research/VisionZip .\n","authors":["Senqiao Yang","Yukang Chen","Zhuotao Tian","Chengyao Wang","Jingyao Li","Bei Yu","Jiaya Jia"],"pdf_url":"https://arxiv.org/pdf/2412.04467v1.pdf","comment":"2 columns, 28 pages, 15 figures, 18 tables"},{"id":"http://arxiv.org/abs/2412.04455v1","updated":"2024-12-05T18:58:27Z","published":"2024-12-05T18:58:27Z","title":"Code-as-Monitor: Constraint-aware Visual Programming for Reactive and\n Proactive Robotic Failure Detection","summary":" Automatic detection and prevention of open-set failures are crucial in\nclosed-loop robotic systems. Recent studies often struggle to simultaneously\nidentify unexpected failures reactively after they occur and prevent\nforeseeable ones proactively. To this end, we propose Code-as-Monitor (CaM), a\nnovel paradigm leveraging the vision-language model (VLM) for both open-set\nreactive and proactive failure detection. The core of our method is to\nformulate both tasks as a unified set of spatio-temporal constraint\nsatisfaction problems and use VLM-generated code to evaluate them for real-time\nmonitoring. To enhance the accuracy and efficiency of monitoring, we further\nintroduce constraint elements that abstract constraint-related entities or\ntheir parts into compact geometric elements. This approach offers greater\ngenerality, simplifies tracking, and facilitates constraint-aware visual\nprogramming by leveraging these elements as visual prompts. Experiments show\nthat CaM achieves a 28.7% higher success rate and reduces execution time by\n31.8% under severe disturbances compared to baselines across three simulators\nand a real-world setting. 
Moreover, CaM can be integrated with open-loop\ncontrol policies to form closed-loop systems, enabling long-horizon tasks in\ncluttered scenes with dynamic environments.\n","authors":["Enshen Zhou","Qi Su","Cheng Chi","Zhizheng Zhang","Zhongyuan Wang","Tiejun Huang","Lu Sheng","He Wang"],"pdf_url":"https://arxiv.org/pdf/2412.04455v1.pdf","comment":"Project page: https://zhoues.github.io/Code-as-Monitor/"},{"id":"http://arxiv.org/abs/2412.04447v1","updated":"2024-12-05T18:57:23Z","published":"2024-12-05T18:57:23Z","title":"EgoPlan-Bench2: A Benchmark for Multimodal Large Language Model Planning\n in Real-World Scenarios","summary":" The advent of Multimodal Large Language Models, leveraging the power of Large\nLanguage Models, has recently demonstrated superior multimodal understanding\nand reasoning abilities, heralding a new era for artificial general\nintelligence. However, achieving AGI necessitates more than just comprehension\nand reasoning. A crucial capability required is effective planning in diverse\nscenarios, which involves making reasonable decisions based on complex\nenvironments to solve real-world problems. Despite its importance, the planning\nabilities of current MLLMs in varied scenarios remain underexplored. In this\npaper, we introduce EgoPlan-Bench2, a rigorous and comprehensive benchmark\ndesigned to assess the planning capabilities of MLLMs across a wide range of\nreal-world scenarios. EgoPlan-Bench2 encompasses everyday tasks spanning 4\nmajor domains and 24 detailed scenarios, closely aligned with human daily life.\nEgoPlan-Bench2 is constructed through a semi-automatic process utilizing\negocentric videos, complemented by manual verification. Grounded in a\nfirst-person perspective, it mirrors the way humans approach problem-solving in\neveryday life. We evaluate 21 competitive MLLMs and provide an in-depth\nanalysis of their limitations, revealing that they face significant challenges\nin real-world planning. 
To further improve the planning proficiency of current\nMLLMs, we propose a training-free approach using multimodal Chain-of-Thought\n(CoT) prompting through investigating the effectiveness of various multimodal\nprompts in complex planning. Our approach enhances the performance of GPT-4V by\n10.24 on EgoPlan-Bench2 without additional training. Our work not only sheds\nlight on the current limitations of MLLMs in planning, but also provides\ninsights for future enhancements in this critical area. We have made data and\ncode available at https://qiulu66.github.io/egoplanbench2/.\n","authors":["Lu Qiu","Yuying Ge","Yi Chen","Yixiao Ge","Ying Shan","Xihui Liu"],"pdf_url":"https://arxiv.org/pdf/2412.04447v1.pdf","comment":"Code & data are available at:\n https://qiulu66.github.io/egoplanbench2/"},{"id":"http://arxiv.org/abs/2412.04445v1","updated":"2024-12-05T18:57:04Z","published":"2024-12-05T18:57:04Z","title":"Moto: Latent Motion Token as the Bridging Language for Robot\n Manipulation","summary":" Recent developments in Large Language Models pre-trained on extensive corpora\nhave shown significant success in various natural language processing tasks\nwith minimal fine-tuning. This success offers new promise for robotics, which\nhas long been constrained by the high cost of action-labeled data. We ask:\ngiven the abundant video data containing interaction-related knowledge\navailable as a rich \"corpus\", can a similar generative pre-training approach be\neffectively applied to enhance robot learning? The key challenge is to identify\nan effective representation for autoregressive pre-training that benefits robot\nmanipulation tasks. Inspired by the way humans learn new skills through\nobserving dynamic environments, we propose that effective robotic learning\nshould emphasize motion-related knowledge, which is closely tied to low-level\nactions and is hardware-agnostic, facilitating the transfer of learned motions\nto actual robot actions. 
To this end, we introduce Moto, which converts video\ncontent into latent Motion Token sequences by a Latent Motion Tokenizer,\nlearning a bridging \"language\" of motion from videos in an unsupervised manner.\nWe pre-train Moto-GPT through motion token autoregression, enabling it to\ncapture diverse visual motion knowledge. After pre-training, Moto-GPT\ndemonstrates the promising ability to produce semantically interpretable motion\ntokens, predict plausible motion trajectories, and assess trajectory\nrationality through output likelihood. To transfer learned motion priors to\nreal robot actions, we implement a co-fine-tuning strategy that seamlessly\nbridges latent motion token prediction and real robot control. Extensive\nexperiments show that the fine-tuned Moto-GPT exhibits superior robustness and\nefficiency on robot manipulation benchmarks, underscoring its effectiveness in\ntransferring knowledge from video data to downstream visual manipulation tasks.\n","authors":["Yi Chen","Yuying Ge","Yizhuo Li","Yixiao Ge","Mingyu Ding","Ying Shan","Xihui Liu"],"pdf_url":"https://arxiv.org/pdf/2412.04445v1.pdf","comment":"Project released at: https://chenyi99.github.io/moto/"},{"id":"http://arxiv.org/abs/2409.03669v2","updated":"2024-12-05T18:56:04Z","published":"2024-09-05T16:23:07Z","title":"A method to benchmark high-dimensional process drift detection","summary":" Process curves are multivariate finite time series data coming from\nmanufacturing processes. This paper studies machine learning that detect drifts\nin process curve datasets. A theoretic framework to synthetically generate\nprocess curves in a controlled way is introduced in order to benchmark machine\nlearning algorithms for process drift detection. 
An evaluation score, called\nthe temporal area under the curve, is introduced, which allows to quantify how\nwell machine learning models unveil curves belonging to drift segments.\nFinally, a benchmark study comparing popular machine learning approaches on\nsynthetic data generated with the introduced framework is presented that shows\nthat existing algorithms often struggle with datasets containing multiple drift\nsegments.\n","authors":["Edgar Wolf","Tobias Windisch"],"pdf_url":"https://arxiv.org/pdf/2409.03669v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04426v1","updated":"2024-12-05T18:51:18Z","published":"2024-12-05T18:51:18Z","title":"Marvel: Accelerating Safe Online Reinforcement Learning with Finetuned\n Offline Policy","summary":" The high costs and risks involved in extensive environment interactions\nhinder the practical application of current online safe reinforcement learning\n(RL) methods. While offline safe RL addresses this by learning policies from\nstatic datasets, the performance therein is usually limited due to reliance on\ndata quality and challenges with out-of-distribution (OOD) actions. Inspired by\nrecent successes in offline-to-online (O2O) RL, it is crucial to explore\nwhether offline safe RL can be leveraged to facilitate faster and safer online\npolicy learning, a direction that has yet to be fully investigated. To fill\nthis gap, we first demonstrate that naively applying existing O2O algorithms\nfrom standard RL would not work well in the safe RL setting due to two unique\nchallenges: \\emph{erroneous Q-estimations}, resulted from offline-online\nobjective mismatch and offline cost sparsity, and \\emph{Lagrangian mismatch},\nresulted from difficulties in aligning Lagrange multipliers between offline and\nonline policies. 
To address these challenges, we introduce \\textbf{Marvel}, a\nnovel framework for O2O safe RL, comprising two key components that work in\nconcert: \\emph{Value Pre-Alignment} to align the Q-functions with the\nunderlying truth before online learning, and \\emph{Adaptive PID Control} to\neffectively adjust the Lagrange multipliers during online finetuning. Extensive\nexperiments demonstrate that Marvel significantly outperforms existing\nbaselines in both reward maximization and safety constraint satisfaction. By\nintroducing the first policy-finetuning based framework for O2O safe RL, which\nis compatible with many offline and online safe RL methods, our work has the\ngreat potential to advance the field towards more efficient and practical safe\nRL solutions.\n","authors":["Keru Chen","Honghao Wei","Zhigang Deng","Sen Lin"],"pdf_url":"https://arxiv.org/pdf/2412.04426v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04424v1","updated":"2024-12-05T18:50:39Z","published":"2024-12-05T18:50:39Z","title":"Florence-VL: Enhancing Vision-Language Models with Generative Vision\n Encoder and Depth-Breadth Fusion","summary":" We present Florence-VL, a new family of multimodal large language models\n(MLLMs) with enriched visual representations produced by Florence-2, a\ngenerative vision foundation model. Unlike the widely used CLIP-style vision\ntransformer trained by contrastive learning, Florence-2 can capture different\nlevels and aspects of visual features, which are more versatile to be adapted\nto diverse downstream tasks. We propose a novel feature-fusion architecture and\nan innovative training recipe that effectively integrates Florence-2's visual\nfeatures into pretrained LLMs, such as Phi 3.5 and LLama 3. In particular, we\npropose \"depth-breath fusion (DBFusion)\" to fuse the visual features extracted\nfrom different depths and under multiple prompts. 
Our model training is\ncomposed of end-to-end pretraining of the whole model followed by finetuning of\nthe projection layer and the LLM, on a carefully designed recipe of diverse\nopen-source datasets that include high-quality image captions and\ninstruction-tuning pairs. Our quantitative analysis and visualization of\nFlorence-VL's visual features show its advantages over popular vision encoders\non vision-language alignment, where the enriched depth and breath play\nimportant roles. Florence-VL achieves significant improvements over existing\nstate-of-the-art MLLMs across various multi-modal and vision-centric benchmarks\ncovering general VQA, perception, hallucination, OCR, Chart,\nknowledge-intensive understanding, etc. To facilitate future research, our\nmodels and the complete training recipe are open-sourced.\nhttps://github.com/JiuhaiChen/Florence-VL\n","authors":["Jiuhai Chen","Jianwei Yang","Haiping Wu","Dianqi Li","Jianfeng Gao","Tianyi Zhou","Bin Xiao"],"pdf_url":"https://arxiv.org/pdf/2412.04424v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.07384v2","updated":"2024-12-05T18:47:47Z","published":"2024-03-12T07:45:33Z","title":"SmallToLarge (S2L): Scalable Data Selection for Fine-tuning Large\n Language Models by Summarizing Training Trajectories of Small Models","summary":" Despite the effectiveness of data selection for large language models (LLMs)\nduring pretraining and instruction fine-tuning phases, improving data\nefficiency in supervised fine-tuning (SFT) for specialized domains poses\nsignificant challenges due to the complexity of fine-tuning data. To bridge\nthis gap, we introduce an effective and scalable data selection method for SFT,\nSmallToLarge (S2L), which leverages training trajectories from small models to\nguide the data selection for larger models. 
We demonstrate through extensive\nexperiments that S2L significantly improves data efficiency in SFT for\nmathematical problem-solving, reducing the training data to just 11% of the\noriginal MathInstruct dataset (Yue et al., 2023) to match full dataset\nperformance while outperforming state-of-the-art data selection algorithms by\nan average of 4.7% across 6 in- and out-domain evaluation datasets. Remarkably,\nselecting only 50K data for SFT, S2L achieves a 32.7% accuracy on the most\nchallenging MATH (Hendrycks et al., 2021) benchmark, improving Phi-2 (Li et\nal., 2023b) by 16.6%. In clinical text summarization on the MIMIC-III dataset\n(Johnson et al., 2016), S2L again outperforms training on the full dataset\nusing only 50% of the data. Notably, S2L can perform data selection using a\nreference model 40x smaller than the target model, proportionally reducing the\ncost of data selection.\n","authors":["Yu Yang","Siddhartha Mishra","Jeffrey N Chiang","Baharan Mirzasoleiman"],"pdf_url":"https://arxiv.org/pdf/2403.07384v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.01339v2","updated":"2024-12-05T18:43:25Z","published":"2024-12-02T10:06:57Z","title":"Negative Token Merging: Image-based Adversarial Feature Guidance","summary":" Text-based adversarial guidance using a negative prompt has emerged as a\nwidely adopted approach to steer diffusion models away from producing undesired\nconcepts. While useful, performing adversarial guidance using text alone can be\ninsufficient to capture complex visual concepts or avoid specific visual\nelements like copyrighted characters. In this paper, for the first time we\nexplore an alternate modality in this direction by performing adversarial\nguidance directly using visual features from a reference image or other images\nin a batch. 
We introduce negative token merging (NegToMe), a simple but\neffective training-free approach which performs adversarial guidance through\nimages by selectively pushing apart matching visual features between reference\nand generated images during the reverse diffusion process. By simply adjusting\nthe used reference, NegToMe enables a diverse range of applications. Notably,\nwhen using other images in same batch as reference, we find that NegToMe\nsignificantly enhances output diversity (e.g., racial, gender, visual) by\nguiding features of each image away from others. Similarly, when used w.r.t.\ncopyrighted reference images, NegToMe reduces visual similarity to copyrighted\ncontent by 34.57%. NegToMe is simple to implement using just few-lines of code,\nuses only marginally higher (<4%) inference time and is compatible with\ndifferent diffusion architectures, including those like Flux, which don't\nnatively support the use of a negative prompt. Code is available at\nhttps://negtome.github.io\n","authors":["Jaskirat Singh","Lindsey Li","Weijia Shi","Ranjay Krishna","Yejin Choi","Pang Wei Koh","Michael F. Cohen","Stephen Gould","Liang Zheng","Luke Zettlemoyer"],"pdf_url":"https://arxiv.org/pdf/2412.01339v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04416v1","updated":"2024-12-05T18:42:29Z","published":"2024-12-05T18:42:29Z","title":"FedDUAL: A Dual-Strategy with Adaptive Loss and Dynamic Aggregation for\n Mitigating Data Heterogeneity in Federated Learning","summary":" Federated Learning (FL) marks a transformative approach to distributed model\ntraining by combining locally optimized models from various clients into a\nunified global model. While FL preserves data privacy by eliminating\ncentralized storage, it encounters significant challenges such as performance\ndegradation, slower convergence, and reduced robustness of the global model due\nto the heterogeneity in client data distributions. 
Among the various forms of\ndata heterogeneity, label skew emerges as a particularly formidable and\nprevalent issue, especially in domains such as image classification. To address\nthese challenges, we begin with comprehensive experiments to pinpoint the\nunderlying issues in the FL training process. Based on our findings, we then\nintroduce an innovative dual-strategy approach designed to effectively resolve\nthese issues. First, we introduce an adaptive loss function for client-side\ntraining, meticulously crafted to preserve previously acquired knowledge while\nmaintaining an optimal equilibrium between local optimization and global model\ncoherence. Secondly, we develop a dynamic aggregation strategy for aggregating\nclient models at the server. This approach adapts to each client's unique\nlearning patterns, effectively addressing the challenges of diverse data across\nthe network. Our comprehensive evaluation, conducted across three diverse\nreal-world datasets, coupled with theoretical convergence guarantees,\ndemonstrates the superior efficacy of our method compared to several\nestablished state-of-the-art approaches.\n","authors":["Pranab Sahoo","Ashutosh Tripathi","Sriparna Saha","Samrat Mondal"],"pdf_url":"https://arxiv.org/pdf/2412.04416v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04415v1","updated":"2024-12-05T18:38:30Z","published":"2024-12-05T18:38:30Z","title":"Targeting the Core: A Simple and Effective Method to Attack RAG-based\n Agents via Direct LLM Manipulation","summary":" AI agents, powered by large language models (LLMs), have transformed\nhuman-computer interactions by enabling seamless, natural, and context-aware\ncommunication. While these advancements offer immense utility, they also\ninherit and amplify inherent safety risks such as bias, fairness,\nhallucinations, privacy breaches, and a lack of transparency. This paper\ninvestigates a critical vulnerability: adversarial attacks targeting the LLM\ncore within AI agents. 
Specifically, we test the hypothesis that a deceptively\nsimple adversarial prefix, such as \\textit{Ignore the document}, can compel\nLLMs to produce dangerous or unintended outputs by bypassing their contextual\nsafeguards. Through experimentation, we demonstrate a high attack success rate\n(ASR), revealing the fragility of existing LLM defenses. These findings\nemphasize the urgent need for robust, multi-layered security measures tailored\nto mitigate vulnerabilities at the LLM level and within broader agent-based\narchitectures.\n","authors":["Xuying Li","Zhuo Li","Yuji Kosuga","Yasuhiro Yoshida","Victor Bian"],"pdf_url":"https://arxiv.org/pdf/2412.04415v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12924v3","updated":"2024-12-05T18:35:26Z","published":"2024-09-04T03:17:19Z","title":"WaveletGPT: Wavelets Meet Large Language Models","summary":" Large Language Models (LLMs) have ushered in a new wave of artificial\nintelligence advancements impacting every scientific field and discipline. They\nare trained on a simple objective: to predict the next token given the previous\ncontext. We live in a world where most of the data around us, e.g., text,\naudio, and music, has a multi-scale structure associated with it. This paper\ninfuses LLMs with traditional signal processing ideas, namely wavelets, during\npre-training to take advantage of the structure. Without adding \\textbf{any\nextra parameters} to a GPT-style LLM architecture, we achieve the same\npre-training performance almost twice as fast in text, raw audio, and symbolic\nmusic. This is achieved by imposing a structure on intermediate embeddings.\nWhen trained for the same number of training steps, we achieve significant\ngains in performance, which is comparable to pre-training a larger neural\narchitecture. Our architecture allows every next token prediction access to\nintermediate embeddings at different temporal resolutions in every Transformer\ndecoder block. 
This work will hopefully pave the way for incorporating\nmulti-rate signal processing ideas into traditional LLM pre-training. Further,\nwe showcase pushing model performance by improving internal structure instead\nof just going after scale.\n","authors":["Prateek Verma"],"pdf_url":"https://arxiv.org/pdf/2409.12924v3.pdf","comment":"16 pages, 4 figures"},{"id":"http://arxiv.org/abs/2412.04403v1","updated":"2024-12-05T18:21:49Z","published":"2024-12-05T18:21:49Z","title":"Establishing Task Scaling Laws via Compute-Efficient Model Ladders","summary":" We develop task scaling laws and model ladders to predict the individual task\nperformance of pretrained language models (LMs) in the overtrained setting.\nStandard power laws for language modeling loss cannot accurately model task\nperformance. Therefore, we leverage a two-step prediction approach: first use\nmodel and data size to predict a task-specific loss, and then use this task\nloss to predict task performance. We train a set of small-scale \"ladder\"\nmodels, collect data points to fit the parameterized functions of the two\nprediction steps, and make predictions for two target models: a 7B model\ntrained to 4T tokens and a 13B model trained to 5T tokens. Training the ladder\nmodels only costs 1% of the compute used for the target models. On four\nmultiple-choice tasks written in ranked classification format, we can predict\nthe accuracy of both target models within 2 points of absolute error. We have\nhigher prediction error on four other tasks (average absolute error 6.9) and\nfind that these are often tasks with higher variance in task metrics. We also\nfind that using less compute to train fewer ladder models tends to deteriorate\npredictions. 
Finally, we empirically show that our design choices and the\ntwo-step approach lead to superior performance in establishing scaling laws.\n","authors":["Akshita Bhagia","Jiacheng Liu","Alexander Wettig","David Heineman","Oyvind Tafjord","Ananya Harsh Jha","Luca Soldaini","Noah A. Smith","Dirk Groeneveld","Pang Wei Koh","Jesse Dodge","Hannaneh Hajishirzi"],"pdf_url":"https://arxiv.org/pdf/2412.04403v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04384v1","updated":"2024-12-05T17:59:58Z","published":"2024-12-05T17:59:58Z","title":"Probabilistic Gaussian Superposition for Efficient 3D Occupancy\n Prediction","summary":" 3D semantic occupancy prediction is an important task for robust\nvision-centric autonomous driving, which predicts fine-grained geometry and\nsemantics of the surrounding scene. Most existing methods leverage dense\ngrid-based scene representations, overlooking the spatial sparsity of the\ndriving scenes. Although 3D semantic Gaussian serves as an object-centric\nsparse alternative, most of the Gaussians still describe the empty region with\nlow efficiency. To address this, we propose a probabilistic Gaussian\nsuperposition model which interprets each Gaussian as a probability\ndistribution of its neighborhood being occupied and conforms to probabilistic\nmultiplication to derive the overall geometry. Furthermore, we adopt the exact\nGaussian mixture model for semantics calculation to avoid unnecessary\noverlapping of Gaussians. To effectively initialize Gaussians in non-empty\nregion, we design a distribution-based initialization module which learns the\npixel-aligned occupancy distribution instead of the depth of surfaces. 
We\nconduct extensive experiments on nuScenes and KITTI-360 datasets and our\nGaussianFormer-2 achieves state-of-the-art performance with high efficiency.\nCode: https://github.com/huang-yh/GaussianFormer.\n","authors":["Yuanhui Huang","Amonnut Thammatadatrakoon","Wenzhao Zheng","Yunpeng Zhang","Dalong Du","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2412.04384v1.pdf","comment":"Code is available at: https://github.com/huang-yh/GaussianFormer"},{"id":"http://arxiv.org/abs/2412.04380v1","updated":"2024-12-05T17:57:09Z","published":"2024-12-05T17:57:09Z","title":"EmbodiedOcc: Embodied 3D Occupancy Prediction for Vision-based Online\n Scene Understanding","summary":" 3D occupancy prediction provides a comprehensive description of the\nsurrounding scenes and has become an essential task for 3D perception. Most\nexisting methods focus on offline perception from one or a few views and cannot\nbe applied to embodied agents which demands to gradually perceive the scene\nthrough progressive embodied exploration. In this paper, we formulate an\nembodied 3D occupancy prediction task to target this practical scenario and\npropose a Gaussian-based EmbodiedOcc framework to accomplish it. We initialize\nthe global scene with uniform 3D semantic Gaussians and progressively update\nlocal regions observed by the embodied agent. For each update, we extract\nsemantic and structural features from the observed image and efficiently\nincorporate them via deformable cross-attention to refine the regional\nGaussians. Finally, we employ Gaussian-to-voxel splatting to obtain the global\n3D occupancy from the updated 3D Gaussians. Our EmbodiedOcc assumes an unknown\n(i.e., uniformly distributed) environment and maintains an explicit global\nmemory of it with 3D Gaussians. It gradually gains knowledge through local\nrefinement of regional Gaussians, which is consistent with how humans\nunderstand new scenes through embodied exploration. 
We reorganize an\nEmbodiedOcc-ScanNet benchmark based on local annotations to facilitate the\nevaluation of the embodied 3D occupancy prediction task. Experiments\ndemonstrate that our EmbodiedOcc outperforms existing local prediction methods\nand accomplishes the embodied occupancy prediction with high accuracy and\nstrong expandability. Our code is available at:\nhttps://github.com/YkiWu/EmbodiedOcc.\n","authors":["Yuqi Wu","Wenzhao Zheng","Sicheng Zuo","Yuanhui Huang","Jie Zhou","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2412.04380v1.pdf","comment":"Code: https://github.com/YkiWu/EmbodiedOcc"},{"id":"http://arxiv.org/abs/2412.04378v1","updated":"2024-12-05T17:54:27Z","published":"2024-12-05T17:54:27Z","title":"Discriminative Fine-tuning of LVLMs","summary":" Contrastively-trained Vision-Language Models (VLMs) like CLIP have become the\nde facto approach for discriminative vision-language representation learning.\nHowever, these models have limited language understanding, often exhibiting a\n\"bag of words\" behavior. At the same time, Large Vision-Language Models\n(LVLMs), which combine vision encoders with LLMs, have been shown capable of\ndetailed vision-language reasoning, yet their autoregressive nature renders\nthem less suitable for discriminative tasks.\n In this work, we propose to combine \"the best of both worlds\": a new training\napproach for discriminative fine-tuning of LVLMs that results in strong\ndiscriminative and compositional capabilities. Essentially, our approach\nconverts a generative LVLM into a discriminative one, unlocking its capability\nfor powerful image-text discrimination combined with enhanced language\nunderstanding.\n Our contributions include: (1) A carefully designed training/optimization\nframework that utilizes image-text pairs of variable length and granularity for\ntraining the model with both contrastive and next-token prediction losses. 
This\nis accompanied by ablation studies that justify the necessity of our\nframework's components. (2) A parameter-efficient adaptation method using a\ncombination of soft prompting and LoRA adapters. (3) Significant improvements\nover state-of-the-art CLIP-like models of similar size, including standard\nimage-text retrieval benchmarks and notable gains in compositionality.\n","authors":["Yassine Ouali","Adrian Bulat","Alexandros Xenos","Anestis Zaganidis","Ioannis Maniadis Metaxas","Georgios Tzimiropoulos","Brais Martinez"],"pdf_url":"https://arxiv.org/pdf/2412.04378v1.pdf","comment":"Preprint. The first two authors contributed equally"},{"id":"http://arxiv.org/abs/2412.02819v2","updated":"2024-12-05T17:51:20Z","published":"2024-12-03T20:35:57Z","title":"CNNSum: Exploring Long-Conext Summarization with Large Language Models\n in Chinese Novels","summary":" Large Language Models (LLMs) have been well-researched in many long-context\ntasks. However, due to high annotation costs, high-quality long-context summary\ndatasets for training or evaluation are scarce, limiting further research. In\nthis work, we introduce CNNSum, a new multi-scale Chinese long-context novel\nsummarization benchmark, including four subsets, length covering\n16k\\textasciitilde128k, 695 samples in total, the annotations are human-driven.\nWe evaluate commercial and open-source models on CNNSum and conduct a detailed\nanalysis. Based on the observations, we further conduct fine-tuning exploration\nwith short-context summary data. In our study: (1) GPT-4o underperformed, due\nto excessive subjective commentary. (2) Currently, long-context summarization\nmainly relies on memory ability, small LLMs with stable longer context lengths\nare the most cost-effective. Using long data concatenated from short-context\nsummaries makes a significant improvement. (3) Prompt templates may cause a\nlarge performance gap but can be mitigated through fine-tuning. 
(4) Fine-tuned\nChat or Instruction versions may harm the Base model and further fine-tuning\ncannot bridge performance gap. (5) while models with RoPE base scaling exhibit\nstrong extrapolation potential, their performance may vary significantly when\ncombined with other interpolation methods and need careful selection. (6)\nCNNSum provides more reliable and insightful evaluation results than other\nbenchmarks. We release CNNSum to advance research in this field.\n","authors":["Lingxiao Wei","He Yan","Xiangju Lu","Junmin Zhu","Jun Wang","Wei Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.02819v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.12259v2","updated":"2024-12-05T17:47:30Z","published":"2024-06-18T04:24:30Z","title":"Adversarial Attacks on Large Language Models in Medicine","summary":" The integration of Large Language Models (LLMs) into healthcare applications\noffers promising advancements in medical diagnostics, treatment\nrecommendations, and patient care. However, the susceptibility of LLMs to\nadversarial attacks poses a significant threat, potentially leading to harmful\noutcomes in delicate medical contexts. This study investigates the\nvulnerability of LLMs to two types of adversarial attacks in three medical\ntasks. Utilizing real-world patient data, we demonstrate that both open-source\nand proprietary LLMs are susceptible to manipulation across multiple tasks.\nThis research further reveals that domain-specific tasks demand more\nadversarial data in model fine-tuning than general domain tasks for effective\nattack execution, especially for more capable models. We discover that while\nintegrating adversarial data does not markedly degrade overall model\nperformance on medical benchmarks, it does lead to noticeable shifts in\nfine-tuned model weights, suggesting a potential pathway for detecting and\ncountering model attacks. 
This research highlights the urgent need for robust\nsecurity measures and the development of defensive mechanisms to safeguard LLMs\nin medical applications, to ensure their safe and effective deployment in\nhealthcare settings.\n","authors":["Yifan Yang","Qiao Jin","Furong Huang","Zhiyong Lu"],"pdf_url":"https://arxiv.org/pdf/2406.12259v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04367v1","updated":"2024-12-05T17:35:29Z","published":"2024-12-05T17:35:29Z","title":"Machine Theory of Mind for Autonomous Cyber-Defence","summary":" Intelligent autonomous agents hold much potential for the domain of\ncyber-security. However, due to many state-of-the-art approaches relying on\nuninterpretable black-box models, there is growing demand for methods that\noffer stakeholders clear and actionable insights into their latent beliefs and\nmotivations. To address this, we evaluate Theory of Mind (ToM) approaches for\nAutonomous Cyber Operations. Upon learning a robust prior, ToM models can\npredict an agent's goals, behaviours, and contextual beliefs given only a\nhandful of past behaviour observations. In this paper, we introduce a novel\nGraph Neural Network (GNN)-based ToM architecture tailored for cyber-defence,\nGraph-In, Graph-Out (GIGO)-ToM, which can accurately predict both the targets\nand attack trajectories of adversarial cyber agents over arbitrary computer\nnetwork topologies. To evaluate the latter, we propose a novel extension of the\nWasserstein distance for measuring the similarity of graph-based probability\ndistributions. Whereas the standard Wasserstein distance lacks a fixed\nreference scale, we introduce a graph-theoretic normalization factor that\nenables a standardized comparison between networks of different sizes. 
We\nfurnish this metric, which we term the Network Transport Distance (NTD), with a\nweighting function that emphasizes predictions according to custom node\nfeatures, allowing network operators to explore arbitrary strategic\nconsiderations. Benchmarked against a Graph-In, Dense-Out (GIDO)-ToM\narchitecture in an abstract cyber-defence environment, our empirical\nevaluations show that GIGO-ToM can accurately predict the goals and behaviours\nof various unseen cyber-attacking agents across a range of network topologies,\nas well as learn embeddings that can effectively characterize their policies.\n","authors":["Luke Swaby","Matthew Stewart","Daniel Harrold","Chris Willis","Gregory Palmer"],"pdf_url":"https://arxiv.org/pdf/2412.04367v1.pdf","comment":"29 pages, 17 figures, 12 tables"},{"id":"http://arxiv.org/abs/2412.04366v1","updated":"2024-12-05T17:33:12Z","published":"2024-12-05T17:33:12Z","title":"Artificial intelligence and the internal processes of creativity","summary":" Artificial intelligence (AI) systems capable of generating creative outputs\nare reshaping our understanding of creativity. This shift presents an\nopportunity for creativity researchers to reevaluate the key components of the\ncreative process. In particular, the advanced capabilities of AI underscore the\nimportance of studying the internal processes of creativity. This paper\nexplores the neurobiological machinery that underlies these internal processes\nand describes the experiential component of creativity. It is concluded that\nalthough the products of artificial and human creativity can be similar, the\ninternal processes are different. 
The paper also discusses how AI may\nnegatively affect the internal processes of human creativity, such as the\ndevelopment of skills, the integration of knowledge, and the diversity of\nideas.\n","authors":["Jaan Aru"],"pdf_url":"https://arxiv.org/pdf/2412.04366v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.01951v2","updated":"2024-12-05T17:31:43Z","published":"2024-01-03T19:27:20Z","title":"GeoPos: A Minimal Positional Encoding for Enhanced Fine-Grained Details\n in Image Synthesis Using Convolutional Neural Networks","summary":" The enduring inability of image generative models to recreate intricate\ngeometric features, such as those present in human hands and fingers has been\nan ongoing problem in image generation for nearly a decade. While strides have\nbeen made by increasing model sizes and diversifying training datasets, this\nissue remains prevalent across all models, from denoising diffusion models to\nGenerative Adversarial Networks (GAN), pointing to a fundamental shortcoming in\nthe underlying architectures. In this paper, we demonstrate how this problem\ncan be mitigated by augmenting convolution layers geometric capabilities\nthrough providing them with a single input channel incorporating the relative\nn-dimensional Cartesian coordinate system. We show this drastically improves\nquality of images generated by Diffusion Models, GANs, and Variational\nAutoEncoders (VAE).\n","authors":["Mehran Hosseini","Peyman Hosseini"],"pdf_url":"https://arxiv.org/pdf/2401.01951v2.pdf","comment":"Accepted at WACV 2025. Contains 19 pages, 15 figures, and 9 tables"},{"id":"http://arxiv.org/abs/2409.13000v2","updated":"2024-12-05T17:19:12Z","published":"2024-09-19T15:38:21Z","title":"Introducing the Large Medical Model: State of the art healthcare cost\n and risk prediction with transformers trained on patient event sequences","summary":" With U.S. 
healthcare spending approaching $5T (NHE Fact Sheet 2024), and 25%\nof it estimated to be wasteful (Waste in the US the health care system:\nestimated costs and potential for savings, n.d.), the need to better predict\nrisk and optimal patient care is evermore important. This paper introduces the\nLarge Medical Model (LMM), a generative pre-trained transformer (GPT) designed\nto guide and predict the broad facets of patient care and healthcare\nadministration. The model is trained on medical event sequences from over 140M\nlongitudinal patient claims records with a specialized vocabulary built from\nmedical terminology systems and demonstrates a superior capability to forecast\nhealthcare costs and identify potential risk factors. Through experimentation\nand validation, we showcase the LMM's proficiency in not only in cost and risk\npredictions, but also in discerning intricate patterns within complex medical\nconditions and an ability to identify novel relationships in patient care. The\nLMM is able to improve both cost prediction by 14.1% over the best commercial\nmodels and chronic conditions prediction by 1.9% over the best transformer\nmodels in research predicting a broad set of conditions. 
The LMM is a\nsubstantial advancement in healthcare analytics, offering the potential to\nsignificantly enhance risk assessment, cost management, and personalized\nmedicine.\n","authors":["Ricky Sahu","Eric Marriott","Ethan Siegel","David Wagner","Flore Uzan","Troy Yang","Asim Javed"],"pdf_url":"https://arxiv.org/pdf/2409.13000v2.pdf","comment":"10 pages, 10 figures"},{"id":"http://arxiv.org/abs/2412.04351v1","updated":"2024-12-05T17:10:19Z","published":"2024-12-05T17:10:19Z","title":"BhashaVerse : Translation Ecosystem for Indian Subcontinent Languages","summary":" This paper focuses on developing translation models and related applications\nfor 36 Indian languages, including Assamese, Awadhi, Bengali, Bhojpuri, Braj,\nBodo, Dogri, English, Konkani, Gondi, Gujarati, Hindi, Hinglish, Ho, Kannada,\nKangri, Kashmiri (Arabic and Devanagari), Khasi, Mizo, Magahi, Maithili,\nMalayalam, Marathi, Manipuri (Bengali and Meitei), Nepali, Oriya, Punjabi,\nSanskrit, Santali, Sinhala, Sindhi (Arabic and Devanagari), Tamil, Tulu,\nTelugu, and Urdu. Achieving this requires parallel and other types of corpora\nfor all 36 * 36 language pairs, addressing challenges like script variations,\nphonetic differences, and syntactic diversity. For instance, languages like\nKashmiri and Sindhi, which use multiple scripts, demand script normalization\nfor alignment, while low-resource languages such as Khasi and Santali require\nsynthetic data augmentation to ensure sufficient coverage and quality.\n To address these challenges, this work proposes strategies for corpus\ncreation by leveraging existing resources, developing parallel datasets,\ngenerating domain-specific corpora, and utilizing synthetic data techniques.\nAdditionally, it evaluates machine translation across various dimensions,\nincluding standard and discourse-level translation, domain-specific\ntranslation, reference-based and reference-free evaluation, error analysis, and\nautomatic post-editing. 
By integrating these elements, the study establishes a\ncomprehensive framework to improve machine translation quality and enable\nbetter cross-lingual communication in India's linguistically diverse ecosystem.\n","authors":["Vandan Mujadia","Dipti Misra Sharma"],"pdf_url":"https://arxiv.org/pdf/2412.04351v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04343v1","updated":"2024-12-05T17:01:09Z","published":"2024-12-05T17:01:09Z","title":"RMD: A Simple Baseline for More General Human Motion Generation via\n Training-free Retrieval-Augmented Motion Diffuse","summary":" While motion generation has made substantial progress, its practical\napplication remains constrained by dataset diversity and scale, limiting its\nability to handle out-of-distribution scenarios. To address this, we propose a\nsimple and effective baseline, RMD, which enhances the generalization of motion\ngeneration through retrieval-augmented techniques. Unlike previous\nretrieval-based methods, RMD requires no additional training and offers three\nkey advantages: (1) the external retrieval database can be flexibly replaced;\n(2) body parts from the motion database can be reused, with an LLM facilitating\nsplitting and recombination; and (3) a pre-trained motion diffusion model\nserves as a prior to improve the quality of motions obtained through retrieval\nand direct combination. Without any training, RMD achieves state-of-the-art\nperformance, with notable advantages on out-of-distribution data.\n","authors":["Zhouyingcheng Liao","Mingyuan Zhang","Wenjia Wang","Lei Yang","Taku Komura"],"pdf_url":"https://arxiv.org/pdf/2412.04343v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04342v1","updated":"2024-12-05T17:00:32Z","published":"2024-12-05T17:00:32Z","title":"Retrieval-Augmented Machine Translation with Unstructured Knowledge","summary":" Retrieval-augmented generation (RAG) introduces additional information to\nenhance large language models (LLMs). 
In machine translation (MT), previous\nwork typically retrieves in-context examples from paired MT corpora, or\ndomain-specific knowledge from knowledge graphs, to enhance models' MT ability.\nHowever, a large amount of world knowledge is organized in unstructured\ndocuments, and might not be fully paired across different languages. In this\npaper, we study retrieval-augmented MT using unstructured documents.\nSpecifically, we build RAGtrans, the first benchmark to train and evaluate\nLLMs' retrieval-augmented MT ability. RAGtrans contains 79K MT samples\ncollected via GPT-4o and human translators. Besides, documents from different\nlanguages are also provided to supply the knowledge to these samples. Based on\nRAGtrans, we further propose a multi-task training method to teach LLMs how to\nuse information from multilingual documents during their translation. The\nmethod uses existing multilingual corpora to create auxiliary training\nobjectives without additional labeling requirements. Extensive experiments show\nthat the method improves LLMs by 1.58-3.09 BLEU and 1.00-2.03 COMET scores.\n","authors":["Jiaan Wang","Fandong Meng","Yingxue Zhang","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2412.04342v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04327v1","updated":"2024-12-05T16:42:45Z","published":"2024-12-05T16:42:45Z","title":"Action Mapping for Reinforcement Learning in Continuous Environments\n with Constraints","summary":" Deep reinforcement learning (DRL) has had success across various domains, but\napplying it to environments with constraints remains challenging due to poor\nsample efficiency and slow convergence. Recent literature explored\nincorporating model knowledge to mitigate these problems, particularly through\nthe use of models that assess the feasibility of proposed actions. However,\nintegrating feasibility models efficiently into DRL pipelines in environments\nwith continuous action spaces is non-trivial. 
We propose a novel DRL training\nstrategy utilizing action mapping that leverages feasibility models to\nstreamline the learning process. By decoupling the learning of feasible actions\nfrom policy optimization, action mapping allows DRL agents to focus on\nselecting the optimal action from a reduced feasible action set. We demonstrate\nthrough experiments that action mapping significantly improves training\nperformance in constrained environments with continuous action spaces,\nespecially with imperfect feasibility models.\n","authors":["Mirco Theile","Lukas Dirnberger","Raphael Trumpp","Marco Caccamo","Alberto L. Sangiovanni-Vincentelli"],"pdf_url":"https://arxiv.org/pdf/2412.04327v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.16453v2","updated":"2024-12-05T16:39:32Z","published":"2024-06-24T08:45:03Z","title":"Learning in Wilson-Cowan model for metapopulation","summary":" The Wilson-Cowan model for metapopulation, a Neural Mass Network Model,\ntreats different subcortical regions of the brain as connected nodes, with\nconnections representing various types of structural, functional, or effective\nneuronal connectivity between these regions. Each region comprises interacting\npopulations of excitatory and inhibitory cells, consistent with the standard\nWilson-Cowan model. By incorporating stable attractors into such a\nmetapopulation model's dynamics, we transform it into a learning algorithm\ncapable of achieving high image and text classification accuracy. We test it on\nMNIST and Fashion MNIST, in combination with convolutional neural networks, on\nCIFAR-10 and TF-FLOWERS, and, in combination with a transformer architecture\n(BERT), on IMDB, always showing high classification accuracy. 
These numerical\nevaluations illustrate that minimal modifications to the Wilson-Cowan model for\nmetapopulation can reveal unique and previously unobserved dynamics.\n","authors":["Raffaele Marino","Lorenzo Buffoni","Lorenzo Chicchi","Francesca Di Patti","Diego Febbe","Lorenzo Giambagli","Duccio Fanelli"],"pdf_url":"https://arxiv.org/pdf/2406.16453v2.pdf","comment":"Paper Accepted in Neural Computation (in press)"},{"id":"http://arxiv.org/abs/2412.04323v1","updated":"2024-12-05T16:39:01Z","published":"2024-12-05T16:39:01Z","title":"GRAM: Generalization in Deep RL with a Robust Adaptation Module","summary":" The reliable deployment of deep reinforcement learning in real-world settings\nrequires the ability to generalize across a variety of conditions, including\nboth in-distribution scenarios seen during training as well as novel\nout-of-distribution scenarios. In this work, we present a framework for\ndynamics generalization in deep reinforcement learning that unifies these two\ndistinct types of generalization within a single architecture. We introduce a\nrobust adaptation module that provides a mechanism for identifying and reacting\nto both in-distribution and out-of-distribution environment dynamics, along\nwith a joint training pipeline that combines the goals of in-distribution\nadaptation and out-of-distribution robustness. Our algorithm GRAM achieves\nstrong generalization performance across in-distribution and\nout-of-distribution scenarios upon deployment, which we demonstrate on a\nvariety of realistic simulated locomotion tasks with a quadruped robot.\n","authors":["James Queeney","Xiaoyi Cai","Mouhacine Benosman","Jonathan P. 
How"],"pdf_url":"https://arxiv.org/pdf/2412.04323v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12068v3","updated":"2024-12-05T16:34:21Z","published":"2023-11-19T17:28:28Z","title":"Enhancing Novel Object Detection via Cooperative Foundational Models","summary":" In this work, we address the challenging and emergent problem of novel object\ndetection (NOD), focusing on the accurate detection of both known and novel\nobject categories during inference. Traditional object detection algorithms are\ninherently closed-set, limiting their capability to handle NOD. We present a\nnovel approach to transform existing closed-set detectors into open-set\ndetectors. This transformation is achieved by leveraging the complementary\nstrengths of pre-trained foundational models, specifically CLIP and SAM,\nthrough our cooperative mechanism. Furthermore, by integrating this mechanism\nwith state-of-the-art open-set detectors such as GDINO, we establish new\nbenchmarks in object detection performance. Our method achieves 17.42 mAP in\nnovel object detection and 42.08 mAP for known objects on the challenging LVIS\ndataset. Adapting our approach to the COCO OVD split, we surpass the current\nstate-of-the-art by a margin of 7.2 $ \\text{AP}_{50} $ for novel classes. Our\ncode is available at https://rohit901.github.io/coop-foundation-models/ .\n","authors":["Rohit Bharadwaj","Muzammal Naseer","Salman Khan","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2311.12068v3.pdf","comment":"Accepted at WACV 2025"},{"id":"http://arxiv.org/abs/2412.04318v1","updated":"2024-12-05T16:34:20Z","published":"2024-12-05T16:34:20Z","title":"The Hyperfitting Phenomenon: Sharpening and Stabilizing LLMs for\n Open-Ended Text Generation","summary":" This paper introduces the counter-intuitive generalization results of\noverfitting pre-trained large language models (LLMs) on very small datasets. 
In\nthe setting of open-ended text generation, it is well-documented that LLMs tend\nto generate repetitive and dull sequences, a phenomenon that is especially\napparent when generating using greedy decoding. This issue persists even with\nstate-of-the-art LLMs containing billions of parameters, trained via next-token\nprediction on large datasets. We find that by further fine-tuning these models\nto achieve a near-zero training loss on a small set of samples -- a process we\nrefer to as hyperfitting -- the long-sequence generative capabilities are\ngreatly enhanced. Greedy decoding with these Hyperfitted models even outperform\nTop-P sampling over long-sequences, both in terms of diversity and human\npreferences. This phenomenon extends to LLMs of various sizes, different\ndomains, and even autoregressive image generation. We further find this\nphenomena to be distinctly different from that of Grokking and double descent.\nSurprisingly, our experiments indicate that hyperfitted models rarely fall into\nrepeating sequences they were trained on, and even explicitly blocking these\nsequences results in high-quality output. 
All hyperfitted models produce\nextremely low-entropy predictions, often allocating nearly all probability to a\nsingle token.\n","authors":["Fredrik Carlsson","Fangyu Liu","Daniel Ward","Murathan Kurfali","Joakim Nivre"],"pdf_url":"https://arxiv.org/pdf/2412.04318v1.pdf","comment":"Under review at ICLR"},{"id":"http://arxiv.org/abs/2412.04315v1","updated":"2024-12-05T16:31:13Z","published":"2024-12-05T16:31:13Z","title":"Densing Law of LLMs","summary":" Large Language Models (LLMs) have emerged as a milestone in artificial\nintelligence, and their performance can improve as the model size increases.\nHowever, this scaling brings great challenges to training and inference\nefficiency, particularly for deploying LLMs in resource-constrained\nenvironments, and the scaling trend is becoming increasingly unsustainable.\nThis paper introduces the concept of ``\\textit{capacity density}'' as a new\nmetric to evaluate the quality of the LLMs across different scales and\ndescribes the trend of LLMs in terms of both effectiveness and efficiency. To\ncalculate the capacity density of a given target LLM, we first introduce a set\nof reference models and develop a scaling law to predict the downstream\nperformance of these reference models based on their parameter sizes. We then\ndefine the \\textit{effective parameter size} of the target LLM as the parameter\nsize required by a reference model to achieve equivalent performance, and\nformalize the capacity density as the ratio of the effective parameter size to\nthe actual parameter size of the target LLM. Capacity density provides a\nunified framework for assessing both model effectiveness and efficiency. Our\nfurther analysis of recent open-source base LLMs reveals an empirical law (the\ndensing law)that the capacity density of LLMs grows exponentially over time.\nMore specifically, using some widely used benchmarks for evaluation, the\ncapacity density of LLMs doubles approximately every three months. 
The law\nprovides new perspectives to guide future LLM development, emphasizing the\nimportance of improving capacity density to achieve optimal results with\nminimal computational overhead.\n","authors":["Chaojun Xiao","Jie Cai","Weilin Zhao","Guoyang Zeng","Xu Han","Zhiyuan Liu","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2412.04315v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17978v2","updated":"2024-12-05T16:24:15Z","published":"2024-09-26T15:52:36Z","title":"HydraViT: Stacking Heads for a Scalable ViT","summary":" The architecture of Vision Transformers (ViTs), particularly the Multi-head\nAttention (MHA) mechanism, imposes substantial hardware demands. Deploying ViTs\non devices with varying constraints, such as mobile phones, requires multiple\nmodels of different sizes. However, this approach has limitations, such as\ntraining and storing each required model separately. This paper introduces\nHydraViT, a novel approach that addresses these limitations by stacking\nattention heads to achieve a scalable ViT. By repeatedly changing the size of\nthe embedded dimensions throughout each layer and their corresponding number of\nattention heads in MHA during training, HydraViT induces multiple subnetworks.\nThereby, HydraViT achieves adaptability across a wide spectrum of hardware\nenvironments while maintaining performance. Our experimental results\ndemonstrate the efficacy of HydraViT in achieving a scalable ViT with up to 10\nsubnetworks, covering a wide range of resource constraints. HydraViT achieves\nup to 5 p.p. more accuracy with the same GMACs and up to 7 p.p. more accuracy\nwith the same throughput on ImageNet-1K compared to the baselines, making it an\neffective solution for scenarios where hardware availability is diverse or\nvaries over time. 
Source code available at https://github.com/ds-kiel/HydraViT.\n","authors":["Janek Haberer","Ali Hojjat","Olaf Landsiedel"],"pdf_url":"https://arxiv.org/pdf/2409.17978v2.pdf","comment":"Accepted at NeurIPS'24, please cite the conference version"},{"id":"http://arxiv.org/abs/2412.04300v1","updated":"2024-12-05T16:21:01Z","published":"2024-12-05T16:21:01Z","title":"T2I-FactualBench: Benchmarking the Factuality of Text-to-Image Models\n with Knowledge-Intensive Concepts","summary":" Evaluating the quality of synthesized images remains a significant challenge\nin the development of text-to-image (T2I) generation. Most existing studies in\nthis area primarily focus on evaluating text-image alignment, image quality,\nand object composition capabilities, with comparatively fewer studies\naddressing the evaluation of the factuality of T2I models, particularly when\nthe concepts involved are knowledge-intensive. To mitigate this gap, we present\nT2I-FactualBench in this work - the largest benchmark to date in terms of the\nnumber of concepts and prompts specifically designed to evaluate the factuality\nof knowledge-intensive concept generation. T2I-FactualBench consists of a\nthree-tiered knowledge-intensive text-to-image generation framework, ranging\nfrom the basic memorization of individual knowledge concepts to the more\ncomplex composition of multiple knowledge concepts. We further introduce a\nmulti-round visual question answering (VQA) based evaluation framework to\nassess the factuality of three-tiered knowledge-intensive text-to-image\ngeneration tasks. 
Experiments on T2I-FactualBench indicate that current\nstate-of-the-art (SOTA) T2I models still leave significant room for\nimprovement.\n","authors":["Ziwei Huang","Wanggui He","Quanyu Long","Yandi Wang","Haoyuan Li","Zhelun Yu","Fangxun Shu","Long Chen","Hao Jiang","Leilei Gan"],"pdf_url":"https://arxiv.org/pdf/2412.04300v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04292v1","updated":"2024-12-05T16:12:25Z","published":"2024-12-05T16:12:25Z","title":"SIDA: Social Media Image Deepfake Detection, Localization and\n Explanation with Large Multimodal Model","summary":" The rapid advancement of generative models in creating highly realistic\nimages poses substantial risks for misinformation dissemination. For instance,\na synthetic image, when shared on social media, can mislead extensive audiences\nand erode trust in digital content, resulting in severe repercussions. Despite\nsome progress, academia has not yet created a large and diversified deepfake\ndetection dataset for social media, nor has it devised an effective solution to\naddress this issue. In this paper, we introduce the Social media Image\nDetection dataSet (SID-Set), which offers three key advantages: (1) extensive\nvolume, featuring 300K AI-generated/tampered and authentic images with\ncomprehensive annotations, (2) broad diversity, encompassing fully synthetic\nand tampered images across various classes, and (3) elevated realism, with\nimages that are predominantly indistinguishable from genuine ones through mere\nvisual inspection. Furthermore, leveraging the exceptional capabilities of\nlarge multimodal models, we propose a new image deepfake detection,\nlocalization, and explanation framework, named SIDA (Social media Image\nDetection, localization, and explanation Assistant). 
SIDA not only discerns the\nauthenticity of images, but also delineates tampered regions through mask\nprediction and provides textual explanations of the model's judgment criteria.\nCompared with state-of-the-art deepfake detection models on SID-Set and other\nbenchmarks, extensive experiments demonstrate that SIDA achieves superior\nperformance among diversified settings. The code, model, and dataset will be\nreleased.\n","authors":["Zhenglin Huang","Jinwei Hu","Xiangtai Li","Yiwei He","Xingyu Zhao","Bei Peng","Baoyuan Wu","Xiaowei Huang","Guangliang Cheng"],"pdf_url":"https://arxiv.org/pdf/2412.04292v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.18908v3","updated":"2024-12-05T16:05:48Z","published":"2024-02-29T07:08:18Z","title":"Facility Location Games with Scaling Effects","summary":" We take the classic facility location problem and consider a variation, in\nwhich each agent's individual cost function is equal to their distance from the\nfacility multiplied by a scaling factor which is determined by the facility\nplacement. In addition to the general class of continuous scaling functions, we\nalso provide results for piecewise linear scaling functions which can\neffectively approximate or model the scaling of many real world scenarios. We\nfocus on the objectives of total and maximum cost, describing the computation\nof the optimal solution. We then move to the approximate mechanism design\nsetting, observing that the agents' preferences may no longer be single-peaked.\nConsequently, we characterize the conditions on scaling functions which ensure\nthat agents have single-peaked preferences. 
Under these conditions, we find a\ncharacterization of continuous, strategyproof, and anonymous mechanisms, and\ncompute the total and maximum cost approximation ratios achievable by these\nmechanisms.\n","authors":["Yu He","Alexander Lam","Minming Li"],"pdf_url":"https://arxiv.org/pdf/2402.18908v3.pdf","comment":"This is an updated version of the paper which appeared at the 23rd\n International Conference on Autonomous Agents and Multi-Agent Systems\n (AAMAS-24)"},{"id":"http://arxiv.org/abs/2412.04272v1","updated":"2024-12-05T15:54:16Z","published":"2024-12-05T15:54:16Z","title":"PoTable: Programming Standardly on Table-based Reasoning Like a Human\n Analyst","summary":" Table-based reasoning has garnered substantial research interest,\nparticularly in its integration with Large Language Model (LLM) which has\nrevolutionized the general reasoning paradigm. Numerous LLM-based studies\nintroduce symbolic tools (e.g., databases, Python) as assistants to extend\nhuman-like abilities in structured table understanding and complex arithmetic\ncomputations. However, these studies can be improved better in simulating human\ncognitive behavior when using symbolic tools, as they still suffer from\nlimitations of non-standard logical splits and constrained operation pools. In\nthis study, we propose PoTable as a novel table-based reasoning method that\nsimulates a human tabular analyst, which integrates a Python interpreter as the\nreal-time executor accompanied by an LLM-based operation planner and code\ngenerator. Specifically, PoTable follows a human-like logical stage split and\nextends the operation pool into an open-world space without any constraints.\nThrough planning and executing in each distinct stage, PoTable standardly\ncompletes the entire reasoning process and produces superior reasoning results\nalong with highly accurate, steply commented and completely executable\nprograms. 
Accordingly, the effectiveness and explainability of PoTable are\nfully demonstrated. Extensive experiments over three evaluation datasets from\ntwo public benchmarks on two backbones show the outstanding performance of our\napproach. In particular, GPT-based PoTable achieves over 4% higher absolute\naccuracy than runner-ups on all evaluation datasets.\n","authors":["Qingyang Mao","Qi Liu","Zhi Li","Mingyue Cheng","Zheng Zhang","Rui Li"],"pdf_url":"https://arxiv.org/pdf/2412.04272v1.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2405.20331v2","updated":"2024-12-05T15:48:24Z","published":"2024-05-30T17:59:04Z","title":"CoSy: Evaluating Textual Explanations of Neurons","summary":" A crucial aspect of understanding the complex nature of Deep Neural Networks\n(DNNs) is the ability to explain learned concepts within their latent\nrepresentations. While methods exist to connect neurons to human-understandable\ntextual descriptions, evaluating the quality of these explanations is\nchallenging due to the lack of a unified quantitative approach. We introduce\nCoSy (Concept Synthesis), a novel, architecture-agnostic framework for\nevaluating textual explanations of latent neurons. Given textual explanations,\nour proposed framework uses a generative model conditioned on textual input to\ncreate data points representing the explanations. By comparing the neuron's\nresponse to these generated data points and control data points, we can\nestimate the quality of the explanation. We validate our framework through\nsanity checks and benchmark various neuron description methods for Computer\nVision tasks, revealing significant differences in quality.\n","authors":["Laura Kopf","Philine Lou Bommer","Anna Hedström","Sebastian Lapuschkin","Marina M. -C. 
Höhne","Kirill Bykov"],"pdf_url":"https://arxiv.org/pdf/2405.20331v2.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2412.04260v1","updated":"2024-12-05T15:39:54Z","published":"2024-12-05T15:39:54Z","title":"Enhancing Whole Slide Image Classification through Supervised\n Contrastive Domain Adaptation","summary":" Domain shift in the field of histopathological imaging is a common phenomenon\ndue to the intra- and inter-hospital variability of staining and digitization\nprotocols. The implementation of robust models, capable of creating generalized\ndomains, represents a need to be solved. In this work, a new domain adaptation\nmethod to deal with the variability between histopathological images from\nmultiple centers is presented. In particular, our method adds a training\nconstraint to the supervised contrastive learning approach to achieve domain\nadaptation and improve inter-class separability. Experiments performed on\ndomain adaptation and classification of whole-slide images of six skin cancer\nsubtypes from two centers demonstrate the method's usefulness. The results\nreflect superior performance compared to not using domain adaptation after\nfeature extraction or staining normalization.\n","authors":["Ilán Carretero","Pablo Meseguer","Rocío del Amor","Valery Naranjo"],"pdf_url":"https://arxiv.org/pdf/2412.04260v1.pdf","comment":"Accepted in CASEIB 2024"},{"id":"http://arxiv.org/abs/2412.04256v1","updated":"2024-12-05T15:37:29Z","published":"2024-12-05T15:37:29Z","title":"Transient Multi-Agent Path Finding for Lifelong Navigation in Dense\n Environments","summary":" Multi-Agent Path Finding (MAPF) deals with finding conflict-free paths for a\nset of agents from an initial configuration to a given target configuration.\nThe Lifelong MAPF (LMAPF) problem is a well-studied online version of MAPF in\nwhich an agent receives a new target when it reaches its current target. 
The\ncommon approach for solving LMAPF is to treat it as a sequence of MAPF\nproblems, periodically replanning from the agents' current configurations to\ntheir current targets. A significant drawback in this approach is that in MAPF\nthe agents must reach a configuration in which all agents are at their targets\nsimultaneously, which is needlessly restrictive for LMAPF. Techniques have been\nproposed to indirectly mitigate this drawback. We describe cases where these\nmitigation techniques fail. As an alternative, we propose to solve LMAPF\nproblems by solving a sequence of modified MAPF problems, in which the\nobjective is for each agent to eventually visit its target, but not necessarily\nfor all agents to do so simultaneously. We refer to this MAPF variant as\nTransient MAPF (TMAPF) and propose several algorithms for solving it based on\nexisting MAPF algorithms. A limited experimental evaluation identifies some\ncases where using a TMAPF algorithm instead of a MAPF algorithm with an LMAPF\nframework can improve the system throughput significantly.\n","authors":["Jonathan Morag","Noy Gabay","Daniel koyfman","Roni Stern"],"pdf_url":"https://arxiv.org/pdf/2412.04256v1.pdf","comment":"Submitted to The 35th International Conference on Automated Planning\n and Scheduling (ICAPS 2025)"},{"id":"http://arxiv.org/abs/2412.04254v1","updated":"2024-12-05T15:34:02Z","published":"2024-12-05T15:34:02Z","title":"CLINICSUM: Utilizing Language Models for Generating Clinical Summaries\n from Patient-Doctor Conversations","summary":" This paper presents ClinicSum, a novel framework designed to automatically\ngenerate clinical summaries from patient-doctor conversations. 
It utilizes a\ntwo-module architecture: a retrieval-based filtering module that extracts\nSubjective, Objective, Assessment, and Plan (SOAP) information from\nconversation transcripts, and an inference module powered by fine-tuned\nPre-trained Language Models (PLMs), which leverage the extracted SOAP data to\ngenerate abstracted clinical summaries. To fine-tune the PLM, we created a\ntraining dataset of consisting 1,473 conversations-summaries pair by\nconsolidating two publicly available datasets, FigShare and MTS-Dialog, with\nground truth summaries validated by Subject Matter Experts (SMEs). ClinicSum's\neffectiveness is evaluated through both automatic metrics (e.g., ROUGE,\nBERTScore) and expert human assessments. Results show that ClinicSum\noutperforms state-of-the-art PLMs, demonstrating superior precision, recall,\nand F-1 scores in automatic evaluations and receiving high preference from SMEs\nin human assessment, making it a robust solution for automated clinical\nsummarization.\n","authors":["Subash Neupane","Himanshu Tripathi","Shaswata Mitra","Sean Bozorgzad","Sudip Mittal","Shahram Rahimi","Amin Amirlatifi"],"pdf_url":"https://arxiv.org/pdf/2412.04254v1.pdf","comment":"accepted at the the 2024 IEEE International Conference on Big Data\n workshop Workshop on Big Data and AI for Healthcare"},{"id":"http://arxiv.org/abs/2410.14086v2","updated":"2024-12-05T15:24:33Z","published":"2024-10-17T23:37:34Z","title":"In-context learning and Occam's razor","summary":" A central goal of machine learning is generalization. 
While the No Free Lunch\nTheorem states that we cannot obtain theoretical guarantees for generalization\nwithout further assumptions, in practice we observe that simple models which\nexplain the training data generalize best: a principle called Occam's razor.\nDespite the need for simple models, most current approaches in machine learning\nonly minimize the training error, and at best indirectly promote simplicity\nthrough regularization or architecture design. Here, we draw a connection\nbetween Occam's razor and in-context learning: an emergent ability of certain\nsequence models like Transformers to learn at inference time from past\nobservations in a sequence. In particular, we show that the next-token\nprediction loss used to train in-context learners is directly equivalent to a\ndata compression technique called prequential coding, and that minimizing this\nloss amounts to jointly minimizing both the training error and the complexity\nof the model that was implicitly learned from context. Our theory and the\nempirical experiments we use to support it not only provide a normative account\nof in-context learning, but also elucidate the shortcomings of current\nin-context learning methods, suggesting ways in which they can be improved. We\nmake our code available at https://github.com/3rdCore/PrequentialCode.\n","authors":["Eric Elmoznino","Tom Marty","Tejas Kasetty","Leo Gagnon","Sarthak Mittal","Mahan Fathi","Dhanya Sridhar","Guillaume Lajoie"],"pdf_url":"https://arxiv.org/pdf/2410.14086v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.08339v3","updated":"2024-12-05T15:23:20Z","published":"2022-10-15T17:15:53Z","title":"Reachable Polyhedral Marching (RPM): An Exact Analysis Tool for\n Deep-Learned Control Systems","summary":" Neural networks are increasingly used in robotics as policies, state\ntransition models, state estimation models, or all of the above. 
With these\ncomponents being learned from data, it is important to be able to analyze what\nbehaviors were learned and how this affects closed-loop performance. In this\npaper we take steps toward this goal by developing methods for computing\ncontrol invariant sets and regions of attraction (ROAs) of dynamical systems\nrepresented as neural networks. We focus our attention on feedforward neural\nnetworks with the rectified linear unit (ReLU) activation, which are known to\nimplement continuous piecewise-affine (PWA) functions. We describe the\nReachable Polyhedral Marching (RPM) algorithm for enumerating the affine pieces\nof a neural network through an incremental connected walk. We then use this\nalgorithm to compute exact forward and backward reachable sets, from which we\nprovide methods for computing control invariant sets and ROAs. Our approach is\nunique in that we find these sets incrementally, without Lyapunov-based tools.\nIn our examples we demonstrate the ability of our approach to find non-convex\ncontrol invariant sets and ROAs on tasks with learned van der Pol oscillator\nand pendulum models. Further, we provide an accelerated algorithm for computing\nROAs that leverages the incremental and connected enumeration of affine regions\nthat RPM provides. We show this acceleration to lead to a 15x speedup in our\nexamples. Finally, we apply our methods to find a set of states that are\nstabilized by an image-based controller for an aircraft runway control problem.\n","authors":["Joseph A. Vincent","Mac Schwager"],"pdf_url":"https://arxiv.org/pdf/2210.08339v3.pdf","comment":"Submitted to IEEE Transactions on Neural Networks and Learning\n Systems. arXiv admin note: text overlap with arXiv:2011.11609"},{"id":"http://arxiv.org/abs/2410.14817v2","updated":"2024-12-05T15:20:28Z","published":"2024-10-18T18:37:27Z","title":"A Complexity-Based Theory of Compositionality","summary":" Compositionality is believed to be fundamental to intelligence. 
In humans, it\nunderlies the structure of thought, language, and higher-level reasoning. In\nAI, compositional representations can enable a powerful form of\nout-of-distribution generalization, in which a model systematically adapts to\nnovel combinations of known concepts. However, while we have strong intuitions\nabout what compositionality is, there currently exists no formal definition for\nit that is measurable and mathematical. Here, we propose such a definition,\nwhich we call representational compositionality, that accounts for and extends\nour intuitions about compositionality. The definition is conceptually simple,\nquantitative, grounded in algorithmic information theory, and applicable to any\nrepresentation. Intuitively, representational compositionality states that a\ncompositional representation satisfies three properties. First, it must be\nexpressive. Second, it must be possible to re-describe the representation as a\nfunction of discrete symbolic sequences with re-combinable parts, analogous to\nsentences in natural language. Third, the function that relates these symbolic\nsequences to the representation, analogous to semantics in natural language,\nmust be simple. Through experiments on both synthetic and real world data, we\nvalidate our definition of compositionality and show how it unifies disparate\nintuitions from across the literature in both AI and cognitive science. We also\nshow that representational compositionality, while theoretically intractable,\ncan be readily estimated using standard deep learning tools. 
Our definition has\nthe potential to inspire the design of novel, theoretically-driven models that\nbetter capture the mechanisms of compositional thought.\n","authors":["Eric Elmoznino","Thomas Jiralerspong","Yoshua Bengio","Guillaume Lajoie"],"pdf_url":"https://arxiv.org/pdf/2410.14817v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.06601v2","updated":"2024-12-05T15:19:47Z","published":"2024-03-11T10:48:56Z","title":"Cross-domain and Cross-dimension Learning for Image-to-Graph\n Transformers","summary":" Direct image-to-graph transformation is a challenging task that involves\nsolving object detection and relationship prediction in a single model. Due to\nthis task's complexity, large training datasets are rare in many domains,\nmaking the training of deep-learning methods challenging. This data sparsity\nnecessitates transfer learning strategies akin to the state-of-the-art in\ngeneral computer vision. In this work, we introduce a set of methods enabling\ncross-domain and cross-dimension learning for image-to-graph transformers. We\npropose (1) a regularized edge sampling loss to effectively learn object\nrelations in multiple domains with different numbers of edges, (2) a domain\nadaptation framework for image-to-graph transformers aligning image- and\ngraph-level features from different domains, and (3) a projection function that\nallows using 2D data for training 3D transformers. We demonstrate our method's\nutility in cross-domain and cross-dimension experiments, where we utilize\nlabeled data from 2D road networks for simultaneous learning in vastly\ndifferent target domains. Our method consistently outperforms standard transfer\nlearning and self-supervised pretraining on challenging benchmarks, such as\nretinal or whole-brain vessel graph extraction.\n","authors":["Alexander H. Berger","Laurin Lux","Suprosanna Shit","Ivan Ezhov","Georgios Kaissis","Martin J. Menten","Daniel Rueckert","Johannes C. 
Paetzold"],"pdf_url":"https://arxiv.org/pdf/2403.06601v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04234v1","updated":"2024-12-05T15:10:13Z","published":"2024-12-05T15:10:13Z","title":"DEIM: DETR with Improved Matching for Fast Convergence","summary":" We introduce DEIM, an innovative and efficient training framework designed to\naccelerate convergence in real-time object detection with Transformer-based\narchitectures (DETR). To mitigate the sparse supervision inherent in one-to-one\n(O2O) matching in DETR models, DEIM employs a Dense O2O matching strategy. This\napproach increases the number of positive samples per image by incorporating\nadditional targets, using standard data augmentation techniques. While Dense\nO2O matching speeds up convergence, it also introduces numerous low-quality\nmatches that could affect performance. To address this, we propose the\nMatchability-Aware Loss (MAL), a novel loss function that optimizes matches\nacross various quality levels, enhancing the effectiveness of Dense O2O.\nExtensive experiments on the COCO dataset validate the efficacy of DEIM. When\nintegrated with RT-DETR and D-FINE, it consistently boosts performance while\nreducing training time by 50%. Notably, paired with RT-DETRv2, DEIM achieves\n53.2% AP in a single day of training on an NVIDIA 4090 GPU. Additionally,\nDEIM-trained real-time models outperform leading real-time object detectors,\nwith DEIM-D-FINE-L and DEIM-D-FINE-X achieving 54.7% and 56.5% AP at 124 and 78\nFPS on an NVIDIA T4 GPU, respectively, without the need for additional data. We\nbelieve DEIM sets a new baseline for advancements in real-time object\ndetection. 
Our code and pre-trained models are available at\nhttps://github.com/ShihuaHuang95/DEIM.\n","authors":["Shihua Huang","Zhichao Lu","Xiaodong Cun","Yongjun Yu","Xiao Zhou","Xi Shen"],"pdf_url":"https://arxiv.org/pdf/2412.04234v1.pdf","comment":"Exceeding all existing real-time object detectors, including YOLOv11\n and D-FINE"},{"id":"http://arxiv.org/abs/2412.04233v1","updated":"2024-12-05T15:09:51Z","published":"2024-12-05T15:09:51Z","title":"HyperMARL: Adaptive Hypernetworks for Multi-Agent RL","summary":" Balancing individual specialisation and shared behaviours is a critical\nchallenge in multi-agent reinforcement learning (MARL). Existing methods\ntypically focus on encouraging diversity or leveraging shared representations.\nFull parameter sharing (FuPS) improves sample efficiency but struggles to learn\ndiverse behaviours when required, while no parameter sharing (NoPS) enables\ndiversity but is computationally expensive and sample inefficient. To address\nthese challenges, we introduce HyperMARL, a novel approach using hypernetworks\nto balance efficiency and specialisation. HyperMARL generates agent-specific\nactor and critic parameters, enabling agents to adaptively exhibit diverse or\nhomogeneous behaviours as needed, without modifying the learning objective or\nrequiring prior knowledge of the optimal diversity. Furthermore, HyperMARL\ndecouples agent-specific and state-based gradients, which empirically\ncorrelates with reduced policy gradient variance, potentially offering insights\ninto its ability to capture diverse behaviours. Across MARL benchmarks\nrequiring homogeneous, heterogeneous, or mixed behaviours, HyperMARL\nconsistently matches or outperforms FuPS, NoPS, and diversity-focused methods,\nachieving NoPS-level diversity with a shared architecture. 
These results\nhighlight the potential of hypernetworks as a versatile approach to the\ntrade-off between specialisation and shared behaviours in MARL.\n","authors":["Kale-ab Abebe Tessera","Arrasy Rahman","Stefano V. Albrecht"],"pdf_url":"https://arxiv.org/pdf/2412.04233v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.05357v2","updated":"2024-12-05T15:08:56Z","published":"2024-10-07T15:55:55Z","title":"Model-GLUE: Democratized LLM Scaling for A Large Model Zoo in the Wild","summary":" As Large Language Models (LLMs) excel across tasks and specialized domains,\nscaling LLMs based on existing models has garnered significant attention, which\nfaces the challenge of decreasing performance when combining disparate models.\nVarious techniques have been proposed for the aggregation of pre-trained LLMs,\nincluding model merging, Mixture-of-Experts, and stacking. Despite their\nmerits, a comprehensive comparison and synergistic application of them to a\ndiverse model zoo is yet to be adequately addressed. In light of this research\ngap, this paper introduces Model-GLUE, a holistic LLM scaling guideline. First,\nour work starts with a benchmarking of existing LLM scaling techniques,\nespecially selective merging, and variants of mixture. Utilizing the insights\nfrom the benchmark results, we formulate an optimal strategy for the selection\nand aggregation of a heterogeneous model zoo characterizing different\narchitectures and initialization.Our methodology involves the clustering of\nmergeable models and optimal merging strategy selection, and the integration of\nclusters through a model mixture. Finally, evidenced by our experiments on a\ndiverse Llama-2-based model zoo, Model-GLUE shows an average performance\nenhancement of 5.61%, achieved without additional training. 
Codes are available\nat: https://github.com/Model-GLUE/Model-GLUE.\n","authors":["Xinyu Zhao","Guoheng Sun","Ruisi Cai","Yukun Zhou","Pingzhi Li","Peihao Wang","Bowen Tan","Yexiao He","Li Chen","Yi Liang","Beidi Chen","Binhang Yuan","Hongyi Wang","Ang Li","Zhangyang Wang","Tianlong Chen"],"pdf_url":"https://arxiv.org/pdf/2410.05357v2.pdf","comment":"24 pages, 4 figures, accepted to NeurIPS 2024 Datasets and Benchmarks\n Track"},{"id":"http://arxiv.org/abs/2412.03441v2","updated":"2024-12-05T15:03:26Z","published":"2024-12-04T16:30:03Z","title":"PBP: Post-training Backdoor Purification for Malware Classifiers","summary":" In recent years, the rise of machine learning (ML) in cybersecurity has\nbrought new challenges, including the increasing threat of backdoor poisoning\nattacks on ML malware classifiers. For instance, adversaries could inject\nmalicious samples into public malware repositories, contaminating the training\ndata and potentially misclassifying malware by the ML model. Current\ncountermeasures predominantly focus on detecting poisoned samples by leveraging\ndisagreements within the outputs of a diverse set of ensemble models on\ntraining data points. However, these methods are not suitable for scenarios\nwhere Machine Learning-as-a-Service (MLaaS) is used or when users aim to remove\nbackdoors from a model after it has been trained. Addressing this scenario, we\nintroduce PBP, a post-training defense for malware classifiers that mitigates\nvarious types of backdoor embeddings without assuming any specific backdoor\nembedding mechanism. Our method exploits the influence of backdoor attacks on\nthe activation distribution of neural networks, independent of the\ntrigger-embedding method. In the presence of a backdoor attack, the activation\ndistribution of each layer is distorted into a mixture of distributions. By\nregulating the statistics of the batch normalization layers, we can guide a\nbackdoored model to perform similarly to a clean one. 
Our method demonstrates\nsubstantial advantages over several state-of-the-art methods, as evidenced by\nexperiments on two datasets, two types of backdoor methods, and various attack\nconfigurations. Notably, our approach requires only a small portion of the\ntraining data -- only 1\\% -- to purify the backdoor and reduce the attack\nsuccess rate from 100\\% to almost 0\\%, a 100-fold improvement over the baseline\nmethods. Our code is available at\n\\url{https://github.com/judydnguyen/pbp-backdoor-purification-official}.\n","authors":["Dung Thuy Nguyen","Ngoc N. Tran","Taylor T. Johnson","Kevin Leach"],"pdf_url":"https://arxiv.org/pdf/2412.03441v2.pdf","comment":"Accepted at NDSS 2025"},{"id":"http://arxiv.org/abs/2410.03960v2","updated":"2024-12-05T14:56:56Z","published":"2024-10-04T22:45:26Z","title":"SwiftKV: Fast Prefill-Optimized Inference with Knowledge-Preserving\n Model Transformation","summary":" LLM inference for popular enterprise use cases, such as summarization, RAG,\nand code-generation, typically observes orders of magnitude longer prompt\nlengths than generation lengths. This characteristic leads to high cost of\nprefill and increased response latency. In this paper, we present SwiftKV, a\nnovel model transformation and distillation procedure specifically designed to\nreduce the time and cost of processing prompt tokens while preserving high\nquality of generated tokens. SwiftKV combines three key mechanisms: i)\nSingleInputKV, which prefills later layers' KV cache using a much earlier\nlayer's output, allowing prompt tokens to skip much of the model computation,\nii) AcrossKV, which merges the KV caches of neighboring layers to reduce the\nmemory footprint and support larger batch size for higher throughput, and iii)\na knowledge-preserving distillation procedure that can adapt existing LLMs for\nSwiftKV with minimal accuracy impact and low compute and data requirement. 
For\nLlama-3.1-8B and 70B, SwiftKV reduces the compute requirement of prefill by 50%\nand the memory requirement of the KV cache by 62.5% while incurring minimum\nquality degradation across a wide range of tasks. In the end-to-end inference\nserving using an optimized vLLM implementation, SwiftKV realizes up to 2x\nhigher aggregate throughput and 60% lower time per output token. It can achieve\na staggering 560 TFlops/GPU of normalized inference throughput, which\ntranslates to 16K tokens/s for Llama-3.1-70B in 16-bit precision on 4x H100\nGPUs. Our training, inference, and model implementations are open-sourced and\ncan be found through\nhttps://huggingface.co/collections/Snowflake/swiftkv-models-674f7d7474eb789e185d31cb.\n","authors":["Aurick Qiao","Zhewei Yao","Samyam Rajbhandari","Yuxiong He"],"pdf_url":"https://arxiv.org/pdf/2410.03960v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.06740v4","updated":"2024-12-05T14:56:30Z","published":"2024-11-11T06:25:13Z","title":"Dockformer: A transformer-based molecular docking paradigm for\n large-scale virtual screening","summary":" Molecular docking is a crucial step in drug development, which enables the\nvirtual screening of compound libraries to identify potential ligands that\ntarget proteins of interest. However, the computational complexity of\ntraditional docking models increases as the size of the compound library\nincreases. Recently, deep learning algorithms can provide data-driven research\nand development models to increase the speed of the docking process.\nUnfortunately, few models can achieve superior screening performance compared\nto that of traditional models. Therefore, a novel deep learning-based docking\napproach named Dockformer is introduced in this study. Dockformer leverages\nmultimodal information to capture the geometric topology and structural\nknowledge of molecules and can directly generate binding conformations with the\ncorresponding confidence measures in an end-to-end manner. 
The experimental\nresults show that Dockformer achieves success rates of 90.53% and 82.71% on the\nPDBbind core set and PoseBusters benchmarks, respectively, and more than a\n100-fold increase in the inference process speed, outperforming almost all\nstate-of-the-art docking methods. In addition, the ability of Dockformer to\nidentify the main protease inhibitors of coronaviruses is demonstrated in a\nreal-world virtual screening scenario. Considering its high docking accuracy\nand screening efficiency, Dockformer can be regarded as a powerful and robust\ntool in the field of drug design.\n","authors":["Zhangfan Yang","Junkai Ji","Shan He","Jianqiang Li","Tiantian He","Ruibin Bai","Zexuan Zhu","Yew Soon Ong"],"pdf_url":"https://arxiv.org/pdf/2411.06740v4.pdf","comment":"15 pages, 10 figures"},{"id":"http://arxiv.org/abs/2412.04220v1","updated":"2024-12-05T14:54:31Z","published":"2024-12-05T14:54:31Z","title":"Customize Segment Anything Model for Multi-Modal Semantic Segmentation\n with Mixture of LoRA Experts","summary":" The recent Segment Anything Model (SAM) represents a significant breakthrough\nin scaling segmentation models, delivering strong performance across various\ndownstream applications in the RGB modality. However, directly applying SAM to\nemerging visual modalities, such as depth and event data results in suboptimal\nperformance in multi-modal segmentation tasks. In this paper, we make the first\nattempt to adapt SAM for multi-modal semantic segmentation by proposing a\nMixture of Low-Rank Adaptation Experts (MoE-LoRA) tailored for different input\nvisual modalities. By training only the MoE-LoRA layers while keeping SAM's\nweights frozen, SAM's strong generalization and segmentation capabilities can\nbe preserved for downstream tasks. Specifically, to address cross-modal\ninconsistencies, we propose a novel MoE routing strategy that adaptively\ngenerates weighted features across modalities, enhancing multi-modal feature\nintegration. 
Additionally, we incorporate multi-scale feature extraction and\nfusion by adapting SAM's segmentation head and introducing an auxiliary\nsegmentation head to combine multi-scale features for improved segmentation\nperformance effectively. Extensive experiments were conducted on three\nmulti-modal benchmarks: DELIVER, MUSES, and MCubeS. The results consistently\ndemonstrate that the proposed method significantly outperforms state-of-the-art\napproaches across diverse scenarios. Notably, under the particularly\nchallenging condition of missing modalities, our approach exhibits a\nsubstantial performance gain, achieving an improvement of 32.15% compared to\nexisting methods.\n","authors":["Chenyang Zhu","Bin Xiao","Lin Shi","Shoukun Xu","Xu Zheng"],"pdf_url":"https://arxiv.org/pdf/2412.04220v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.10886v2","updated":"2024-12-05T14:51:55Z","published":"2024-11-16T20:59:01Z","title":"MetricGold: Leveraging Text-To-Image Latent Diffusion Models for Metric\n Depth Estimation","summary":" Recovering metric depth from a single image remains a fundamental challenge\nin computer vision, requiring both scene understanding and accurate scaling.\nWhile deep learning has advanced monocular depth estimation, current models\noften struggle with unfamiliar scenes and layouts, particularly in zero-shot\nscenarios and when predicting scale-ergodic metric depth. We present\nMetricGold, a novel approach that harnesses generative diffusion model's rich\npriors to improve metric depth estimation. Building upon recent advances in\nMariGold, DDVM and Depth Anything V2 respectively, our method combines latent\ndiffusion, log-scaled metric depth representation, and synthetic data training.\nMetricGold achieves efficient training on a single RTX 3090 within two days\nusing photo-realistic synthetic data from HyperSIM, VirtualKitti, and\nTartanAir. 
Our experiments demonstrate robust generalization across diverse\ndatasets, producing sharper and higher quality metric depth estimates compared\nto existing approaches.\n","authors":["Ansh Shah","K Madhava Krishna"],"pdf_url":"https://arxiv.org/pdf/2411.10886v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00326v4","updated":"2024-12-05T14:45:05Z","published":"2023-12-01T03:44:54Z","title":"Agent-OM: Leveraging LLM Agents for Ontology Matching","summary":" Ontology matching (OM) enables semantic interoperability between different\nontologies and resolves their conceptual heterogeneity by aligning related\nentities. OM systems currently have two prevailing design paradigms:\nconventional knowledge-based expert systems and newer machine learning-based\npredictive systems. While large language models (LLMs) and LLM agents have\nrevolutionised data engineering and have been applied creatively in many\ndomains, their potential for OM remains underexplored. This study introduces a\nnovel agent-powered LLM-based design paradigm for OM systems. With\nconsideration of several specific challenges in leveraging LLM agents for OM,\nwe propose a generic framework, namely Agent-OM (Agent for Ontology Matching),\nconsisting of two Siamese agents for retrieval and matching, with a set of\nsimple OM tools. 
Our framework is implemented in a proof-of-concept system.\nEvaluations of three Ontology Alignment Evaluation Initiative (OAEI) tracks\nover state-of-the-art OM systems show that our system can achieve results very\nclose to the long-standing best performance on simple OM tasks and can\nsignificantly improve the performance on complex and few-shot OM tasks.\n","authors":["Zhangcheng Qiang","Weiqing Wang","Kerry Taylor"],"pdf_url":"https://arxiv.org/pdf/2312.00326v4.pdf","comment":"14 pages, 13 figures, 4 tables"},{"id":"http://arxiv.org/abs/2412.04202v1","updated":"2024-12-05T14:40:27Z","published":"2024-12-05T14:40:27Z","title":"Relationships between Keywords and Strong Beats in Lyrical Music","summary":" Artificial Intelligence (AI) song generation has emerged as a popular topic,\nyet the focus on exploring the latent correlations between specific lyrical and\nrhythmic features remains limited. In contrast, this pilot study particularly\ninvestigates the relationships between keywords and rhythmically stressed\nfeatures such as strong beats in songs. It focuses on several key elements:\nkeywords or non-keywords, stressed or unstressed syllables, and strong or weak\nbeats, with the aim of uncovering insightful correlations. Experimental results\nindicate that, on average, 80.8\\% of keywords land on strong beats, whereas\n62\\% of non-keywords fall on weak beats. The relationship between stressed\nsyllables and strong or weak beats is weak, revealing that keywords have the\nstrongest relationships with strong beats. Additionally, the lyrics-rhythm\nmatching score, a key matching metric measuring keywords on strong beats and\nnon-keywords on weak beats across various time signatures, is 0.765, while the\nmatching score for syllable types is 0.495. This study demonstrates that word\ntypes strongly align with their corresponding beat types, as evidenced by the\ndistinct patterns, whereas syllable types exhibit a much weaker alignment. 
This\ndisparity underscores the greater reliability of word types in capturing\nrhythmic structures in music, highlighting their crucial role in effective\nrhythmic matching and analysis. We also conclude that keywords that\nconsistently align with strong beats are more reliable indicators of\nlyrics-rhythm associations, providing valuable insights for AI-driven song\ngeneration through enhanced structural analysis. Furthermore, our development\nof tailored Lyrics-Rhythm Matching (LRM) metrics maximizes lyrical alignments\nwith corresponding beat stresses, and our novel LRM file format captures\ncritical lyrical and rhythmic information without needing original sheet music.\n","authors":["Callie C. Liao","Duoduo Liao","Ellie L. Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.04202v1.pdf","comment":"Accepted by IEEE BigData 2024"},{"id":"http://arxiv.org/abs/2408.15996v3","updated":"2024-12-05T14:38:12Z","published":"2024-08-28T17:59:05Z","title":"Spatio-Temporal Context Prompting for Zero-Shot Action Detection","summary":" Spatio-temporal action detection encompasses the tasks of localizing and\nclassifying individual actions within a video. Recent works aim to enhance this\nprocess by incorporating interaction modeling, which captures the relationship\nbetween people and their surrounding context. However, these approaches have\nprimarily focused on fully-supervised learning, and the current limitation lies\nin the lack of generalization capability to recognize unseen action categories.\nIn this paper, we aim to adapt the pretrained image-language models to detect\nunseen actions. To this end, we propose a method which can effectively leverage\nthe rich knowledge of visual-language models to perform Person-Context\nInteraction. Meanwhile, our Context Prompting module will utilize contextual\ninformation to prompt labels, thereby enhancing the generation of more\nrepresentative text features. 
Moreover, to address the challenge of recognizing\ndistinct actions by multiple people at the same timestamp, we design the\nInterest Token Spotting mechanism which employs pretrained visual knowledge to\nfind each person's interest context tokens, and then these tokens will be used\nfor prompting to generate text features tailored to each individual. To\nevaluate the ability to detect unseen actions, we propose a comprehensive\nbenchmark on J-HMDB, UCF101-24, and AVA datasets. The experiments show that our\nmethod achieves superior results compared to previous approaches and can be\nfurther extended to multi-action videos, bringing it closer to real-world\napplications. The code and data can be found in\nhttps://webber2933.github.io/ST-CLIP-project-page.\n","authors":["Wei-Jhe Huang","Min-Hung Chen","Shang-Hong Lai"],"pdf_url":"https://arxiv.org/pdf/2408.15996v3.pdf","comment":"Accepted by WACV2025. Project page:\n https://webber2933.github.io/ST-CLIP-project-page"},{"id":"http://arxiv.org/abs/2412.04190v1","updated":"2024-12-05T14:30:18Z","published":"2024-12-05T14:30:18Z","title":"Directed Structural Adaptation to Overcome Statistical Conflicts and\n Enable Continual Learning","summary":" Adaptive networks today rely on overparameterized fixed topologies that\ncannot break through the statistical conflicts they encounter in the data they\nare exposed to, and are prone to \"catastrophic forgetting\" as the network\nattempts to reuse the existing structures to learn new task. We propose a\nstructural adaptation method, DIRAD, that can complexify as needed and in a\ndirected manner without being limited by statistical conflicts within a\ndataset. We then extend this method and present the PREVAL framework, designed\nto prevent \"catastrophic forgetting\" in continual learning by detection of new\ndata and assigning encountered data to suitable models adapted to process them,\nwithout needing task labels anywhere in the workflow. 
We show the reliability\nof the DIRAD in growing a network with high performance and orders-of-magnitude\nsimpler than fixed topology networks; and demonstrate the proof-of-concept\noperation of PREVAL, in which continual adaptation to new tasks is observed\nwhile being able to detect and discern previously-encountered tasks.\n","authors":["Zeki Doruk Erden","Boi Faltings"],"pdf_url":"https://arxiv.org/pdf/2412.04190v1.pdf","comment":"Presented in Deployable AI (DAI) workshop at AAAI-2024"},{"id":"http://arxiv.org/abs/2412.04185v1","updated":"2024-12-05T14:24:07Z","published":"2024-12-05T14:24:07Z","title":"Leveraging Large Language Models to Generate Course-specific\n Semantically Annotated Learning Objects","summary":" Background: Over the past few decades, the process and methodology of\nautomated question generation (AQG) have undergone significant transformations.\nRecent progress in generative natural language models has opened up new\npotential in the generation of educational content.\n Objectives: This paper explores the potential of large language models (LLMs)\nfor generating computer science questions that are sufficiently annotated for\nautomatic learner model updates, are fully situated in the context of a\nparticular course, and address the cognitive dimension understand.\n Methods: Unlike previous attempts that might use basic methods like ChatGPT,\nour approach involves more targeted strategies such as retrieval-augmented\ngeneration (RAG) to produce contextually relevant and pedagogically meaningful\nlearning objects.\n Results and Conclusions: Our results show that generating structural,\nsemantic annotations works well. However, this success was not reflected in the\ncase of relational annotations. 
The quality of the generated questions often\ndid not meet educational standards, highlighting that although LLMs can\ncontribute to the pool of learning materials, their current level of\nperformance requires significant human intervention to refine and validate the\ngenerated content.\n","authors":["Dominic Lohr","Marc Berges","Abhishek Chugh","Michael Kohlhase","Dennis Müller"],"pdf_url":"https://arxiv.org/pdf/2412.04185v1.pdf","comment":"Accepted at Journal of Computer Assisted Learning (2024)"},{"id":"http://arxiv.org/abs/2411.16105v2","updated":"2024-12-05T14:16:57Z","published":"2024-11-25T05:32:34Z","title":"Adaptive Circuit Behavior and Generalization in Mechanistic\n Interpretability","summary":" Mechanistic interpretability aims to understand the inner workings of large\nneural networks by identifying circuits, or minimal subgraphs within the model\nthat implement algorithms responsible for performing specific tasks. These\ncircuits are typically discovered and analyzed using a narrowly defined prompt\nformat. However, given the abilities of large language models (LLMs) to\ngeneralize across various prompt formats for the same task, it remains unclear\nhow well these circuits generalize. For instance, it is unclear whether the\nmodels generalization results from reusing the same circuit components, the\ncomponents behaving differently, or the use of entirely different components.\nIn this paper, we investigate the generality of the indirect object\nidentification (IOI) circuit in GPT-2 small, which is well-studied and believed\nto implement a simple, interpretable algorithm. We evaluate its performance on\nprompt variants that challenge the assumptions of this algorithm. Our findings\nreveal that the circuit generalizes surprisingly well, reusing all of its\ncomponents and mechanisms while only adding additional input edges. 
Notably,\nthe circuit generalizes even to prompt variants where the original algorithm\nshould fail; we discover a mechanism that explains this which we term S2\nHacking. Our findings indicate that circuits within LLMs may be more flexible\nand general than previously recognized, underscoring the importance of studying\ncircuit generalization to better understand the broader capabilities of these\nmodels.\n","authors":["Jatin Nainani","Sankaran Vaidyanathan","AJ Yeung","Kartik Gupta","David Jensen"],"pdf_url":"https://arxiv.org/pdf/2411.16105v2.pdf","comment":"10 pages, 8 figures"},{"id":"http://arxiv.org/abs/2412.04167v1","updated":"2024-12-05T14:03:41Z","published":"2024-12-05T14:03:41Z","title":"Bench-CoE: a Framework for Collaboration of Experts from Benchmark","summary":" Large Language Models (LLMs) are key technologies driving intelligent systems\nto handle multiple tasks. To meet the demands of various tasks, an increasing\nnumber of LLMs-driven experts with diverse capabilities have been developed,\naccompanied by corresponding benchmarks to evaluate their performance. This\npaper proposes the Bench-CoE framework, which enables Collaboration of Experts\n(CoE) by effectively leveraging benchmark evaluations to achieve optimal\nperformance across various tasks. Bench-CoE includes a set of expert models, a\nrouter for assigning tasks to corresponding experts, and a benchmark dataset\nfor training the router. Moreover, we formulate Query-Level and Subject-Level\napproaches based on our framework, and analyze the merits and drawbacks of\nthese two approaches. Finally, we conduct a series of experiments with vary\ndata distributions on both language and multimodal tasks to validate that our\nproposed Bench-CoE outperforms any single model in terms of overall\nperformance. We hope this method serves as a baseline for further research in\nthis area. 
The code is available at\n\\url{https://github.com/ZhangXJ199/Bench-CoE}.\n","authors":["Yuanshuai Wang","Xingjian Zhang","Jinkun Zhao","Siwei Wen","Peilin Feng","Shuhao Liao","Lei Huang","Wenjun Wu"],"pdf_url":"https://arxiv.org/pdf/2412.04167v1.pdf","comment":"The code is available at\n \\url{https://github.com/ZhangXJ199/Bench-CoE}"},{"id":"http://arxiv.org/abs/2109.06181v2","updated":"2024-12-05T13:50:59Z","published":"2021-09-13T16:06:10Z","title":"When Stability meets Sufficiency: Informative Explanations that do not\n Overwhelm","summary":" Recent studies evaluating various criteria for explainable artificial\nintelligence (XAI) suggest that fidelity, stability, and comprehensibility are\namong the most important metrics considered by users of AI across a diverse\ncollection of usage contexts. We consider these criteria as applied to\nfeature-based attribution methods, which are amongst the most prevalent in XAI\nliterature. Going beyond standard correlation, methods have been proposed that\nhighlight what should be minimally sufficient to justify the classification of\nan input (viz. pertinent positives). While minimal sufficiency is an attractive\nproperty akin to comprehensibility, the resulting explanations are often too\nsparse for a human to understand and evaluate the local behavior of the model.\nTo overcome these limitations, we incorporate the criteria of stability and\nfidelity and propose a novel method called Path-Sufficient Explanations Method\n(PSEM) that outputs a sequence of stable and sufficient explanations for a\ngiven input of strictly decreasing size (or value) -- from original input to a\nminimally sufficient explanation -- which can be thought to trace the local\nboundary of the model in a stable manner, thus providing better intuition about\nthe local model behavior for the specific input. 
We validate these claims, both\nqualitatively and quantitatively, with experiments that show the benefit of\nPSEM across three modalities (image, tabular and text) as well as versus other\npath explanations. A user study depicts the strength of the method in\ncommunicating the local behavior, where (many) users are able to correctly\ndetermine the prediction made by a model.\n","authors":["Ronny Luss","Amit Dhurandhar"],"pdf_url":"https://arxiv.org/pdf/2109.06181v2.pdf","comment":"Published at TMLR"},{"id":"http://arxiv.org/abs/2411.11706v2","updated":"2024-12-05T13:27:22Z","published":"2024-11-18T16:33:52Z","title":"MC-LLaVA: Multi-Concept Personalized Vision-Language Model","summary":" Current vision-language models (VLMs) show exceptional abilities across\ndiverse tasks including visual question answering. To enhance user experience\nin practical applications, recent studies investigate VLM personalization to\nunderstand user-provided concepts. However, existing studies mainly focus on\nsingle-concept personalization, neglecting the existence and interplay of\nmultiple concepts, which limits the real-world applicability of personalized\nVLMs. In this paper, we propose the first multi-concept personalization method\nnamed MC-LLaVA along with a high-quality multi-concept personalization dataset.\nSpecifically, MC-LLaVA uses a joint training strategy incorporating multiple\nconcepts in a single training step, allowing VLMs to perform accurately in\nmulti-concept personalization. To reduce the cost of joint training, MC-LLaVA\nleverages visual token information for concept token initialization, yielding\nimproved concept representation and accelerating joint training. To advance\nmulti-concept personalization research, we further contribute a high-quality\ndataset. We carefully collect images from various movies that contain multiple\ncharacters and manually generate the multi-concept question-answer samples. 
Our\ndataset features diverse movie types and question-answer types. We conduct\ncomprehensive qualitative and quantitative experiments to demonstrate that\nMC-LLaVA can achieve impressive multi-concept personalized responses, paving\nthe way for VLMs to become better user-specific assistants. The code and\ndataset will be publicly available at https://github.com/arctanxarc/MC-LLaVA.\n","authors":["Ruichuan An","Sihan Yang","Ming Lu","Kai Zeng","Yulin Luo","Ying Chen","Jiajun Cao","Hao Liang","Qi She","Shanghang Zhang","Wentao Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.11706v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04149v1","updated":"2024-12-05T13:23:06Z","published":"2024-12-05T13:23:06Z","title":"Frequency-Adaptive Low-Latency Object Detection Using Events and Frames","summary":" Fusing Events and RGB images for object detection leverages the robustness of\nEvent cameras in adverse environments and the rich semantic information\nprovided by RGB cameras. However, two critical mismatches: low-latency Events\n\\textit{vs.}~high-latency RGB frames; temporally sparse labels in training\n\\textit{vs.}~continuous flow in inference, significantly hinder the\nhigh-frequency fusion-based object detection. To address these challenges, we\npropose the \\textbf{F}requency-\\textbf{A}daptive Low-Latency \\textbf{O}bject\n\\textbf{D}etector (FAOD). FAOD aligns low-frequency RGB frames with\nhigh-frequency Events through an Align Module, which reinforces cross-modal\nstyle and spatial proximity to address the Event-RGB Mismatch. We further\npropose a training strategy, Time Shift, which enforces the module to align the\nprediction from temporally shifted Event-RGB pairs and their original\nrepresentation, that is, consistent with Event-aligned annotations. 
This\nstrategy enables the network to use high-frequency Event data as the primary\nreference while treating low-frequency RGB images as supplementary information,\nretaining the low-latency nature of the Event stream toward high-frequency\ndetection. Furthermore, we observe that these corrected Event-RGB pairs\ndemonstrate better generalization from low training frequency to higher\ninference frequencies compared to using Event data alone. Extensive experiments\non the PKU-DAVIS-SOD and DSEC-Detection datasets demonstrate that our FAOD\nachieves SOTA performance. Specifically, in the PKU-DAVIS-SOD Dataset, FAOD\nachieves 9.8 points improvement in terms of the mAP in fully paired Event-RGB\ndata with only a quarter of the parameters compared to SODFormer, and even\nmaintains robust performance (only a 3 points drop in mAP) under 80$\\times$\nEvent-RGB frequency mismatch.\n","authors":["Haitian Zhang","Xiangyuan Wang","Chang Xu","Xinya Wang","Fang Xu","Huai Yu","Lei Yu","Wen Yang"],"pdf_url":"https://arxiv.org/pdf/2412.04149v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03417v2","updated":"2024-12-05T13:22:28Z","published":"2024-12-04T15:53:45Z","title":"Learning Semantic Association Rules from Internet of Things Data","summary":" Association Rule Mining (ARM) is the task of discovering commonalities in\ndata in the form of logical implications. ARM is used in the Internet of Things\n(IoT) for different tasks including monitoring and decision-making. However,\nexisting methods give limited consideration to IoT-specific requirements such\nas heterogeneity and volume. Furthermore, they do not utilize important static\ndomain-specific description data about IoT systems, which is increasingly\nrepresented as knowledge graphs. In this paper, we propose a novel ARM pipeline\nfor IoT data that utilizes both dynamic sensor data and static IoT system\nmetadata. 
Furthermore, we propose an Autoencoder-based Neurosymbolic ARM method\n(Aerial) as part of the pipeline to address the high volume of IoT data and\nreduce the total number of rules that are resource-intensive to process. Aerial\nlearns a neural representation of a given data and extracts association rules\nfrom this representation by exploiting the reconstruction (decoding) mechanism\nof an autoencoder. Extensive evaluations on 3 IoT datasets from 2 domains show\nthat ARM on both static and dynamic IoT data results in more generically\napplicable rules while Aerial can learn a more concise set of high-quality\nassociation rules than the state-of-the-art with full coverage over the\ndatasets.\n","authors":["Erkan Karabulut","Paul Groth","Victoria Degeler"],"pdf_url":"https://arxiv.org/pdf/2412.03417v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.14123v2","updated":"2024-12-05T13:15:34Z","published":"2024-02-21T20:43:49Z","title":"DeiSAM: Segment Anything with Deictic Prompting","summary":" Large-scale, pre-trained neural networks have demonstrated strong\ncapabilities in various tasks, including zero-shot image segmentation. To\nidentify concrete objects in complex scenes, humans instinctively rely on\ndeictic descriptions in natural language, i.e., referring to something\ndepending on the context such as \"The object that is on the desk and behind the\ncup.\". However, deep learning approaches cannot reliably interpret such deictic\nrepresentations due to their lack of reasoning capabilities in complex\nscenarios. To remedy this issue, we propose DeiSAM -- a combination of large\npre-trained neural networks with differentiable logic reasoners -- for deictic\npromptable segmentation. 
Given a complex, textual segmentation description,\nDeiSAM leverages Large Language Models (LLMs) to generate first-order logic\nrules and performs differentiable forward reasoning on generated scene graphs.\nSubsequently, DeiSAM segments objects by matching them to the logically\ninferred image regions. As part of our evaluation, we propose the Deictic\nVisual Genome (DeiVG) dataset, containing paired visual input and complex,\ndeictic textual prompts. Our empirical results demonstrate that DeiSAM is a\nsubstantial improvement over purely data-driven baselines for deictic\npromptable segmentation.\n","authors":["Hikaru Shindo","Manuel Brack","Gopika Sudhakaran","Devendra Singh Dhami","Patrick Schramowski","Kristian Kersting"],"pdf_url":"https://arxiv.org/pdf/2402.14123v2.pdf","comment":"Published as a conference paper at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2412.04144v1","updated":"2024-12-05T13:12:51Z","published":"2024-12-05T13:12:51Z","title":"If You Can't Use Them, Recycle Them: Optimizing Merging at Scale\n Mitigates Performance Tradeoffs","summary":" Model merging has shown great promise at combining expert models, but the\nbenefit of merging is unclear when merging ``generalist'' models trained on\nmany tasks. We explore merging in the context of large ($\\sim100$B) models, by\n\\textit{recycling} checkpoints that exhibit tradeoffs among different tasks.\nSuch checkpoints are often created in the process of developing a frontier\nmodel, and many suboptimal ones are usually discarded. Given a pool of model\ncheckpoints obtained from different training runs (e.g., different stages,\nobjectives, hyperparameters, and data mixtures), which naturally show tradeoffs\nacross different language capabilities (e.g., instruction following vs. code\ngeneration), we investigate whether merging can recycle such suboptimal models\ninto a Pareto-optimal one. 
Our optimization algorithm tunes the weight of each\ncheckpoint in a linear combination, resulting in a Pareto-optimal models that\noutperforms both individual models and merge-based baselines. Further analysis\nshows that good merges tend to include almost all checkpoints with with\nnon-zero weights, indicating that even seemingly bad initial checkpoints can\ncontribute to good final merges.\n","authors":["Muhammad Khalifa","Yi-Chern Tan","Arash Ahmadian","Tom Hosking","Honglak Lee","Lu Wang","Ahmet Üstün","Tom Sherborne","Matthias Gallé"],"pdf_url":"https://arxiv.org/pdf/2412.04144v1.pdf","comment":"13 pages, 9 figures"},{"id":"http://arxiv.org/abs/2412.04142v1","updated":"2024-12-05T13:11:04Z","published":"2024-12-05T13:11:04Z","title":"Methodology for Online Estimation of Rheological Parameters in Polymer\n Melts Using Deep Learning and Microfluidics","summary":" Microfluidic devices are increasingly used in biological and chemical\nexperiments due to their cost-effectiveness for rheological estimation in\nfluids. However, these devices often face challenges in terms of accuracy,\nsize, and cost. This study presents a methodology, integrating deep learning,\nmodeling and simulation to enhance the design of microfluidic systems, used to\ndevelop an innovative approach for viscosity measurement of polymer melts. We\nuse synthetic data generated from the simulations to train a deep learning\nmodel, which then identifies rheological parameters of polymer melts from\npressure drop and flow rate measurements in a microfluidic circuit, enabling\nonline estimation of fluid properties. By improving the accuracy and\nflexibility of microfluidic rheological estimation, our methodology accelerates\nthe design and testing of microfluidic devices, reducing reliance on physical\nprototypes, and offering significant contributions to the field.\n","authors":["Juan Sandubete-López","José L. Risco-Martín","Alexander H. 
McMillan","Eva Besada-Portas"],"pdf_url":"https://arxiv.org/pdf/2412.04142v1.pdf","comment":"12 pages, 6 figures, Winter Simulation Conference 2024"},{"id":"http://arxiv.org/abs/2412.04140v1","updated":"2024-12-05T13:07:24Z","published":"2024-12-05T13:07:24Z","title":"Understanding Memorization in Generative Models via Sharpness in\n Probability Landscapes","summary":" In this paper, we introduce a geometric framework to analyze memorization in\ndiffusion models using the eigenvalues of the Hessian of the log probability\ndensity. We propose that memorization arises from isolated points in the\nlearned probability distribution, characterized by sharpness in the probability\nlandscape, as indicated by large negative eigenvalues of the Hessian. Through\nexperiments on various datasets, we demonstrate that these eigenvalues\neffectively detect and quantify memorization. Our approach provides a clear\nunderstanding of memorization in diffusion models and lays the groundwork for\ndeveloping strategies to ensure secure and reliable generative models\n","authors":["Dongjae Jeon","Dueun Kim","Albert No"],"pdf_url":"https://arxiv.org/pdf/2412.04140v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04139v1","updated":"2024-12-05T13:06:03Z","published":"2024-12-05T13:06:03Z","title":"Monet: Mixture of Monosemantic Experts for Transformers","summary":" Understanding the internal computations of large language models (LLMs) is\ncrucial for aligning them with human values and preventing undesirable\nbehaviors like toxic content generation. However, mechanistic interpretability\nis hindered by polysemanticity -- where individual neurons respond to multiple,\nunrelated concepts. While Sparse Autoencoders (SAEs) have attempted to\ndisentangle these features through sparse dictionary learning, they have\ncompromised LLM performance due to reliance on post-hoc reconstruction loss. 
To\naddress this issue, we introduce Mixture of Monosemantic Experts for\nTransformers (Monet) architecture, which incorporates sparse dictionary\nlearning directly into end-to-end Mixture-of-Experts pretraining. Our novel\nexpert decomposition method enables scaling the expert count to 262,144 per\nlayer while total parameters scale proportionally to the square root of the\nnumber of experts. Our analyses demonstrate mutual exclusivity of knowledge\nacross experts and showcase the parametric knowledge encapsulated within\nindividual experts. Moreover, Monet allows knowledge manipulation over domains,\nlanguages, and toxicity mitigation without degrading general performance. Our\npursuit of transparent LLMs highlights the potential of scaling expert counts\nto enhance} mechanistic interpretability and directly resect the internal\nknowledge to fundamentally adjust} model behavior. The source code and\npretrained checkpoints are available at https://github.com/dmis-lab/Monet.\n","authors":["Jungwoo Park","Young Jin Ahn","Kee-Eung Kim","Jaewoo Kang"],"pdf_url":"https://arxiv.org/pdf/2412.04139v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04137v1","updated":"2024-12-05T13:04:10Z","published":"2024-12-05T13:04:10Z","title":"Text Change Detection in Multilingual Documents Using Image Comparison","summary":" Document comparison typically relies on optical character recognition (OCR)\nas its core technology. However, OCR requires the selection of appropriate\nlanguage models for each document and the performance of multilingual or hybrid\nmodels remains limited. To overcome these challenges, we propose text change\ndetection (TCD) using an image comparison model tailored for multilingual\ndocuments. Unlike OCR-based approaches, our method employs word-level text\nimage-to-image comparison to detect changes. Our model generates bidirectional\nchange segmentation maps between the source and target documents. 
To enhance\nperformance without requiring explicit text alignment or scaling preprocessing,\nwe employ correlations among multi-scale attention features. We also construct\na benchmark dataset comprising actual printed and scanned word pairs in various\nlanguages to evaluate our model. We validate our approach using our benchmark\ndataset and public benchmarks Distorted Document Images and the LRDE Document\nBinarization Dataset. We compare our model against state-of-the-art semantic\nsegmentation and change detection models, as well as to conventional OCR-based\nmodels.\n","authors":["Doyoung Park","Naresh Reddy Yarram","Sunjin Kim","Minkyu Kim","Seongho Cho","Taehee Lee"],"pdf_url":"https://arxiv.org/pdf/2412.04137v1.pdf","comment":"15pages, 11figures 6tables, wacv2025 accepted"},{"id":"http://arxiv.org/abs/2411.02785v2","updated":"2024-12-05T12:58:44Z","published":"2024-11-05T03:51:13Z","title":"Stochastic Monkeys at Play: Random Augmentations Cheaply Break LLM\n Safety Alignment","summary":" Safety alignment of Large Language Models (LLMs) has recently become a\ncritical objective of model developers. In response, a growing body of work has\nbeen investigating how safety alignment can be bypassed through various\njailbreaking methods, such as adversarial attacks. However, these jailbreak\nmethods can be rather costly or involve a non-trivial amount of creativity and\neffort, introducing the assumption that malicious users are high-resource or\nsophisticated. In this paper, we study how simple random augmentations to the\ninput prompt affect safety alignment effectiveness in state-of-the-art LLMs,\nsuch as Llama 3 and Qwen 2. We perform an in-depth evaluation of 17 different\nmodels and investigate the intersection of safety under random augmentations\nwith multiple dimensions: augmentation type, model size, quantization,\nfine-tuning-based defenses, and decoding strategies (e.g., sampling\ntemperature). 
We show that low-resource and unsophisticated attackers, i.e.\n$\\textit{stochastic monkeys}$, can significantly improve their chances of\nbypassing alignment with just 25 random augmentations per prompt. Source code\nand data: https://github.com/uiuc-focal-lab/stochastic-monkeys/\n","authors":["Jason Vega","Junsheng Huang","Gaokai Zhang","Hangoo Kang","Minjia Zhang","Gagandeep Singh"],"pdf_url":"https://arxiv.org/pdf/2411.02785v2.pdf","comment":"v2: Updated with changes from peer review rebuttal. v1: Version under\n peer review"},{"id":"http://arxiv.org/abs/2411.03906v2","updated":"2024-12-05T12:56:40Z","published":"2024-11-06T13:37:28Z","title":"Lexicalization Is All You Need: Examining the Impact of Lexical\n Knowledge in a Compositional QALD System","summary":" In this paper, we examine the impact of lexicalization on Question Answering\nover Linked Data (QALD). It is well known that one of the key challenges in\ninterpreting natural language questions with respect to SPARQL lies in bridging\nthe lexical gap, that is mapping the words in the query to the correct\nvocabulary elements. We argue in this paper that lexicalization, that is\nexplicit knowledge about the potential interpretations of a word with respect\nto the given vocabulary, significantly eases the task and increases the\nperformance of QA systems. Towards this goal, we present a compositional QA\nsystem that can leverage explicit lexical knowledge in a compositional manner\nto infer the meaning of a question in terms of a SPARQL query. We show that\nsuch a system, given lexical knowledge, has a performance well beyond current\nQA systems, achieving up to a $35.8\\%$ increase in the micro $F_1$ score\ncompared to the best QA system on QALD-9. This shows the importance and\npotential of including explicit lexical knowledge. In contrast, we show that\nLLMs have limited abilities to exploit lexical knowledge, with only marginal\nimprovements compared to a version without lexical knowledge. 
This shows that\nLLMs have no ability to compositionally interpret a question on the basis of\nthe meaning of its parts, a key feature of compositional approaches. Taken\ntogether, our work shows new avenues for QALD research, emphasizing the\nimportance of lexicalization and compositionality.\n","authors":["David Maria Schmidt","Mohammad Fazleh Elahi","Philipp Cimiano"],"pdf_url":"https://arxiv.org/pdf/2411.03906v2.pdf","comment":"24th International Conference on Knowledge Engineering and Knowledge\n Management (EKAW 2024), November 26-28, 2024, Amsterdam, The Netherlands"},{"id":"http://arxiv.org/abs/2412.04121v1","updated":"2024-12-05T12:46:18Z","published":"2024-12-05T12:46:18Z","title":"DeepFEA: Deep Learning for Prediction of Transient Finite Element\n Analysis Solutions","summary":" Finite Element Analysis (FEA) is a powerful but computationally intensive\nmethod for simulating physical phenomena. Recent advancements in machine\nlearning have led to surrogate models capable of accelerating FEA. Yet there\nare still limitations in developing surrogates of transient FEA models that can\nsimultaneously predict the solutions for both nodes and elements with\napplicability on both the 2D and 3D domains. Motivated by this research gap,\nthis study proposes DeepFEA, a deep learning-based framework that leverages a\nmultilayer Convolutional Long Short-Term Memory (ConvLSTM) network branching\ninto two parallel convolutional neural networks to predict the solutions for\nboth nodes and elements of FEA models. The proposed network is optimized using\na novel adaptive learning algorithm, called Node-Element Loss Optimization\n(NELO). NELO minimizes the error occurring at both branches of the network\nenabling the prediction of solutions for transient FEA simulations. The\nexperimental evaluation of DeepFEA is performed on three datasets in the\ncontext of structural mechanics, generated to serve as publicly available\nreference datasets. 
The results show that DeepFEA can achieve less than 3%\nnormalized mean and root mean squared error for 2D and 3D simulation scenarios,\nand inference times that are two orders of magnitude faster than FEA. In\ncontrast, relevant state-of-the-art methods face challenges with\nmulti-dimensional output and dynamic input prediction. Furthermore, DeepFEA's\nrobustness was demonstrated in a real-life biomedical scenario, confirming its\nsuitability for accurate and efficient predictions of FEA simulations.\n","authors":["Georgios Triantafyllou","Panagiotis G. Kalozoumis","George Dimas","Dimitris K. Iakovidis"],"pdf_url":"https://arxiv.org/pdf/2412.04121v1.pdf","comment":"This work has been submitted to a journal for possible publication"},{"id":"http://arxiv.org/abs/2410.08020v2","updated":"2024-12-05T12:40:16Z","published":"2024-10-10T15:17:49Z","title":"Efficiently Learning at Test-Time: Active Fine-Tuning of LLMs","summary":" Recent efforts in fine-tuning language models often rely on automatic data\nselection, commonly using Nearest Neighbors retrieval from large datasets.\nHowever, we theoretically show that this approach tends to select redundant\ndata, limiting its effectiveness or even hurting performance. To address this,\nwe introduce SIFT, a data selection algorithm designed to reduce uncertainty\nabout the model's response given a prompt, which unifies ideas from retrieval\nand active learning. Whereas Nearest Neighbor retrieval typically fails in the\npresence of information duplication, SIFT accounts for information duplication\nand optimizes the overall information gain of the selected examples. We focus\nour evaluations on fine-tuning at test-time for prompt-specific language\nmodeling on the Pile dataset, and show that SIFT consistently outperforms\nNearest Neighbor retrieval, with minimal computational overhead. 
Moreover, we\nshow that our uncertainty estimates can predict the performance gain of\ntest-time fine-tuning, and use this to develop an adaptive algorithm that\ninvests test-time compute proportional to realized performance gains. We\nprovide the $\\texttt{activeft}$ (Active Fine-Tuning) library which can be used\nas a drop-in replacement for Nearest Neighbor retrieval.\n","authors":["Jonas Hübotter","Sascha Bongni","Ido Hakimi","Andreas Krause"],"pdf_url":"https://arxiv.org/pdf/2410.08020v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02865v2","updated":"2024-12-05T12:38:58Z","published":"2024-12-03T22:00:12Z","title":"Memory-efficient Continual Learning with Neural Collapse Contrastive","summary":" Contrastive learning has significantly improved representation quality,\nenhancing knowledge transfer across tasks in continual learning (CL). However,\ncatastrophic forgetting remains a key challenge, as contrastive based methods\nprimarily focus on \"soft relationships\" or \"softness\" between samples, which\nshift with changing data distributions and lead to representation overlap\nacross tasks. Recently, the newly identified Neural Collapse phenomenon has\nshown promise in CL by focusing on \"hard relationships\" or \"hardness\" between\nsamples and fixed prototypes. However, this approach overlooks \"softness\",\ncrucial for capturing intra-class variability, and this rigid focus can also\npull old class representations toward current ones, increasing forgetting.\nBuilding on these insights, we propose Focal Neural Collapse Contrastive\n(FNC2), a novel representation learning loss that effectively balances both\nsoft and hard relationships. Additionally, we introduce the Hardness-Softness\nDistillation (HSD) loss to progressively preserve the knowledge gained from\nthese relationships across tasks. Our method outperforms state-of-the-art\napproaches, particularly in minimizing memory reliance. 
Remarkably, even\nwithout the use of memory, our approach rivals rehearsal-based methods,\noffering a compelling solution for data privacy concerns.\n","authors":["Trung-Anh Dang","Vincent Nguyen","Ngoc-Son Vu","Christel Vrain"],"pdf_url":"https://arxiv.org/pdf/2412.02865v2.pdf","comment":"Accepted at WACV 2025"},{"id":"http://arxiv.org/abs/2412.04114v1","updated":"2024-12-05T12:32:45Z","published":"2024-12-05T12:32:45Z","title":"Thermal and RGB Images Work Better Together in Wind Turbine Damage\n Detection","summary":" The inspection of wind turbine blades (WTBs) is crucial for ensuring their\nstructural integrity and operational efficiency. Traditional inspection methods\ncan be dangerous and inefficient, prompting the use of unmanned aerial vehicles\n(UAVs) that access hard-to-reach areas and capture high-resolution imagery. In\nthis study, we address the challenge of enhancing defect detection on WTBs by\nintegrating thermal and RGB images obtained from UAVs. We propose a\nmultispectral image composition method that combines thermal and RGB imagery\nthrough spatial coordinate transformation, key point detection, binary\ndescriptor creation, and weighted image overlay. Using a benchmark dataset of\nWTB images annotated for defects, we evaluated several state-of-the-art object\ndetection models. Our results show that composite images significantly improve\ndefect detection efficiency. Specifically, the YOLOv8 model's accuracy\nincreased from 91% to 95%, precision from 89% to 94%, recall from 85% to 92%,\nand F1-score from 87% to 93%. The number of false positives decreased from 6 to\n3, and missed defects reduced from 5 to 2. 
These findings demonstrate that\nintegrating thermal and RGB imagery enhances defect detection on WTBs,\ncontributing to improved maintenance and reliability.\n","authors":["Serhii Svystun","Oleksandr Melnychenko","Pavlo Radiuk","Oleg Savenko","Anatoliy Sachenko","Andrii Lysyi"],"pdf_url":"https://arxiv.org/pdf/2412.04114v1.pdf","comment":"Unmanned aerial vehicle, image composition, multispectral images,\n green energy, data quality management, weighted overlay"},{"id":"http://arxiv.org/abs/2412.04110v1","updated":"2024-12-05T12:24:54Z","published":"2024-12-05T12:24:54Z","title":"Enhancing Mathematical Reasoning in LLMs with Background Operators","summary":" We propose utilizing background operators for mathematical reasoning in large\nlanguage models (LLMs). To achieve this, we define a set of fundamental\nmathematical predicates as the basic building blocks. For each mathematical\nproblem, we develop a Prolog solution that includes problem-specific predicates\nand intermediate predicates derived from these background operators, ensuring\nthat each solution adheres to the defined operator set. We introduce the\nMATH-Prolog corpus, which is derived from the counting and probability\ncategories of the MATH corpus. For efficient data augmentation, we apply K-fold\ncross-validated self-training. This method incrementally generates new Prolog\nsolutions for each fold, incorporating those verified as correct into the\ntraining set throughout the model training process. Our experimental results\ndemonstrate that 5-fold crossvalidated self-training effectively identifies\nnew, accurate Prolog solutions, achieving an accuracy of 84.6% on the\ncross-validated set, and 84.8% on the test set during fine-tuning the\nMeta-Llama-3.1-8B-Instruct model. 
This approach successfully uncovers new\nsolutions with fully computable inference steps for previously unseen problems.\nAdditionally, incorporating the background mathematical predicates into the\nprompt enhances solution coverage.\n","authors":["Jiajun Chen","Yik-Cheung Tam"],"pdf_url":"https://arxiv.org/pdf/2412.04110v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04107v1","updated":"2024-12-05T12:17:56Z","published":"2024-12-05T12:17:56Z","title":"Pre-train, Align, and Disentangle: Empowering Sequential Recommendation\n with Large Language Models","summary":" Sequential recommendation (SR) aims to model the sequential dependencies in\nusers' historical interactions to better capture their evolving interests.\nHowever, existing SR approaches primarily rely on collaborative data, which\nleads to limitations such as the cold-start problem and sub-optimal\nperformance. Meanwhile, despite the success of large language models (LLMs),\ntheir application in industrial recommender systems is hindered by high\ninference latency, inability to capture all distribution statistics, and\ncatastrophic forgetting. To this end, we propose a novel Pre-train, Align, and\nDisentangle (PAD) paradigm to empower recommendation models with LLMs.\nSpecifically, we first pre-train both the SR and LLM models to get\ncollaborative and textual embeddings. Next, a characteristic\nrecommendation-anchored alignment loss is proposed using multi-kernel maximum\nmean discrepancy with Gaussian kernels. Finally, a triple-experts architecture,\nconsisting aligned and modality-specific experts with disentangled embeddings,\nis fine-tuned in a frequency-aware manner. Experiments conducted on three\npublic datasets demonstrate the effectiveness of PAD, showing significant\nimprovements and compatibility with various SR backbone models, especially on\ncold items. 
The implementation code and datasets will be publicly available.\n","authors":["Yuhao Wang","Junwei Pan","Xiangyu Zhao","Pengyue Jia","Wanyu Wang","Yuan Wang","Yue Liu","Dapeng Liu","Jie Jiang"],"pdf_url":"https://arxiv.org/pdf/2412.04107v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04100v1","updated":"2024-12-05T12:10:42Z","published":"2024-12-05T12:10:42Z","title":"Missing Melodies: AI Music Generation and its \"Nearly\" Complete Omission\n of the Global South","summary":" Recent advances in generative AI have sparked renewed interest and expanded\npossibilities for music generation. However, the performance and versatility of\nthese systems across musical genres are heavily influenced by the availability\nof training data. We conducted an extensive analysis of over one million hours\nof audio datasets used in AI music generation research and manually reviewed\nmore than 200 papers from eleven prominent AI and music conferences and\norganizations (AAAI, ACM, EUSIPCO, EURASIP, ICASSP, ICML, IJCAI, ISMIR,\nNeurIPS, NIME, SMC) to identify a critical gap in the fair representation and\ninclusion of the musical genres of the Global South in AI research. Our\nfindings reveal a stark imbalance: approximately 86% of the total dataset hours\nand over 93% of researchers focus primarily on music from the Global North.\nHowever, around 40% of these datasets include some form of non-Western music,\ngenres from the Global South account for only 14.6% of the data. Furthermore,\napproximately 51% of the papers surveyed concentrate on symbolic music\ngeneration, a method that often fails to capture the cultural nuances inherent\nin music from regions such as South Asia, the Middle East, and Africa. As AI\nincreasingly shapes the creation and dissemination of music, the significant\nunderrepresentation of music genres in datasets and research presents a serious\nthreat to global musical diversity. 
We also propose some important steps to\nmitigate these risks and foster a more inclusive future for AI-driven music\ngeneration.\n","authors":["Atharva Mehta","Shivam Chauhan","Monojit Choudhury"],"pdf_url":"https://arxiv.org/pdf/2412.04100v1.pdf","comment":"Submitted to CACM, 12 pages, 2 figures"},{"id":"http://arxiv.org/abs/2412.04097v1","updated":"2024-12-05T12:03:02Z","published":"2024-12-05T12:03:02Z","title":"D-LORD for Motion Stylization","summary":" This paper introduces a novel framework named D-LORD (Double Latent\nOptimization for Representation Disentanglement), which is designed for motion\nstylization (motion style transfer and motion retargeting). The primary\nobjective of this framework is to separate the class and content information\nfrom a given motion sequence using a data-driven latent optimization approach.\nHere, class refers to person-specific style, such as a particular emotion or an\nindividual's identity, while content relates to the style-agnostic aspect of an\naction, such as walking or jumping, as universally understood concepts. The key\nadvantage of D-LORD is its ability to perform style transfer without needing\npaired motion data. Instead, it utilizes class and content labels during the\nlatent optimization process. By disentangling the representation, the framework\nenables the transformation of one motion sequences style to another's style\nusing Adaptive Instance Normalization. The proposed D-LORD framework is\ndesigned with a focus on generalization, allowing it to handle different class\nand content labels for various applications. Additionally, it can generate\ndiverse motion sequences when specific class and content labels are provided.\nThe framework's efficacy is demonstrated through experimentation on three\ndatasets: the CMU XIA dataset for motion style transfer, the MHAD dataset, and\nthe RRIS Ability dataset for motion retargeting. 
Notably, this paper presents\nthe first generalized framework for motion style transfer and motion\nretargeting, showcasing its potential contributions in this area.\n","authors":["Meenakshi Gupta","Mingyuan Lei","Tat-Jen Cham","Hwee Kuan Lee"],"pdf_url":"https://arxiv.org/pdf/2412.04097v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.18001v2","updated":"2024-12-05T11:58:07Z","published":"2024-10-23T16:24:23Z","title":"Benchmarking Foundation Models on Exceptional Cases: Dataset Creation\n and Validation","summary":" Foundation models (FMs) have achieved significant success across various\ntasks, leading to research on benchmarks for reasoning abilities. However,\nthere is a lack of studies on FMs performance in exceptional scenarios, which\nwe define as out-of-distribution (OOD) reasoning tasks. This paper is the first\nto address these cases, developing a novel dataset for evaluation of FMs across\nmultiple modalities, including graphic novels, calligraphy, news articles, and\nlyrics. It includes tasks for instance classification, character recognition,\ntoken prediction, and text generation. The paper also proposes prompt\nengineering techniques like Chain-of-Thought (CoT) and CoT+Few-Shot to enhance\nperformance. Validation of FMs using various methods revealed improvements. The\ncode repository is accessible at:\nhttps://github.com/MLAI-Yonsei/ExceptionalBenchmark\n","authors":["Suho Kang","Jungyang Park","Joonseo Ha","SoMin Kim","JinHyeong Kim","Subeen Park","Kyungwoo Song"],"pdf_url":"https://arxiv.org/pdf/2410.18001v2.pdf","comment":"EMNLP 2024 Workshop\n Genbench(https://genbench.org/workshop_programme/)"},{"id":"http://arxiv.org/abs/2412.04093v1","updated":"2024-12-05T11:57:49Z","published":"2024-12-05T11:57:49Z","title":"Practical Considerations for Agentic LLM Systems","summary":" As the strength of Large Language Models (LLMs) has grown over recent years,\nso too has interest in their use as the underlying models for autonomous\nagents. 
Although LLMs demonstrate emergent abilities and broad expertise across\nnatural language domains, their inherent unpredictability makes the\nimplementation of LLM agents challenging, resulting in a gap between related\nresearch and the real-world implementation of such systems. To bridge this gap,\nthis paper frames actionable insights and considerations from the research\ncommunity in the context of established application paradigms to enable the\nconstruction and facilitate the informed deployment of robust LLM agents.\nNamely, we position relevant research findings into four broad\ncategories--Planning, Memory, Tools, and Control Flow--based on common\npractices in application-focused literature and highlight practical\nconsiderations to make when designing agentic LLMs for real-world applications,\nsuch as handling stochasticity and managing resources efficiently. While we do\nnot conduct empirical evaluations, we do provide the necessary background for\ndiscussing critical aspects of agentic LLM designs, both in academia and\nindustry.\n","authors":["Chris Sypherd","Vaishak Belle"],"pdf_url":"https://arxiv.org/pdf/2412.04093v1.pdf","comment":"15 pages, 3 figures, 1 table"},{"id":"http://arxiv.org/abs/2403.12562v2","updated":"2024-12-05T11:57:19Z","published":"2024-03-19T09:17:18Z","title":"PePR: Performance Per Resource Unit as a Metric to Promote Small-Scale\n Deep Learning in Medical Image Analysis","summary":" The recent advances in deep learning (DL) have been accelerated by access to\nlarge-scale data and compute. These large-scale resources have been used to\ntrain progressively larger models which are resource intensive in terms of\ncompute, data, energy, and carbon emissions. These costs are becoming a new\ntype of entry barrier to researchers and practitioners with limited access to\nresources at such scale, particularly in the Global South. 
In this work, we\ntake a comprehensive look at the landscape of existing DL models for medical\nimage analysis tasks and demonstrate their usefulness in settings where\nresources are limited. To account for the resource consumption of DL models, we\nintroduce a novel measure to estimate the performance per resource unit, which\nwe call the PePR score. Using a diverse family of 131 unique DL architectures\n(spanning 1M to 130M trainable parameters) and three medical image datasets, we\ncapture trends about the performance-resource trade-offs. In applications like\nmedical image analysis, we argue that small-scale, specialized models are\nbetter than striving for large-scale models. Furthermore, we show that using\nexisting pretrained models that are fine-tuned on new data can significantly\nreduce the computational resources and data required compared to training\nmodels from scratch. We hope this work will encourage the community to focus on\nimproving AI equity by developing methods and models with smaller resource\nfootprints.\n","authors":["Raghavendra Selvan","Bob Pepin","Christian Igel","Gabrielle Samuel","Erik B Dam"],"pdf_url":"https://arxiv.org/pdf/2403.12562v2.pdf","comment":"Accepted to be published at the Northern Lights Deep Learning\n Conference (NLDL), 2025. Source code available at\n https://github.com/saintslab/PePR"},{"id":"http://arxiv.org/abs/2412.04086v1","updated":"2024-12-05T11:48:54Z","published":"2024-12-05T11:48:54Z","title":"BodyMetric: Evaluating the Realism of HumanBodies in Text-to-Image\n Generation","summary":" Accurately generating images of human bodies from text remains a challenging\nproblem for state of the art text-to-image models. Commonly observed\nbody-related artifacts include extra or missing limbs, unrealistic poses,\nblurred body parts, etc. Currently, evaluation of such artifacts relies heavily\non time-consuming human judgments, limiting the ability to benchmark models at\nscale. 
We address this by proposing BodyMetric, a learnable metric that\npredicts body realism in images. BodyMetric is trained on realism labels and\nmulti-modal signals including 3D body representations inferred from the input\nimage, and textual descriptions. In order to facilitate this approach, we\ndesign an annotation pipeline to collect expert ratings on human body realism\nleading to a new dataset for this task, namely, BodyRealism. Ablation studies\nsupport our architectural choices for BodyMetric and the importance of\nleveraging a 3D human body prior in capturing body-related artifacts in 2D\nimages. In comparison to concurrent metrics which evaluate general user\npreference in images, BodyMetric specifically reflects body-related artifacts.\nWe demonstrate the utility of BodyMetric through applications that were\npreviously infeasible at scale. In particular, we use BodyMetric to benchmark\nthe generation ability of text-to-image models to produce realistic human\nbodies. We also demonstrate the effectiveness of BodyMetric in ranking\ngenerated images based on the predicted realism scores.\n","authors":["Nefeli Andreou","Varsha Vivek","Ying Wang","Alex Vorobiov","Tiffany Deng","Raja Bala","Larry Davis","Betty Mohler Tesch"],"pdf_url":"https://arxiv.org/pdf/2412.04086v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04081v1","updated":"2024-12-05T11:32:14Z","published":"2024-12-05T11:32:14Z","title":"Federated Learning in Mobile Networks: A Comprehensive Case Study on\n Traffic Forecasting","summary":" The increasing demand for efficient resource allocation in mobile networks\nhas catalyzed the exploration of innovative solutions that could enhance the\ntask of real-time cellular traffic prediction. Under these circumstances,\nfederated learning (FL) stands out as a distributed and privacy-preserving\nsolution to foster collaboration among different sites, thus enabling\nresponsive near-the-edge solutions. 
In this paper, we comprehensively study the\npotential benefits of FL in telecommunications through a case study on\nfederated traffic forecasting using real-world data from base stations (BSs) in\nBarcelona (Spain). Our study encompasses relevant aspects within the federated\nexperience, including model aggregation techniques, outlier management, the\nimpact of individual clients, personalized learning, and the integration of\nexogenous sources of data. The performed evaluation is based on both prediction\naccuracy and sustainability, thus showcasing the environmental impact of\nemployed FL algorithms in various settings. The findings from our study\nhighlight FL as a promising and robust solution for mobile traffic prediction,\nemphasizing its twin merits as a privacy-conscious and environmentally\nsustainable approach, while also demonstrating its capability to overcome data\nheterogeneity and ensure high-quality predictions, marking a significant stride\ntowards its integration in mobile traffic management systems.\n","authors":["Nikolaos Pavlidis","Vasileios Perifanis","Selim F. Yilmaz","Francesc Wilhelmi","Marco Miozzo","Pavlos S. Efraimidis","Remous-Aris Koutsiamanis","Pavol Mulinka","Paolo Dini"],"pdf_url":"https://arxiv.org/pdf/2412.04081v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.09832v3","updated":"2024-12-05T11:23:14Z","published":"2023-05-16T22:19:19Z","title":"A Deep RL Approach on Task Placement and Scaling of Edge Resources for\n Cellular Vehicle-to-Network Service Provisioning","summary":" Cellular-Vehicle-to-Everything (C-V2X) is currently at the forefront of the\ndigital transformation of our society. By enabling vehicles to communicate with\neach other and with the traffic environment using cellular networks, we\nredefine transportation, improving road safety and transportation services,\nincreasing efficiency of vehicular traffic flows, and reducing environmental\nimpact. 
To effectively facilitate the provisioning of Cellular\nVehicular-to-Network (C-V2N) services, we tackle the interdependent problems of\nservice task placement and scaling of edge resources. Specifically, we\nformulate the joint problem and prove that it is not computationally tractable.\nTo address its complexity we propose Deep Hybrid Policy Gradient (DHPG), a new\nDeep Reinforcement Learning (DRL) approach that operates in hybrid action\nspaces, enabling holistic decision-making and enhancing overall performance. We\nevaluated the performance of DHPG using simulations with a real-world C-V2N\ntraffic dataset, comparing it to several state-of-the-art (SoA) solutions. DHPG\noutperforms these solutions, guaranteeing the $99^{th}$ percentile of C-V2N\nservice delay target, while simultaneously optimizing the utilization of\ncomputing resources. Finally, time complexity analysis is conducted to verify\nthat the proposed approach can support real-time C-V2N services.\n","authors":["Cyril Shih-Huan Hsu","Jorge Martín-Pérez","Danny De Vleeschauwer","Luca Valcarenghi","Xi Li","Chrysa Papagianni"],"pdf_url":"https://arxiv.org/pdf/2305.09832v3.pdf","comment":"This paper has been submitted to IEEE Transactions on Network and\n Service Management"},{"id":"http://arxiv.org/abs/2409.18017v2","updated":"2024-12-05T11:21:16Z","published":"2024-09-26T16:25:48Z","title":"Transferring disentangled representations: bridging the gap between\n synthetic and real images","summary":" Developing meaningful and efficient representations that separate the\nfundamental structure of the data generation mechanism is crucial in\nrepresentation learning. 
However, Disentangled Representation Learning has not\nfully shown its potential on real images, because of correlated generative\nfactors, their resolution and limited access to ground truth labels.\nSpecifically on the latter, we investigate the possibility of leveraging\nsynthetic data to learn general-purpose disentangled representations applicable\nto real data, discussing the effect of fine-tuning and what properties of\ndisentanglement are preserved after the transfer. We provide an extensive\nempirical study to address these issues. In addition, we propose a new\ninterpretable intervention-based metric, to measure the quality of factors\nencoding in the representation. Our results indicate that some level of\ndisentanglement, transferring a representation from synthetic to real data, is\npossible and effective.\n","authors":["Jacopo Dapueto","Nicoletta Noceti","Francesca Odone"],"pdf_url":"https://arxiv.org/pdf/2409.18017v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04075v1","updated":"2024-12-05T11:14:01Z","published":"2024-12-05T11:14:01Z","title":"Does your model understand genes? A benchmark of gene properties for\n biological and text models","summary":" The application of deep learning methods, particularly foundation models, in\nbiological research has surged in recent years. These models can be text-based\nor trained on underlying biological data, especially omics data of various\ntypes. However, comparing the performance of these models consistently has\nproven to be a challenge due to differences in training data and downstream\ntasks. To tackle this problem, we developed an architecture-agnostic\nbenchmarking approach that, instead of evaluating the models directly,\nleverages entity representation vectors from each model and trains simple\npredictive models for each benchmarking task. This ensures that all types of\nmodels are evaluated using the same input and output types. 
Here we focus on\ngene properties collected from professionally curated bioinformatics databases.\nThese gene properties are categorized into five major groups: genomic\nproperties, regulatory functions, localization, biological processes, and\nprotein properties. Overall, we define hundreds of tasks based on these\ndatabases, which include binary, multi-label, and multi-class classification\ntasks. We apply these benchmark tasks to evaluate expression-based models,\nlarge language models, protein language models, DNA-based models, and\ntraditional baselines. Our findings suggest that text-based models and protein\nlanguage models generally outperform expression-based models in genomic\nproperties and regulatory functions tasks, whereas expression-based models\ndemonstrate superior performance in localization tasks. These results should\naid in the development of more informed artificial intelligence strategies for\nbiological understanding and therapeutic discovery. To ensure the\nreproducibility and transparency of our findings, we have made the source code\nand benchmark data publicly accessible for further investigation and expansion\nat github.com/BiomedSciAI/gene-benchmark.\n","authors":["Yoav Kan-Tor","Michael Morris Danziger","Eden Zohar","Matan Ninio","Yishai Shimoni"],"pdf_url":"https://arxiv.org/pdf/2412.04075v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04069v1","updated":"2024-12-05T11:05:46Z","published":"2024-12-05T11:05:46Z","title":"ProtDAT: A Unified Framework for Protein Sequence Design from Any\n Protein Text Description","summary":" Protein design has become a critical method in advancing significant\npotential for various applications such as drug development and enzyme\nengineering. However, protein design methods utilizing large language models\nwith solely pretraining and fine-tuning struggle to capture relationships in\nmulti-modal protein data. 
To address this, we propose ProtDAT, a de novo\nfine-grained framework capable of designing proteins from any descriptive\nprotein text input. ProtDAT builds upon the inherent characteristics of protein\ndata to unify sequences and text as a cohesive whole rather than separate\nentities. It leverages an innovative multi-modal cross-attention, integrating\nprotein sequences and textual information for a foundational level and seamless\nintegration. Experimental results demonstrate that ProtDAT achieves the\nstate-of-the-art performance in protein sequence generation, excelling in\nrationality, functionality, structural similarity, and validity. On 20,000\ntext-sequence pairs from Swiss-Prot, it improves pLDDT by 6%, TM-score by 0.26,\nand reduces RMSD by 1.2 {\\AA}, highlighting its potential to advance protein\ndesign.\n","authors":["Xiao-Yu Guo","Yi-Fan Li","Yuan Liu","Xiaoyong Pan","Hong-Bin Shen"],"pdf_url":"https://arxiv.org/pdf/2412.04069v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04067v1","updated":"2024-12-05T11:05:12Z","published":"2024-12-05T11:05:12Z","title":"Automated Medical Report Generation for ECG Data: Bridging Medical Text\n and Signal Processing with Deep Learning","summary":" Recent advances in deep learning and natural language generation have\nsignificantly improved image captioning, enabling automated, human-like\ndescriptions for visual content. In this work, we apply these captioning\ntechniques to generate clinician-like interpretations of ECG data. This study\nleverages existing ECG datasets accompanied by free-text reports authored by\nhealthcare professionals (HCPs) as training data. These reports, while often\ninconsistent, provide a valuable foundation for automated learning. We\nintroduce an encoder-decoder-based method that uses these reports to train\nmodels to generate detailed descriptions of ECG episodes. 
This represents a\nsignificant advancement in ECG analysis automation, with potential applications\nin zero-shot classification and automated clinical decision support.\n The model is tested on various datasets, including both 1- and 12-lead ECGs.\nIt significantly outperforms the state-of-the-art reference model by Qiu et\nal., achieving a METEOR score of 55.53% compared to 24.51% achieved by the\nreference model. Furthermore, several key design choices are discussed,\nproviding a comprehensive overview of current challenges and innovations in\nthis domain.\n The source codes for this research are publicly available in our Git\nrepository https://git.zib.de/ableich/ecg-comment-generation-public\n","authors":["Amnon Bleich","Antje Linnemann","Bjoern H. Diem","Tim OF Conrad"],"pdf_url":"https://arxiv.org/pdf/2412.04067v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.07330v3","updated":"2024-12-05T11:03:41Z","published":"2023-01-18T06:37:24Z","title":"FPANet: Frequency-based Video Demoireing using Frame-level Post\n Alignment","summary":" Moire patterns, created by the interference between overlapping grid patterns\nin the pixel space, degrade the visual quality of images and videos. Therefore,\nremoving such patterns~(demoireing) is crucial, yet remains a challenge due to\ntheir complexities in sizes and distortions. Conventional methods mainly tackle\nthis task by only exploiting the spatial domain of the input images, limiting\ntheir capabilities in removing large-scale moire patterns. Therefore, this work\nproposes FPANet, an image-video demoireing network that learns filters in both\nfrequency and spatial domains, improving the restoration quality by removing\nvarious sizes of moire patterns. To further enhance, our model takes multiple\nconsecutive frames, learning to extract frame-invariant content features and\noutputting better quality temporally consistent images. 
We demonstrate the\neffectiveness of our proposed method with a publicly available large-scale\ndataset, observing that ours outperforms the state-of-the-art approaches in\nterms of image and video quality metrics and visual experience.\n","authors":["Gyeongrok Oh","Sungjune Kim","Heon Gu","Sang Ho Yoon","Jinkyu Kim","Sangpil Kim"],"pdf_url":"https://arxiv.org/pdf/2301.07330v3.pdf","comment":"Accepted version, to appear in Neural Networks"},{"id":"http://arxiv.org/abs/2408.08968v3","updated":"2024-12-05T11:01:30Z","published":"2024-08-16T18:34:11Z","title":"Online SLA Decomposition: Enabling Real-Time Adaptation to Evolving\n Systems","summary":" When a network slice spans multiple technology domains, it is crucial for\neach domain to uphold the End-to-End (E2E) Service Level Agreement (SLA)\nassociated with the slice. Consequently, the E2E SLA must be properly\ndecomposed into partial SLAs that are assigned to each domain involved. In a\nnetwork slice management system with a two-level architecture, comprising an\nE2E service orchestrator and local domain controllers, we consider that the\norchestrator has access solely to historical data regarding the responses of\nlocal controllers to previous requests, and this information is used to\nconstruct a risk model for each domain. In this study, we extend our previous\nwork by investigating the dynamic nature of real-world systems and introducing\nan online learning-decomposition framework to tackle the dynamicity. We propose\na framework that periodically updates the risk models based on the most recent\nfeedback. This approach leverages key components such as online gradient\ndescent and FIFO memory buffers, which enhance the stability and robustness of\nthe overall process. 
Our empirical study on an analytic model-based simulator\ndemonstrates that the proposed framework outperforms the state-of-the-art\nstatic approach, providing more accurate and resilient SLA decomposition even\nunder varying conditions and limited data scenarios.\n","authors":["Cyril Shih-Huan Hsu","Danny De Vleeschauwer","Chrysa Papagianni"],"pdf_url":"https://arxiv.org/pdf/2408.08968v3.pdf","comment":"The paper has been submitted to IEEE ICMLCN 2025"},{"id":"http://arxiv.org/abs/2412.04064v1","updated":"2024-12-05T10:59:20Z","published":"2024-12-05T10:59:20Z","title":"Graph Neural Networks Need Cluster-Normalize-Activate Modules","summary":" Graph Neural Networks (GNNs) are non-Euclidean deep learning models for\ngraph-structured data. Despite their successful and diverse applications,\noversmoothing prohibits deep architectures due to node features converging to a\nsingle fixed point. This severely limits their potential to solve complex\ntasks. To counteract this tendency, we propose a plug-and-play module\nconsisting of three steps: Cluster-Normalize-Activate (CNA). By applying CNA\nmodules, GNNs search and form super nodes in each layer, which are normalized\nand activated individually. We demonstrate in node classification and property\nprediction tasks that CNA significantly improves the accuracy over the\nstate-of-the-art. Particularly, CNA reaches 94.18% and 95.75% accuracy on Cora\nand CiteSeer, respectively. It further benefits GNNs in regression tasks as\nwell, reducing the mean squared error compared to all baselines. 
At the same\ntime, GNNs with CNA require substantially fewer learnable parameters than\ncompeting architectures.\n","authors":["Arseny Skryagin","Felix Divo","Mohammad Amin Ali","Devendra Singh Dhami","Kristian Kersting"],"pdf_url":"https://arxiv.org/pdf/2412.04064v1.pdf","comment":"17 pages, 6 figures, 6 tables, accepted at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2406.09014v6","updated":"2024-12-05T10:57:12Z","published":"2024-06-13T11:38:58Z","title":"Deep learning empowered sensor fusion boosts infant movement\n classification","summary":" To assess the integrity of the developing nervous system, the Prechtl general\nmovement assessment (GMA) is recognized for its clinical value in diagnosing\nneurological impairments in early infancy. GMA has been increasingly augmented\nthrough machine learning approaches intending to scale-up its application,\ncircumvent costs in the training of human assessors and further standardize\nclassification of spontaneous motor patterns. Available deep learning tools,\nall of which are based on single sensor modalities, are however still\nconsiderably inferior to that of well-trained human assessors. These approaches\nare hardly comparable as all models are designed, trained and evaluated on\nproprietary/silo-data sets. With this study we propose a sensor fusion approach\nfor assessing fidgety movements (FMs). FMs were recorded from 51 typically\ndeveloping participants. We compared three different sensor modalities\n(pressure, inertial, and visual sensors). Various combinations and two sensor\nfusion approaches (late and early fusion) for infant movement classification\nwere tested to evaluate whether a multi-sensor system outperforms single\nmodality assessments. Convolutional neural network (CNN) architectures were\nused to classify movement patterns. The performance of the three-sensor fusion\n(classification accuracy of 94.5%) was significantly higher than that of any\nsingle modality evaluated. 
We show that the sensor fusion approach is a\npromising avenue for automated classification of infant motor patterns. The\ndevelopment of a robust sensor fusion system may significantly enhance AI-based\nearly recognition of neurofunctions, ultimately facilitating automated early\ndetection of neurodevelopmental conditions.\n","authors":["Tomas Kulvicius","Dajie Zhang","Luise Poustka","Sven Bölte","Lennart Jahn","Sarah Flügge","Marc Kraft","Markus Zweckstetter","Karin Nielsen-Saines","Florentin Wörgötter","Peter B Marschik"],"pdf_url":"https://arxiv.org/pdf/2406.09014v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04062v1","updated":"2024-12-05T10:57:08Z","published":"2024-12-05T10:57:08Z","title":"ZipAR: Accelerating Autoregressive Image Generation through Spatial\n Locality","summary":" In this paper, we propose ZipAR, a training-free, plug-and-play parallel\ndecoding framework for accelerating auto-regressive (AR) visual generation. The\nmotivation stems from the observation that images exhibit local structures, and\nspatially distant regions tend to have minimal interdependence. Given a\npartially decoded set of visual tokens, in addition to the original next-token\nprediction scheme in the row dimension, the tokens corresponding to spatially\nadjacent regions in the column dimension can be decoded in parallel, enabling\nthe ``next-set prediction'' paradigm. By decoding multiple tokens\nsimultaneously in a single forward pass, the number of forward passes required\nto generate an image is significantly reduced, resulting in a substantial\nimprovement in generation efficiency. 
Experiments demonstrate that ZipAR can\nreduce the number of model forward passes by up to 91% on the Emu3-Gen model\nwithout requiring any additional retraining.\n","authors":["Yefei He","Feng Chen","Yuanyu He","Shaoxuan He","Hong Zhou","Kaipeng Zhang","Bohan Zhuang"],"pdf_url":"https://arxiv.org/pdf/2412.04062v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2412.04060v1","updated":"2024-12-05T10:55:54Z","published":"2024-12-05T10:55:54Z","title":"Expanding Deep Learning-based Sensing Systems with Multi-Source\n Knowledge Transfer","summary":" Expanding the existing sensing systems to provide high-quality deep learning\nmodels for more domains, such as new users or environments, is challenged by\nthe limited labeled data and the data and device heterogeneities. While\nknowledge distillation methods could overcome label scarcity and device\nheterogeneity, they assume the teachers are fully reliable and overlook the\ndata heterogeneity, which prevents the direct adoption of existing models. To\naddress this problem, this paper proposes an efficient knowledge transfer\nframework, HaKT, to expand sensing systems. It first selects multiple\nhigh-quality models from the system at a low cost and then fuses their\nknowledge by assigning sample-wise weights to their predictions. Later, the\nfused knowledge is selectively injected into the customized models for new\ndomains based on the knowledge quality. 
Extensive experiments on different\ntasks, modalities, and settings show that HaKT outperforms stat-of-the-art\nbaselines by at most 16.5% accuracy and saves up to 39% communication traffic.\n","authors":["Gaole Dai","Huatao Xu","Rui Tan","Mo Li"],"pdf_url":"https://arxiv.org/pdf/2412.04060v1.pdf","comment":"14 pages, 8 figures"},{"id":"http://arxiv.org/abs/2412.04057v1","updated":"2024-12-05T10:50:58Z","published":"2024-12-05T10:50:58Z","title":"From Code to Play: Benchmarking Program Search for Games Using Large\n Language Models","summary":" Large language models (LLMs) have shown impressive capabilities in generating\nprogram code, opening exciting opportunities for applying program synthesis to\ngames. In this work, we explore the potential of LLMs to directly synthesize\nusable code for a wide range of gaming applications, focusing on two\nprogramming languages, Python and Java. We use an evolutionary hill-climbing\nalgorithm, where the mutations and seeds of the initial programs are controlled\nby LLMs. For Python, the framework covers various game-related tasks, including\nfive miniature versions of Atari games, ten levels of Baba is You, an\nenvironment inspired by Asteroids, and a maze generation task. For Java, the\nframework contains 12 games from the TAG tabletop games framework. Across 29\ntasks, we evaluated 12 language models for Python and 8 for Java. Our findings\nsuggest that the performance of LLMs depends more on the task than on model\nsize. While larger models generate more executable programs, these do not\nalways result in higher-quality solutions but are much more expensive. No model\nhas a clear advantage, although on any specific task, one model may be better.\nTrying many models on a problem and using the best results across them is more\nreliable than using just one.\n","authors":["Manuel Eberhardinger","James Goodman","Alexander Dockhorn","Diego Perez-Liebana","Raluca D. 
Gaina","Duygu Çakmak","Setareh Maghsudi","Simon Lucas"],"pdf_url":"https://arxiv.org/pdf/2412.04057v1.pdf","comment":"Submitted to Transactions on Games Special Issue on Large Language\n Models and Games"},{"id":"http://arxiv.org/abs/2412.02788v2","updated":"2024-12-05T10:30:56Z","published":"2024-12-03T19:37:00Z","title":"Hybrid-SQuAD: Hybrid Scholarly Question Answering Dataset","summary":" Existing Scholarly Question Answering (QA) methods typically target\nhomogeneous data sources, relying solely on either text or Knowledge Graphs\n(KGs). However, scholarly information often spans heterogeneous sources,\nnecessitating the development of QA systems that integrate information from\nmultiple heterogeneous data sources. To address this challenge, we introduce\nHybrid-SQuAD (Hybrid Scholarly Question Answering Dataset), a novel large-scale\nQA dataset designed to facilitate answering questions incorporating both text\nand KG facts. The dataset consists of 10.5K question-answer pairs generated by\na large language model, leveraging the KGs DBLP and SemOpenAlex alongside\ncorresponding text from Wikipedia. In addition, we propose a RAG-based baseline\nhybrid QA model, achieving an exact match score of 69.65 on the Hybrid-SQuAD\ntest set.\n","authors":["Tilahun Abedissa Taffa","Debayan Banerjee","Yaregal Assabie","Ricardo Usbeck"],"pdf_url":"https://arxiv.org/pdf/2412.02788v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04037v1","updated":"2024-12-05T10:20:34Z","published":"2024-12-05T10:20:34Z","title":"INFP: Audio-Driven Interactive Head Generation in Dyadic Conversations","summary":" Imagine having a conversation with a socially intelligent agent. It can\nattentively listen to your words and offer visual and linguistic feedback\npromptly. This seamless interaction allows for multiple rounds of conversation\nto flow smoothly and naturally. In pursuit of actualizing it, we propose INFP,\na novel audio-driven head generation framework for dyadic interaction. 
Unlike\nprevious head generation works that only focus on single-sided communication,\nor require manual role assignment and explicit role switching, our model drives\nthe agent portrait dynamically alternates between speaking and listening state,\nguided by the input dyadic audio. Specifically, INFP comprises a Motion-Based\nHead Imitation stage and an Audio-Guided Motion Generation stage. The first\nstage learns to project facial communicative behaviors from real-life\nconversation videos into a low-dimensional motion latent space, and use the\nmotion latent codes to animate a static image. The second stage learns the\nmapping from the input dyadic audio to motion latent codes through denoising,\nleading to the audio-driven head generation in interactive scenarios. To\nfacilitate this line of research, we introduce DyConv, a large scale dataset of\nrich dyadic conversations collected from the Internet. Extensive experiments\nand visualizations demonstrate superior performance and effectiveness of our\nmethod. Project Page: https://grisoon.github.io/INFP/.\n","authors":["Yongming Zhu","Longhao Zhang","Zhengkun Rong","Tianshu Hu","Shuang Liang","Zhipeng Ge"],"pdf_url":"https://arxiv.org/pdf/2412.04037v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04036v1","updated":"2024-12-05T10:19:36Z","published":"2024-12-05T10:19:36Z","title":"SocialMind: LLM-based Proactive AR Social Assistive System with\n Human-like Perception for In-situ Live Interactions","summary":" Social interactions are fundamental to human life. The recent emergence of\nlarge language models (LLMs)-based virtual assistants has demonstrated their\npotential to revolutionize human interactions and lifestyles. However, existing\nassistive systems mainly provide reactive services to individual users, rather\nthan offering in-situ assistance during live social interactions with\nconversational partners. 
In this study, we introduce SocialMind, the first\nLLM-based proactive AR social assistive system that provides users with in-situ\nsocial assistance. SocialMind employs human-like perception leveraging\nmulti-modal sensors to extract both verbal and nonverbal cues, social factors,\nand implicit personas, incorporating these social cues into LLM reasoning for\nsocial suggestion generation. Additionally, SocialMind employs a multi-tier\ncollaborative generation strategy and proactive update mechanism to display\nsocial suggestions on Augmented Reality (AR) glasses, ensuring that suggestions\nare timely provided to users without disrupting the natural flow of\nconversation. Evaluations on three public datasets and a user study with 20\nparticipants show that SocialMind achieves 38.3% higher engagement compared to\nbaselines, and 95% of participants are willing to use SocialMind in their live\nsocial interactions.\n","authors":["Bufang Yang","Yunqi Guo","Lilin Xu","Zhenyu Yan","Hongkai Chen","Guoliang Xing","Xiaofan Jiang"],"pdf_url":"https://arxiv.org/pdf/2412.04036v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10101v2","updated":"2024-12-05T10:06:43Z","published":"2024-01-18T16:10:07Z","title":"Bayesian Networks for Causal Analysis in Socioecological Systems","summary":" Causal and counterfactual reasoning are emerging directions in data science\nthat allow us to reason about hypothetical scenarios. This is particularly\nuseful in fields like environmental and ecological sciences, where\ninterventional data are usually not available. Structural causal models are\nprobabilistic models for causal analysis that simplify this kind of reasoning\ndue to their graphical representation. They can be regarded as extensions of\nthe so-called Bayesian networks, a well known modeling tool commonly used in\nenvironmental and ecological problems. 
The main contribution of this paper is\nto analyze the relations of necessity and sufficiency between the variables of\na socioecological system using counterfactual reasoning with Bayesian networks.\nIn particular, we consider a case study involving socioeconomic factors and\nland-uses in southern Spain. In addition, this paper aims to be a coherent\noverview of the fundamental concepts for applying counterfactual reasoning, so\nthat environmental researchers with a background in Bayesian networks can\neasily take advantage of the structural causal model formalism.\n","authors":["Rafael Cabañas","Ana D. Maldonado","María Morales","Pedro A. Aguilera","Antonio Salmerón"],"pdf_url":"https://arxiv.org/pdf/2401.10101v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04029v1","updated":"2024-12-05T10:05:53Z","published":"2024-12-05T10:05:53Z","title":"Considerations Influencing Offense-Defense Dynamics From Artificial\n Intelligence","summary":" The rapid advancement of artificial intelligence (AI) technologies presents\nprofound challenges to societal safety. As AI systems become more capable,\naccessible, and integrated into critical services, the dual nature of their\npotential is increasingly clear. While AI can enhance defensive capabilities in\nareas like threat detection, risk assessment, and automated security\noperations, it also presents avenues for malicious exploitation and large-scale\nsocietal harm, for example through automated influence operations and cyber\nattacks. Understanding the dynamics that shape AI's capacity to both cause harm\nand enhance protective measures is essential for informed decision-making\nregarding the deployment, use, and integration of advanced AI systems. 
This\npaper builds on recent work on offense-defense dynamics within the realm of AI,\nproposing a taxonomy to map and examine the key factors that influence whether\nAI systems predominantly pose threats or offer protective benefits to society.\nBy establishing a shared terminology and conceptual foundation for analyzing\nthese interactions, this work seeks to facilitate further research and\ndiscourse in this critical area.\n","authors":["Giulio Corsi","Kyle Kilian","Richard Mallah"],"pdf_url":"https://arxiv.org/pdf/2412.04029v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.19846v6","updated":"2024-12-05T09:56:35Z","published":"2024-05-30T08:50:55Z","title":"Quest: Query-centric Data Synthesis Approach for Long-context Scaling of\n Large Language Model","summary":" Recent advancements in large language models (LLMs) have highlighted the\nimportance of extending context lengths for handling complex tasks. While\ntraditional methods for training on long contexts often use filtered long\ndocuments, these approaches lead to domain imbalances, limiting model\nperformance. To address this, techniques like random document concatenation\n(Standard) and similarity-based methods (KNN, ICLM) have been developed.\nHowever, they either sacrifice semantic coherence or diversity. To balance both\naspects, we introduce Quest, a query-centric data synthesis method aggregating\nsemantically relevant yet diverse documents. Quest uses a generative model to\npredict potential queries for each document, grouping documents with similar\nqueries and keywords. 
Extensive experiments demonstrate Quest's superior\nperformance on long-context tasks, achieving remarkable results with context\nlengths of up to 1M tokens and confirming its scalability across various model\nsizes.\n","authors":["Chaochen Gao","Xing Wu","Qi Fu","Songlin Hu"],"pdf_url":"https://arxiv.org/pdf/2405.19846v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04008v1","updated":"2024-12-05T09:41:33Z","published":"2024-12-05T09:41:33Z","title":"Deep-Unrolling Multidimensional Harmonic Retrieval Algorithms on\n Neuromorphic Hardware","summary":" This paper explores the potential of conversion-based neuromorphic algorithms\nfor highly accurate and energy-efficient single-snapshot multidimensional\nharmonic retrieval (MHR). By casting the MHR problem as a sparse recovery\nproblem, we devise the currently proposed, deep-unrolling-based Structured\nLearned Iterative Shrinkage and Thresholding (S-LISTA) algorithm to solve it\nefficiently using complex-valued convolutional neural networks with\ncomplex-valued activations, which are trained using a supervised regression\nobjective. Afterward, a novel method for converting the complex-valued\nconvolutional layers and activations into spiking neural networks (SNNs) is\ndeveloped. At the heart of this method lies the recently proposed Few Spikes\n(FS) conversion, which is extended by modifying the neuron model's parameters\nand internal dynamics to account for the inherent coupling between real and\nimaginary parts in complex-valued computations. Finally, the converted SNNs are\nmapped onto the SpiNNaker2 neuromorphic board, and a comparison in terms of\nestimation accuracy and power efficiency between the original CNNs deployed on\nan NVIDIA Jetson Xavier and the SNNs is being conducted. The measurement\nresults show that the converted SNNs achieve almost five-fold power efficiency\nat moderate performance loss compared to the original CNNs.\n","authors":["Vlad C. Andrei","Alexandru P. 
Drăguţoiu","Gabriel Béna","Mahmoud Akl","Yin Li","Matthias Lohrmann","Ullrich J. Mönich","Holger Boche"],"pdf_url":"https://arxiv.org/pdf/2412.04008v1.pdf","comment":"accepted to the 58th Asilomar Conference on Signals, Systems, and\n Computers, Oct. 27th - Oct. 30th, 2024, Pacific Grove, CA"},{"id":"http://arxiv.org/abs/2412.03993v1","updated":"2024-12-05T09:14:50Z","published":"2024-12-05T09:14:50Z","title":"LaserGuider: A Laser Based Physical Backdoor Attack against Deep Neural\n Networks","summary":" Backdoor attacks embed hidden associations between triggers and targets in\ndeep neural networks (DNNs), causing them to predict the target when a trigger\nis present while maintaining normal behavior otherwise. Physical backdoor\nattacks, which use physical objects as triggers, are feasible but lack remote\ncontrol, temporal stealthiness, flexibility, and mobility. To overcome these\nlimitations, in this work, we propose a new type of backdoor triggers utilizing\nlasers that feature long-distance transmission and instant-imaging properties.\nBased on the laser-based backdoor triggers, we present a physical backdoor\nattack, called LaserGuider, which possesses remote control ability and achieves\nhigh temporal stealthiness, flexibility, and mobility. We also introduce a\nsystematic approach to optimize laser parameters for improving attack\neffectiveness. Our evaluation on traffic sign recognition DNNs, critical in\nautonomous vehicles, demonstrates that LaserGuider with three different\nlaser-based triggers achieves over 90% attack success rate with negligible\nimpact on normal inputs. 
Additionally, we release LaserMark, the first dataset\nof real world traffic signs stamped with physical laser spots, to support\nfurther research in backdoor attacks and defenses.\n","authors":["Yongjie Xu","Guangke Chen","Fu Song","Yuqi Chen"],"pdf_url":"https://arxiv.org/pdf/2412.03993v1.pdf","comment":"In Proceedings of the 23rd International Conference on Applied\n Cryptography and Network Security (ACNS), Munich, Germany, 23-26 June, 2025"},{"id":"http://arxiv.org/abs/2412.03987v1","updated":"2024-12-05T09:05:30Z","published":"2024-12-05T09:05:30Z","title":"MTMT: Consolidating Multiple Thinking Modes to Form a Thought Tree for\n Strengthening LLM","summary":" Large language models (LLMs) have shown limitations in tasks requiring\ncomplex logical reasoning and multi-step problem-solving. To address these\nchallenges, researchers have employed carefully designed prompts and\nflowcharts, simulating human cognitive processes to enhance LLM performance,\nsuch as the Chain of Thought approach. In this paper, we introduce MTMT\n(Multi-thinking Modes Tree), a novel method that interacts with LLMs to\nconstruct a thought tree, simulating various advanced cognitive processes,\nincluding but not limited to association, counterfactual thinking, task\ndecomposition, and comparison. By breaking down the original complex task into\nsimpler sub-questions, MTMT facilitates easier problem-solving for LLMs,\nenabling more effective utilization of the latent knowledge within LLMs. We\nevaluate the performance of MTMT under different parameter configurations,\nusing GPT-4o mini as the base model. 
Our results demonstrate that integrating\nmultiple modes of thinking significantly enhances the ability of LLMs to handle\ncomplex tasks.\n","authors":["Changcheng Li","Xiangyu Wang","Qiuju Chen","Xiren Zhou","Huanhuan Chen"],"pdf_url":"https://arxiv.org/pdf/2412.03987v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18569v2","updated":"2024-12-05T09:02:11Z","published":"2024-03-27T13:50:13Z","title":"PDNNet: PDN-Aware GNN-CNN Heterogeneous Network for Dynamic IR Drop\n Prediction","summary":" IR drop on the power delivery network (PDN) is closely related to PDN's\nconfiguration and cell current consumption. As the integrated circuit (IC)\ndesign is growing larger, dynamic IR drop simulation becomes computationally\nunaffordable and machine learning based IR drop prediction has been explored as\na promising solution. Although CNN-based methods have been adapted to IR drop\nprediction task in several works, the shortcomings of overlooking PDN\nconfiguration is non-negligible. In this paper, we consider not only how to\nproperly represent cell-PDN relation, but also how to model IR drop following\nits physical nature in the feature aggregation procedure. Thus, we propose a\nnovel graph structure, PDNGraph, to unify the representations of the PDN\nstructure and the fine-grained cell-PDN relation. We further propose a\ndual-branch heterogeneous network, PDNNet, incorporating two parallel GNN-CNN\nbranches to favorably capture the above features during the learning process.\nSeveral key designs are presented to make the dynamic IR drop prediction highly\neffective and interpretable. We are the first work to apply graph structure to\ndeep-learning based dynamic IR drop prediction method. 
Experiments show that\nPDNNet outperforms the state-of-the-art CNN-based methods and achieves 545x\nspeedup compared to the commercial tool, which demonstrates the superiority of\nour method.\n","authors":["Yuxiang Zhao","Zhuomin Chai","Xun Jiang","Yibo Lin","Runsheng Wang","Ru Huang"],"pdf_url":"https://arxiv.org/pdf/2403.18569v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03982v1","updated":"2024-12-05T08:58:25Z","published":"2024-12-05T08:58:25Z","title":"Exploring Fully Convolutional Networks for the Segmentation of\n Hyperspectral Imaging Applied to Advanced Driver Assistance Systems","summary":" Advanced Driver Assistance Systems (ADAS) are designed with the main purpose\nof increasing the safety and comfort of vehicle occupants. Most of current\ncomputer vision-based ADAS perform detection and tracking tasks quite\nsuccessfully under regular conditions, but are not completely reliable,\nparticularly under adverse weather and changing lighting conditions, neither in\ncomplex situations with many overlapping objects. In this work we explore the\nuse of hyperspectral imaging (HSI) in ADAS on the assumption that the distinct\nnear infrared (NIR) spectral reflectances of different materials can help to\nbetter separate the objects in a driving scene. In particular, this paper\ndescribes some experimental results of the application of fully convolutional\nnetworks (FCN) to the image segmentation of HSI for ADAS applications. More\nspecifically, our aim is to investigate to what extent the spatial features\ncodified by convolutional filters can be helpful to improve the performance of\nHSI segmentation systems. With that aim, we use the HSI-Drive v1.1 dataset,\nwhich provides a set of labelled images recorded in real driving conditions\nwith a small-size snapshot NIR-HSI camera. 
Finally, we analyze the\nimplementability of such a HSI segmentation system by prototyping the developed\nFCN model together with the necessary hyperspectral cube preprocessing stage\nand characterizing its performance on an MPSoC.\n","authors":["Jon Gutiérrez-Zaballa","Koldo Basterretxea","Javier Echanobe","M. Victoria Martínez","Inés del Campo"],"pdf_url":"https://arxiv.org/pdf/2412.03982v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2411.19274"},{"id":"http://arxiv.org/abs/2411.18220v2","updated":"2024-12-05T08:57:30Z","published":"2024-11-27T10:57:06Z","title":"R-MTLLMF: Resilient Multi-Task Large Language Model Fusion at the\n Wireless Edge","summary":" Multi-task large language models (MTLLMs) are important for many applications\nat the wireless edge, where users demand specialized models to handle multiple\ntasks efficiently. However, training MTLLMs is complex and exhaustive,\nparticularly when tasks are subject to change. Recently, the concept of model\nfusion via task vectors has emerged as an efficient approach for combining\nfine-tuning parameters to produce an MTLLM. In this paper, the problem of\nenabling edge users to collaboratively craft such MTLMs via tasks vectors is\nstudied, under the assumption of worst-case adversarial attacks. To this end,\nfirst the influence of adversarial noise to multi-task model fusion is\ninvestigated and a relationship between the so-called weight disentanglement\nerror and the mean squared error (MSE) is derived. Using hypothesis testing, it\nis directly shown that the MSE increases interference between task vectors,\nthereby rendering model fusion ineffective. Then, a novel resilient MTLLM\nfusion (R-MTLLMF) is proposed, which leverages insights about the LLM\narchitecture and fine-tuning process to safeguard task vector aggregation under\nadversarial noise by realigning the MTLLM. 
The proposed R-MTLLMF is then\ncompared for both worst-case and ideal transmission scenarios to study the\nimpact of the wireless channel. Extensive model fusion experiments with vision\nLLMs demonstrate R-MTLLMF's effectiveness, achieving close-to-baseline\nperformance across eight different tasks in ideal noise scenarios and\nsignificantly outperforming unprotected model fusion in worst-case scenarios.\nThe results further advocate for additional physical layer protection for a\nholistic approach to resilience, from both a wireless and LLM perspective.\n","authors":["Aladin Djuhera","Vlad C. Andrei","Mohsen Pourghasemian","Haris Gacanin","Holger Boche","Walid Saad"],"pdf_url":"https://arxiv.org/pdf/2411.18220v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03970v1","updated":"2024-12-05T08:38:30Z","published":"2024-12-05T08:38:30Z","title":"A Data-Driven Framework for Discovering Fractional Differential\n Equations in Complex Systems","summary":" In complex physical systems, conventional differential equations often fall\nshort in capturing non-local and memory effects, as they are limited to local\ndynamics and integer-order interactions. This study introduces a stepwise\ndata-driven framework for discovering fractional differential equations (FDEs)\ndirectly from data. FDEs, known for their capacity to model non-local dynamics\nwith fewer parameters than integer-order derivatives, can represent complex\nsystems with long-range interactions. Our framework applies deep neural\nnetworks as surrogate models for denoising and reconstructing sparse and noisy\nobservations while using Gaussian-Jacobi quadrature to handle the challenges\nposed by singularities in fractional derivatives. 
To optimize both the sparse\ncoefficients and fractional order, we employ an alternating optimization\napproach that combines sparse regression with global optimization techniques.\nWe validate the framework across various datasets, including synthetic\nanomalous diffusion data, experimental data on the creep behavior of frozen\nsoils, and single-particle trajectories modeled by L\\'{e}vy motion. Results\ndemonstrate the framework's robustness in identifying the structure of FDEs\nacross diverse noise levels and its capacity to capture integer-order dynamics,\noffering a flexible approach for modeling memory effects in complex systems.\n","authors":["Xiangnan Yu","Hao Xu","Zhiping Mao","HongGuang Sun","Yong Zhang","Dongxiao Zhang","Yuntian Chen"],"pdf_url":"https://arxiv.org/pdf/2412.03970v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03966v1","updated":"2024-12-05T08:33:52Z","published":"2024-12-05T08:33:52Z","title":"Demonstration Selection for In-Context Learning via Reinforcement\n Learning","summary":" Diversity in demonstration selection is crucial for enhancing model\ngeneralization, as it enables a broader coverage of structures and concepts.\nHowever, constructing an appropriate set of demonstrations has remained a focal\npoint of research. This paper presents the Relevance-Diversity Enhanced\nSelection (RDES), an innovative approach that leverages reinforcement learning\nto optimize the selection of diverse reference demonstrations for text\nclassification tasks using Large Language Models (LLMs), especially in few-shot\nprompting scenarios. RDES employs a Q-learning framework to dynamically\nidentify demonstrations that maximize both diversity and relevance to the\nclassification objective by calculating a diversity score based on label\ndistribution among selected demonstrations. 
This method ensures a balanced\nrepresentation of reference data, leading to improved classification accuracy.\nThrough extensive experiments on four benchmark datasets and involving 12\nclosed-source and open-source LLMs, we demonstrate that RDES significantly\nenhances classification accuracy compared to ten established baselines.\nFurthermore, we investigate the incorporation of Chain-of-Thought (CoT)\nreasoning in the reasoning process, which further enhances the model's\npredictive performance. The results underscore the potential of reinforcement\nlearning to facilitate adaptive demonstration selection and deepen the\nunderstanding of classification challenges.\n","authors":["Xubin Wang","Jianfei Wu","Yichen Yuan","Mingzhe Li","Deyu Cai","Weijia Jia"],"pdf_url":"https://arxiv.org/pdf/2412.03966v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.02293v3","updated":"2024-12-05T08:29:51Z","published":"2024-11-04T17:21:42Z","title":"Tencent Hunyuan3D-1.0: A Unified Framework for Text-to-3D and\n Image-to-3D Generation","summary":" While 3D generative models have greatly improved artists' workflows, the\nexisting diffusion models for 3D generation suffer from slow generation and\npoor generalization. To address this issue, we propose a two-stage approach\nnamed Hunyuan3D-1.0 including a lite version and a standard version, that both\nsupport text- and image-conditioned generation. In the first stage, we employ a\nmulti-view diffusion model that efficiently generates multi-view RGB in\napproximately 4 seconds. These multi-view images capture rich details of the 3D\nasset from different viewpoints, relaxing the tasks from single-view to\nmulti-view reconstruction. In the second stage, we introduce a feed-forward\nreconstruction model that rapidly and faithfully reconstructs the 3D asset\ngiven the generated multi-view images in approximately 7 seconds. 
The\nreconstruction network learns to handle noises and in-consistency introduced by\nthe multi-view diffusion and leverages the available information from the\ncondition image to efficiently recover the 3D structure. Our framework involves\nthe text-to-image model, i.e., Hunyuan-DiT, making it a unified framework to\nsupport both text- and image-conditioned 3D generation. Our standard version\nhas 3x more parameters than our lite and other existing model. Our\nHunyuan3D-1.0 achieves an impressive balance between speed and quality,\nsignificantly reducing generation time while maintaining the quality and\ndiversity of the produced assets.\n","authors":["Xianghui Yang","Huiwen Shi","Bowen Zhang","Fan Yang","Jiacheng Wang","Hongxu Zhao","Xinhai Liu","Xinzhou Wang","Qingxiang Lin","Jiaao Yu","Lifu Wang","Zhuo Chen","Sicong Liu","Yuhong Liu","Yong Yang","Di Wang","Jie Jiang","Chunchao Guo"],"pdf_url":"https://arxiv.org/pdf/2411.02293v3.pdf","comment":"Technical Report; 3D Generation"},{"id":"http://arxiv.org/abs/2412.03963v1","updated":"2024-12-05T08:27:14Z","published":"2024-12-05T08:27:14Z","title":"Augmenting Minds or Automating Skills: The Differential Role of Human\n Capital in Generative AI's Impact on Creative Tasks","summary":" Generative AI is rapidly reshaping creative work, raising critical questions\nabout its beneficiaries and societal implications. This study challenges\nprevailing assumptions by exploring how generative AI interacts with diverse\nforms of human capital in creative tasks. Through two random controlled\nexperiments in flash fiction writing and song composition, we uncover a\nparadox: while AI democratizes access to creative tools, it simultaneously\namplifies cognitive inequalities. Our findings reveal that AI enhances general\nhuman capital (cognitive abilities and education) by facilitating adaptability\nand idea integration but diminishes the value of domain-specific expertise. 
We\nintroduce a novel theoretical framework that merges human capital theory with\nthe automation-augmentation perspective, offering a nuanced understanding of\nhuman-AI collaboration. This framework elucidates how AI shifts the locus of\ncreative advantage from specialized expertise to broader cognitive\nadaptability. Contrary to the notion of AI as a universal equalizer, our work\nhighlights its potential to exacerbate disparities in skill valuation,\nreshaping workplace hierarchies and redefining the nature of creativity in the\nAI era. These insights advance theories of human capital and automation while\nproviding actionable guidance for organizations navigating AI integration\namidst workforce inequalities.\n","authors":["Meiling Huang","Ming Jin","Ning Li"],"pdf_url":"https://arxiv.org/pdf/2412.03963v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03957v1","updated":"2024-12-05T08:15:37Z","published":"2024-12-05T08:15:37Z","title":"A Framework For Image Synthesis Using Supervised Contrastive Learning","summary":" Text-to-image (T2I) generation aims at producing realistic images\ncorresponding to text descriptions. Generative Adversarial Network (GAN) has\nproven to be successful in this task. Typical T2I GANs are 2 phase methods that\nfirst pretrain an inter-modal representation from aligned image-text pairs and\nthen use GAN to train image generator on that basis. However, such\nrepresentation ignores the inner-modal semantic correspondence, e.g. the images\nwith same label. The semantic label in priory describes the inherent\ndistribution pattern with underlying cross-image relationships, which is\nsupplement to the text description for understanding the full characteristics\nof image. In this paper, we propose a framework leveraging both inter- and\ninner-modal correspondence by label guided supervised contrastive learning. We\nextend the T2I GANs to two parameter-sharing contrast branches in both\npretraining and generation phases. 
This integration effectively clusters the\nsemantically similar image-text pair representations, thereby fostering the\ngeneration of higher-quality images. We demonstrate our framework on four novel\nT2I GANs by both single-object dataset CUB and multi-object dataset COCO,\nachieving significant improvements in the Inception Score (IS) and Frechet\nInception Distance (FID) metrics of imagegeneration evaluation. Notably, on\nmore complex multi-object COCO, our framework improves FID by 30.1%, 27.3%,\n16.2% and 17.1% for AttnGAN, DM-GAN, SSA-GAN and GALIP, respectively. We also\nvalidate our superiority by comparing with other label guided T2I GANs. The\nresults affirm the effectiveness and competitiveness of our approach in\nadvancing the state-of-the-art GAN for T2I generation\n","authors":["Yibin Liu","Jianyu Zhang","Li Zhang","Shijian Li","Gang Pan"],"pdf_url":"https://arxiv.org/pdf/2412.03957v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03944v1","updated":"2024-12-05T07:47:29Z","published":"2024-12-05T07:47:29Z","title":"Chain-of-Thought in Large Language Models: Decoding, Projection, and\n Activation","summary":" Chain-of-Thought prompting has significantly enhanced the reasoning\ncapabilities of large language models, with numerous studies exploring factors\ninfluencing its performance. However, the underlying mechanisms remain poorly\nunderstood. To further demystify the operational principles, this work examines\nthree key aspects: decoding, projection, and activation, aiming to elucidate\nthe changes that occur within models when employing Chainof-Thought. 
Our\nfindings reveal that LLMs effectively imitate exemplar formats while\nintegrating them with their understanding of the question, exhibiting\nfluctuations in token logits during generation but ultimately producing a more\nconcentrated logits distribution, and activating a broader set of neurons in\nthe final layers, indicating more extensive knowledge retrieval compared to\nstandard prompts. Our code and data will be publicly avialable when the paper\nis accepted.\n","authors":["Hao Yang","Qianghua Zhao","Lei Li"],"pdf_url":"https://arxiv.org/pdf/2412.03944v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03941v1","updated":"2024-12-05T07:44:18Z","published":"2024-12-05T07:44:18Z","title":"Enhancing and Accelerating Diffusion-Based Inverse Problem Solving\n through Measurements Optimization","summary":" Diffusion models have recently demonstrated notable success in solving\ninverse problems. However, current diffusion model-based solutions typically\nrequire a large number of function evaluations (NFEs) to generate high-quality\nimages conditioned on measurements, as they incorporate only limited\ninformation at each step. To accelerate the diffusion-based inverse\nproblem-solving process, we introduce \\textbf{M}easurements\n\\textbf{O}ptimization (MO), a more efficient plug-and-play module for\nintegrating measurement information at each step of the inverse problem-solving\nprocess. This method is comprehensively evaluated across eight diverse linear\nand nonlinear tasks on the FFHQ and ImageNet datasets. 
By using MO, we\nestablish state-of-the-art (SOTA) performance across multiple tasks, with key\nadvantages: (1) it operates with no more than 100 NFEs, with phase retrieval on\nImageNet being the sole exception; (2) it achieves SOTA or near-SOTA results\neven at low NFE counts; and (3) it can be seamlessly integrated into existing\ndiffusion model-based solutions for inverse problems, such as DPS\n\\cite{chung2022diffusion} and Red-diff \\cite{mardani2023variational}. For\nexample, DPS-MO attains a peak signal-to-noise ratio (PSNR) of 28.71 dB on the\nFFHQ 256 dataset for high dynamic range imaging, setting a new SOTA benchmark\nwith only 100 NFEs, whereas current methods require between 1000 and 4000 NFEs\nfor comparable performance.\n","authors":["Tianyu Chen","Zhendong Wang","Mingyuan Zhou"],"pdf_url":"https://arxiv.org/pdf/2412.03941v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03934v1","updated":"2024-12-05T07:32:20Z","published":"2024-12-05T07:32:20Z","title":"InfiniCube: Unbounded and Controllable Dynamic 3D Driving Scene\n Generation with World-Guided Video Models","summary":" We present InfiniCube, a scalable method for generating unbounded dynamic 3D\ndriving scenes with high fidelity and controllability. Previous methods for\nscene generation either suffer from limited scales or lack geometric and\nappearance consistency along generated sequences. In contrast, we leverage the\nrecent advancements in scalable 3D representation and video models to achieve\nlarge dynamic scene generation that allows flexible controls through HD maps,\nvehicle bounding boxes, and text descriptions. First, we construct a\nmap-conditioned sparse-voxel-based 3D generative model to unleash its power for\nunbounded voxel world generation. Then, we re-purpose a video model and ground\nit on the voxel world through a set of carefully designed pixel-aligned\nguidance buffers, synthesizing a consistent appearance. 
Finally, we propose a\nfast feed-forward approach that employs both voxel and pixel branches to lift\nthe dynamic videos to dynamic 3D Gaussians with controllable objects. Our\nmethod can generate controllable and realistic 3D driving scenes, and extensive\nexperiments validate the effectiveness and superiority of our model.\n","authors":["Yifan Lu","Xuanchi Ren","Jiawei Yang","Tianchang Shen","Zhangjie Wu","Jun Gao","Yue Wang","Siheng Chen","Mike Chen","Sanja Fidler","Jiahui Huang"],"pdf_url":"https://arxiv.org/pdf/2412.03934v1.pdf","comment":"Project Page: https://research.nvidia.com/labs/toronto-ai/infinicube/"},{"id":"http://arxiv.org/abs/2410.12672v3","updated":"2024-12-05T07:27:31Z","published":"2024-10-16T15:36:13Z","title":"Context Matters: Leveraging Contextual Features for Time Series\n Forecasting","summary":" Time series forecasts are often influenced by exogenous contextual features\nin addition to their corresponding history. For example, in financial settings,\nit is hard to accurately predict a stock price without considering public\nsentiments and policy decisions in the form of news articles, tweets, etc.\nThough this is common knowledge, the current state-of-the-art (SOTA)\nforecasting models fail to incorporate such contextual information, owing to\nits heterogeneity and multimodal nature. To address this, we introduce\nContextFormer, a novel plug-and-play method to surgically integrate multimodal\ncontextual information into existing pre-trained forecasting models.\nContextFormer effectively distills forecast-specific information from rich\nmultimodal contexts, including categorical, continuous, time-varying, and even\ntextual information, to significantly enhance the performance of existing base\nforecasters. 
ContextFormer outperforms SOTA forecasting models by up to 30% on\na range of real-world datasets spanning energy, traffic, environmental, and\nfinancial domains.\n","authors":["Sameep Chattopadhyay","Pulkit Paliwal","Sai Shankar Narasimhan","Shubhankar Agarwal","Sandeep P. Chinchali"],"pdf_url":"https://arxiv.org/pdf/2410.12672v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03933v1","updated":"2024-12-05T07:23:14Z","published":"2024-12-05T07:23:14Z","title":"Exploring AI Text Generation, Retrieval-Augmented Generation, and\n Detection Technologies: a Comprehensive Overview","summary":" The rapid development of Artificial Intelligence (AI) has led to the creation\nof powerful text generation models, such as large language models (LLMs), which\nare widely used for diverse applications. However, concerns surrounding\nAI-generated content, including issues of originality, bias, misinformation,\nand accountability, have become increasingly prominent. This paper offers a\ncomprehensive overview of AI text generators (AITGs), focusing on their\nevolution, capabilities, and ethical implications. This paper also introduces\nRetrieval-Augmented Generation (RAG), a recent approach that improves the\ncontextual relevance and accuracy of text generation by integrating dynamic\ninformation retrieval. RAG addresses key limitations of traditional models,\nincluding their reliance on static knowledge and potential inaccuracies in\nhandling real-world data. Additionally, the paper reviews detection tools that\nhelp differentiate AI-generated text from human-written content and discusses\nthe ethical challenges these technologies pose. The paper explores future\ndirections for improving detection accuracy, supporting ethical AI development,\nand increasing accessibility. 
The paper contributes to a more responsible and\nreliable use of AI in content creation through these discussions.\n","authors":["Fnu Neha","Deepshikha Bhati","Deepak Kumar Shukla","Angela Guercio","Ben Ward"],"pdf_url":"https://arxiv.org/pdf/2412.03933v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16320v3","updated":"2024-12-05T07:14:52Z","published":"2024-09-21T03:45:05Z","title":"Developing a Thailand solar irradiance map using Himawari-8 satellite\n imageries and deep learning models","summary":" This paper presents an online platform showing Thailand solar irradiance map\nevery 30 minutes, available at https://www.cusolarforecast.com. The methodology\nfor estimating global horizontal irradiance (GHI) across Thailand relies on\ncloud index extracted from Himawari-8 satellite imagery, Ineichen clear-sky\nmodel with locally-tuned Linke turbidity, and machine learning models. The\nmethods take clear-sky irradiance, cloud index, re-analyzed GHI and temperature\ndata from the MERRA-2 database, and date-time as inputs for GHI estimation\nmodels, including LightGBM, LSTM, Informer, and Transformer. These are\nbenchmarked with the estimate from a commercial service X by evaluation of\n15-minute ground GHI data from 53 ground stations over 1.5 years during\n2022-2023. The results show that the four models exhibit comparable overall MAE\nperformance to the service X. The best model is LightGBM with an overall MAE of\n78.58 W/sqm and RMSE of 118.97 W/sqm, while the service X achieves the lowest\nMAE, RMSE, and MBE in cloudy condition. Obtaining re-analyzed MERRA-2 data for\nthe whole Thailand region is not economically feasible for deployment. When\nremoving these features, the Informer model has a winning performance in MAE of\n78.67 W/sqm. The obtained performance aligns with existing literature by taking\nthe climate zone and time granularity of data into consideration. 
As the map\nshows an estimate of GHI over 93,000 grids with a frequent update, the paper\nalso describes a computational framework for displaying the entire map. It\ntests the runtime performance of deep learning models in the GHI estimation\nprocess.\n","authors":["Suwichaya Suwanwimolkul","Natanon Tongamrak","Nuttamon Thungka","Naebboon Hoonchareon","Jitkomut Songsiri"],"pdf_url":"https://arxiv.org/pdf/2409.16320v3.pdf","comment":"23 pages, 14 figures"},{"id":"http://arxiv.org/abs/2412.03930v1","updated":"2024-12-05T07:12:53Z","published":"2024-12-05T07:12:53Z","title":"MIND: Effective Incorrect Assignment Detection through a Multi-Modal\n Structure-Enhanced Language Model","summary":" The rapid growth of academic publications has exacerbated the issue of author\nname ambiguity in online digital libraries. Despite advances in name\ndisambiguation algorithms, cumulative errors continue to undermine the\nreliability of academic systems. It is estimated that over 10% paper-author\nassignments are rectified when constructing the million-scale WhoIsWho\nbenchmark. Existing endeavors to detect incorrect assignments are either\nsemantic-based or graph-based approaches, which fall short of making full use\nof the rich text attributes of papers and implicit structural features defined\nvia the co-occurrence of paper attributes. To this end, this paper introduces a\nstructure-enhanced language model that combines key structural features from\ngraph-based methods with fine-grained semantic features from rich paper\nattributes to detect incorrect assignments. The proposed model is trained with\na highly effective multi-modal multi-turn instruction tuning framework, which\nincorporates task-guided instruction tuning, text-attribute modality, and\nstructural modality. Experimental results demonstrate that our model\noutperforms previous approaches, achieving top performance on the leaderboard\nof KDD Cup 2024. 
Our code has been publicly available.\n","authors":["Yunhe Pang","Bo Chen","Fanjin Zhang","Yanghui Rao","Jie Tang"],"pdf_url":"https://arxiv.org/pdf/2412.03930v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.21216v2","updated":"2024-12-05T07:09:27Z","published":"2024-10-28T17:01:52Z","title":"HoPE: A Novel Positional Encoding Without Long-Term Decay for Enhanced\n Context Awareness and Extrapolation","summary":" Many positional encodings (PEs) are designed to exhibit long-term decay,\nbased on an entrenched and long-standing inductive opinion: tokens farther away\nfrom the current position carry less relevant information. We argue that\nlong-term decay is outdated in the era of LLMs, as LLMs are now applied to\ntasks demanding precise retrieval of in-context information from arbitrary\npositions. Firstly, we present empirical analyses on various PEs, demonstrating\nthat models inherently learn attention with only a local-decay pattern while\nforming a U-shape pattern globally, contradicting the principle of long-term\ndecay. Furthermore, we conduct a detailed analysis of rotary position encoding\n(RoPE, a prevalent relative positional encoding in LLMs), and found that the\nU-shape attention is caused by some learned components, which are also the key\nfactor limiting RoPE's expressiveness and extrapolation.Inspired by these\ninsights, we propose High-frequency rotary Position Encoding (HoPE). HoPE\nreplaces the specific components in RoPE with position-independent ones,\nretaining only high-frequency signals, which also breaks the principle of\nlong-term decay in theory. HoPE achieves two major advantages: (1) Without\nconstraints imposed by long-term decay, contradictory factors that limit\nspontaneous attention optimization and model extrapolation performance are\nremoved. 
(2) Components representing positions and semantics are are optimized.\nThese enhances model's context awareness and extrapolation, as validated by\nextensive experiments.\n","authors":["Yuhan Chen","Ang Lv","Jian Luan","Bin Wang","Wei Liu"],"pdf_url":"https://arxiv.org/pdf/2410.21216v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03928v1","updated":"2024-12-05T07:07:35Z","published":"2024-12-05T07:07:35Z","title":"MT3DNet: Multi-Task learning Network for 3D Surgical Scene\n Reconstruction","summary":" In image-assisted minimally invasive surgeries (MIS), understanding surgical\nscenes is vital for real-time feedback to surgeons, skill evaluation, and\nimproving outcomes through collaborative human-robot procedures. Within this\ncontext, the challenge lies in accurately detecting, segmenting, and estimating\nthe depth of surgical scenes depicted in high-resolution images, while\nsimultaneously reconstructing the scene in 3D and providing segmentation of\nsurgical instruments along with detection labels for each instrument. To\naddress this challenge, a novel Multi-Task Learning (MTL) network is proposed\nfor performing these tasks concurrently. A key aspect of this approach involves\novercoming the optimization hurdles associated with handling multiple tasks\nconcurrently by integrating a Adversarial Weight Update into the MTL framework,\nthe proposed MTL model achieves 3D reconstruction through the integration of\nsegmentation, depth estimation, and object detection, thereby enhancing the\nunderstanding of surgical scenes, which marks a significant advancement\ncompared to existing studies that lack 3D capabilities. 
Comprehensive\nexperiments on the EndoVis2018 benchmark dataset underscore the adeptness of\nthe model in efficiently addressing all three tasks, demonstrating the efficacy\nof the proposed techniques.\n","authors":["Mithun Parab","Pranay Lendave","Jiyoung Kim","Thi Quynh Dan Nguyen","Palash Ingle"],"pdf_url":"https://arxiv.org/pdf/2412.03928v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.00741v3","updated":"2024-12-05T07:05:59Z","published":"2024-01-01T12:49:36Z","title":"ToolEyes: Fine-Grained Evaluation for Tool Learning Capabilities of\n Large Language Models in Real-world Scenarios","summary":" Existing evaluations of tool learning primarily focus on validating the\nalignment of selected tools for large language models (LLMs) with expected\noutcomes. However, these approaches rely on a limited set of scenarios where\nanswers can be pre-determined, diverging from genuine needs. Furthermore, a\nsole emphasis on outcomes disregards the complex capabilities required for LLMs\nto effectively use tools. To tackle this issue, we propose ToolEyes, a\nfine-grained system tailored for the evaluation of the LLMs' tool learning\ncapabilities in authentic scenarios. The system meticulously examines seven\nreal-world scenarios, analyzing five dimensions crucial to LLMs in tool\nlearning: format alignment, intent comprehension, behavior planning, tool\nselection, and answer organization. Additionally, ToolEyes incorporates a tool\nlibrary boasting approximately 600 tools, serving as an intermediary between\nLLMs and the physical world. Evaluations involving ten LLMs across three\ncategories reveal a preference for specific scenarios and limited cognitive\nabilities in tool learning. Intriguingly, expanding the model size even\nexacerbates the hindrance to tool learning. 
The code and data are available at\nhttps://github.com/Junjie-Ye/ToolEyes.\n","authors":["Junjie Ye","Guanyu Li","Songyang Gao","Caishuang Huang","Yilong Wu","Sixian Li","Xiaoran Fan","Shihan Dou","Tao Ji","Qi Zhang","Tao Gui","Xuanjing Huang"],"pdf_url":"https://arxiv.org/pdf/2401.00741v3.pdf","comment":"Accepted by COLING 2025 conference"},{"id":"http://arxiv.org/abs/2412.03331v2","updated":"2024-12-05T07:05:57Z","published":"2024-12-04T14:02:12Z","title":"LuxEmbedder: A Cross-Lingual Approach to Enhanced Luxembourgish Sentence\n Embeddings","summary":" Sentence embedding models play a key role in various Natural Language\nProcessing tasks, such as in Topic Modeling, Document Clustering and\nRecommendation Systems. However, these models rely heavily on parallel data,\nwhich can be scarce for many low-resource languages, including Luxembourgish.\nThis scarcity results in suboptimal performance of monolingual and\ncross-lingual sentence embedding models for these languages. To address this\nissue, we compile a relatively small but high-quality human-generated\ncross-lingual parallel dataset to train LuxEmbedder, an enhanced sentence\nembedding model for Luxembourgish with strong cross-lingual capabilities.\nAdditionally, we present evidence suggesting that including low-resource\nlanguages in parallel training datasets can be more advantageous for other\nlow-resource languages than relying solely on high-resource language pairs.\nFurthermore, recognizing the lack of sentence embedding benchmarks for\nlow-resource languages, we create a paraphrase detection benchmark specifically\nfor Luxembourgish, aiming to partially fill this gap and promote further\nresearch.\n","authors":["Fred Philippy","Siwen Guo","Jacques Klein","Tegawendé F. 
Bissyandé"],"pdf_url":"https://arxiv.org/pdf/2412.03331v2.pdf","comment":"Accepted at COLING 2025"},{"id":"http://arxiv.org/abs/2408.10618v2","updated":"2024-12-05T06:54:29Z","published":"2024-08-20T07:50:29Z","title":"OMEGA: Efficient Occlusion-Aware Navigation for Air-Ground Robot in\n Dynamic Environments via State Space Model","summary":" Air-ground robots (AGRs) are widely used in surveillance and disaster\nresponse due to their exceptional mobility and versatility (i.e., flying and\ndriving). Current AGR navigation systems perform well in static occlusion-prone\nenvironments (e.g., indoors) by using 3D semantic occupancy networks to predict\nocclusions for complete local mapping and then computing Euclidean Signed\nDistance Field (ESDF) for path planning. However, these systems face challenges\nin dynamic, severe occlusion scenes (e.g., crowds) due to limitations in\nperception networks' low prediction accuracy and path planners' high\ncomputation overhead. In this paper, we propose OMEGA, which contains OccMamba\nwith an Efficient AGR-Planner to address the above-mentioned problems. OccMamba\nadopts a novel architecture that separates semantic and occupancy prediction\ninto independent branches, incorporating two mamba blocks within these\nbranches. These blocks efficiently extract semantic and geometric features in\n3D environments with linear complexity, ensuring that the network can learn\nlong-distance dependencies to improve prediction accuracy. Semantic and\ngeometric features are combined within the Bird's Eye View (BEV) space to\nminimise computational overhead during feature fusion. The resulting semantic\noccupancy map is then seamlessly integrated into the local map, providing\nocclusion awareness of the dynamic environment. Our AGR-Planner utilizes this\nlocal map and employs kinodynamic A* search and gradient-based trajectory\noptimization to guarantee planning is ESDF-free and energy-efficient. 
Extensive\nexperiments demonstrate that OccMamba outperforms the state-of-the-art 3D\nsemantic occupancy network with 25.0% mIoU. End-to-end navigation experiments\nin dynamic scenes verify OMEGA's efficiency, achieving a 96% average planning\nsuccess rate. Code and video are available at\nhttps://jmwang0117.github.io/OMEGA/.\n","authors":["Junming Wang","Xiuxian Guan","Zekai Sun","Tianxiang Shen","Dong Huang","Fangming Liu","Heming Cui"],"pdf_url":"https://arxiv.org/pdf/2408.10618v2.pdf","comment":"Accepted to IEEE RA-L | OccMamba is here!"},{"id":"http://arxiv.org/abs/2412.01644v2","updated":"2024-12-05T06:49:37Z","published":"2024-12-02T15:56:08Z","title":"Concept Based Continuous Prompts for Interpretable Text Classification","summary":" Continuous prompts have become widely adopted for augmenting performance\nacross a wide range of natural language tasks. However, the underlying\nmechanism of this enhancement remains obscure. Previous studies rely on\nindividual words for interpreting continuous prompts, which lacks comprehensive\nsemantic understanding. Drawing inspiration from Concept Bottleneck Models, we\npropose a framework for interpreting continuous prompts by decomposing them\ninto human-readable concepts. Specifically, to ensure the feasibility of the\ndecomposition, we demonstrate that a corresponding concept embedding matrix and\na coefficient matrix can always be found to replace the prompt embedding\nmatrix. Then, we employ GPT-4o to generate a concept pool and choose potential\ncandidate concepts that are discriminative and representative using a novel\nsubmodular optimization algorithm. Experiments demonstrate that our framework\ncan achieve similar results as the original P-tuning and word-based approaches\nusing only a few concepts while providing more plausible results. 
Our code is\navailable at https://github.com/qq31415926/CD.\n","authors":["Qian Chen","Dongyang Li","Xiaofeng He"],"pdf_url":"https://arxiv.org/pdf/2412.01644v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.07754v3","updated":"2024-12-05T06:49:06Z","published":"2024-02-12T16:23:28Z","title":"Diffusion of Thoughts: Chain-of-Thought Reasoning in Diffusion Language\n Models","summary":" Recently, diffusion models have garnered significant interest in the field of\ntext processing due to their many potential advantages compared to conventional\nautoregressive models. In this work, we propose Diffusion-of-Thought (DoT), a\nnovel approach that integrates diffusion models with Chain-of-Thought, a\nwell-established technique for improving the reasoning ability of\nautoregressive language models. In contrast to autoregressive language models\nthat make decisions in a left-to-right, token-by-token manner, DoT allows\nreasoning steps to diffuse over time through a diffusion language model and\noffers greater flexibility in trading-off computation for reasoning\nperformance. Our experimental results demonstrate the effectiveness of DoT in\nmulti-digit multiplication, boolean logic, and grade school math problems, with\na small diffusion model outperforming a much larger autoregressive model in\nboth efficiency and accuracy. In addition to that, DoT showcases promising\nself-correction abilities and benefits from existing reasoning-enhancing\ntechniques like self-consistency decoding. 
Our findings contribute to the\nunderstanding and development of reasoning with diffusion language models.\n","authors":["Jiacheng Ye","Shansan Gong","Liheng Chen","Lin Zheng","Jiahui Gao","Han Shi","Chuan Wu","Xin Jiang","Zhenguo Li","Wei Bi","Lingpeng Kong"],"pdf_url":"https://arxiv.org/pdf/2402.07754v3.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2412.03920v1","updated":"2024-12-05T06:46:46Z","published":"2024-12-05T06:46:46Z","title":"A Survey on Large Language Model-Based Social Agents in Game-Theoretic\n Scenarios","summary":" Game-theoretic scenarios have become pivotal in evaluating the social\nintelligence of Large Language Model (LLM)-based social agents. While numerous\nstudies have explored these agents in such settings, there is a lack of a\ncomprehensive survey summarizing the current progress. To address this gap, we\nsystematically review existing research on LLM-based social agents within\ngame-theoretic scenarios. Our survey organizes the findings into three core\ncomponents: Game Framework, Social Agent, and Evaluation Protocol. The game\nframework encompasses diverse game scenarios, ranging from choice-focusing to\ncommunication-focusing games. The social agent part explores agents'\npreferences, beliefs, and reasoning abilities. 
The evaluation protocol covers\nboth game-agnostic and game-specific metrics for assessing agent performance.\nBy reflecting on the current research and identifying future research\ndirections, this survey provides insights to advance the development and\nevaluation of social agents in game-theoretic scenarios.\n","authors":["Xiachong Feng","Longxu Dou","Ella Li","Qinghao Wang","Haochuan Wang","Yu Guo","Chang Ma","Lingpeng Kong"],"pdf_url":"https://arxiv.org/pdf/2412.03920v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02454v3","updated":"2024-12-05T06:38:13Z","published":"2024-04-03T04:50:43Z","title":"Techniques for Measuring the Inferential Strength of Forgetting Policies","summary":" The technique of forgetting in knowledge representation has been shown to be\na powerful and useful knowledge engineering tool with widespread application.\nYet, very little research has been done on how different policies of\nforgetting, or use of different forgetting operators, affects the inferential\nstrength of the original theory. The goal of this paper is to define loss\nfunctions for measuring changes in inferential strength based on intuitions\nfrom model counting and probability theory. Properties of such loss measures\nare studied and a pragmatic knowledge engineering tool is proposed for\ncomputing loss measures using ProbLog. The paper includes a working methodology\nfor studying and determining the strength of different forgetting policies, in\naddition to concrete examples showing how to apply the theoretical results\nusing ProbLog. 
Although the focus is on forgetting, the results are much more\ngeneral and should have wider application to other areas.\n","authors":["Patrick Doherty","Andrzej Szalas"],"pdf_url":"https://arxiv.org/pdf/2404.02454v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.00310v2","updated":"2024-12-05T06:28:11Z","published":"2024-08-31T00:33:17Z","title":"Objective Features Extracted from Motor Activity Time Series for Food\n Addiction Analysis Using Machine Learning","summary":" This study investigates machine learning algorithms to identify objective\nfeatures for diagnosing food addiction (FA) and assessing confirmed symptoms\n(SC). Data were collected from 81 participants (mean age: 21.5 years, range:\n18-61 years, women: 77.8%) whose FA and SC were measured using the Yale Food\nAddiction Scale (YFAS). Participants provided demographic and anthropometric\ndata, completed the YFAS, the Zung Self-Rating Depression Scale, and the Dutch\nEating Behavior Questionnaire, and wore an actimeter on the non-dominant wrist\nfor a week to record motor activity. Analysis of the actimetric data identified\nsignificant statistical and entropy-based features that accurately predicted FA\nand SC using ML. The Matthews correlation coefficient (MCC) was the primary\nmetric. Activity-related features were more effective for FA prediction\n(MCC=0.88) than rest-related features (MCC=0.68). For SC, activity segments\nyielded MCC=0.47, rest segments MCC=0.38, and their combination MCC=0.51.\nSignificant correlations were also found between actimetric features related to\nFA, emotional, and restrained eating behaviors, supporting the model's\nvalidity. 
Our results support the concept of a human bionic suite composed of\nIoT devices and ML sensors, which implements health digital assistance with\nreal-time monitoring and analysis of physiological indicators related to FA and\nSC.\n","authors":["Mikhail Borisenkov","Andrei Velichko","Maksim Belyaev","Dmitry Korzun","Tatyana Tserne","Larisa Bakutova","Denis Gubin"],"pdf_url":"https://arxiv.org/pdf/2409.00310v2.pdf","comment":"16 pages, 3 figures, 14 tables"},{"id":"http://arxiv.org/abs/2412.03905v1","updated":"2024-12-05T06:21:31Z","published":"2024-12-05T06:21:31Z","title":"Integrating Various Software Artifacts for Better LLM-based Bug\n Localization and Program Repair","summary":" LLMs have garnered considerable attention for their potential to streamline\nAutomated Program Repair (APR). LLM-based approaches can either insert the\ncorrect code or directly generate patches when provided with buggy methods.\nHowever, most of LLM-based APR methods rely on a single type of software\ninformation, without fully leveraging different software artifacts. Despite\nthis, many LLM-based approaches do not explore which specific types of\ninformation best assist in APR. Addressing this gap is crucial for advancing\nLLM-based APR techniques. We propose DEVLoRe to use issue content (description\nand message) and stack error traces to localize buggy methods, then rely on\ndebug information in buggy methods and issue content and stack error to\nlocalize buggy lines and generate plausible patches which can pass all unit\ntests. The results show that while issue content is particularly effective in\nassisting LLMs with fault localization and program repair, different types of\nsoftware artifacts complement each other. By incorporating different artifacts,\nDEVLoRe successfully locates 49.3% and 47.6% of single and non-single buggy\nmethods and generates 56.0% and 14.5% plausible patches for the Defects4J v2.0\ndataset, respectively. 
This outperforms current state-of-the-art APR methods.\nThe source code and experimental results of this work for replication are\navailable at https://github.com/XYZboom/DEVLoRe.\n","authors":["Qiong Feng","Xiaotian Ma","Jiayi Sheng","Ziyuan Feng","Wei Song","Peng Liang"],"pdf_url":"https://arxiv.org/pdf/2412.03905v1.pdf","comment":"22 pages, 11 images, 9 tables, Manuscript submitted to a journal\n (2024)"},{"id":"http://arxiv.org/abs/2309.11087v6","updated":"2024-12-05T06:21:03Z","published":"2023-09-20T06:30:39Z","title":"Embed-Search-Align: DNA Sequence Alignment using Transformer Models","summary":" DNA sequence alignment involves assigning short DNA reads to the most\nprobable locations on an extensive reference genome. This process is crucial\nfor various genomic analyses, including variant calling, transcriptomics, and\nepigenomics. Conventional methods, refined over decades, tackle this challenge\nin 2 steps: genome indexing followed by efficient search to locate likely\npositions for given reads. Building on the success of Large Language Models in\nencoding text into embeddings, where the distance metric captures semantic\nsimilarity, recent efforts have explored whether the same Transformer\narchitecture can produce embeddings for DNA sequences. Such models have shown\nearly promise in classifying short DNA sequences, such as detecting\ncoding/non-coding regions, and enhancer, promoter sequences. However,\nperformance at sequence classification tasks does not translate to sequence\nalignment, where it is necessary to search across the genome to align each\nread, a significantly longer-range task. We bridge this gap by framing the\nSequence Alignment task for Transformer models as an \"Embed-Search-Align\" task.\nIn this framework, a novel Reference-Free DNA Embedding model generates\nembeddings of reads and reference fragments, which are projected into a shared\nvector space where the read-fragment distance is used as a surrogate for\nalignment. 
Technical contributions include: (1) Contrastive loss for\nself-supervised training of DNA sequence representations, facilitating rich\nreference-free, sequence-level embeddings, and (2) a DNA vector store to enable\nsearch across fragments on a global scale. DNA-ESA is 99% accurate when\naligning 250-length reads onto a human genome (3gb), rivaling conventional\nmethods such as Bowtie and BWA-Mem. DNA-ESA exceeds the performance of 6\nTransformer model baselines such as Nucleotide Transformer, Hyena-DNA, and\nshows task transfer across chromosomes and species.\n","authors":["Pavan Holur","K. C. Enevoldsen","Shreyas Rajesh","Lajoyce Mboning","Thalia Georgiou","Louis-S. Bouchard","Matteo Pellegrini","Vwani Roychowdhury"],"pdf_url":"https://arxiv.org/pdf/2309.11087v6.pdf","comment":"12 pages, Tables 7, Figures 6"},{"id":"http://arxiv.org/abs/2412.03904v1","updated":"2024-12-05T06:20:47Z","published":"2024-12-05T06:20:47Z","title":"MISR: Measuring Instrumental Self-Reasoning in Frontier Models","summary":" We propose a suite of tasks to evaluate the instrumental self-reasoning\nability of large language model (LLM) agents. Instrumental self-reasoning\nability could improve adaptability and enable self-modification, but it could\nalso pose significant risks, such as enabling deceptive alignment. Prior work\nhas only evaluated self-reasoning in non-agentic settings or in limited\ndomains. In this paper, we propose evaluations for instrumental self-reasoning\nability in agentic tasks in a wide range of scenarios, including\nself-modification, knowledge seeking, and opaque self-reasoning. We evaluate\nagents built using state-of-the-art LLMs, including commercial and open source\nsystems. We find that instrumental self-reasoning ability emerges only in the\nmost capable frontier models and that it is highly context-dependent. 
No model\npasses the the most difficult versions of our evaluations, hence our evaluation\ncan be used to measure increases in instrumental self-reasoning ability in\nfuture models. We open-source our evaluations at\nhttps://github.com/kaifronsdal/Self-Reasoning-Evals.\n","authors":["Kai Fronsdal","David Lindner"],"pdf_url":"https://arxiv.org/pdf/2412.03904v1.pdf","comment":"10 pages, 65 page appendix, 5 figures"},{"id":"http://arxiv.org/abs/2412.03903v1","updated":"2024-12-05T06:20:19Z","published":"2024-12-05T06:20:19Z","title":"Using SlowFast Networks for Near-Miss Incident Analysis in Dashcam\n Videos","summary":" This paper classifies near-miss traffic videos using the SlowFast deep neural\nnetwork that mimics the characteristics of the slow and fast visual information\nprocessed by two different streams from the M (Magnocellular) and P\n(Parvocellular) cells of the human brain. The approach significantly improves\nthe accuracy of the traffic near-miss video analysis and presents insights into\nhuman visual perception in traffic scenarios. Moreover, it contributes to\ntraffic safety enhancements and provides novel perspectives on the potential\ncognitive errors in traffic accidents.\n","authors":["Yucheng Zhang","Koichi Emura","Eiji Watanabe"],"pdf_url":"https://arxiv.org/pdf/2412.03903v1.pdf","comment":"Best Research Paper Award for Asia-Pacific Region, The 30th ITS World\n Congress 2024"},{"id":"http://arxiv.org/abs/2412.03895v1","updated":"2024-12-05T06:09:56Z","published":"2024-12-05T06:09:56Z","title":"A Noise is Worth Diffusion Guidance","summary":" Diffusion models excel in generating high-quality images. However, current\ndiffusion models struggle to produce reliable images without guidance methods,\nsuch as classifier-free guidance (CFG). Are guidance methods truly necessary?\nObserving that noise obtained via diffusion inversion can reconstruct\nhigh-quality images without guidance, we focus on the initial noise of the\ndenoising pipeline. 
By mapping Gaussian noise to `guidance-free noise', we\nuncover that small low-magnitude low-frequency components significantly enhance\nthe denoising process, removing the need for guidance and thus improving both\ninference throughput and memory. Expanding on this, we propose \\ours, a novel\nmethod that replaces guidance methods with a single refinement of the initial\nnoise. This refined noise enables high-quality image generation without\nguidance, within the same diffusion pipeline. Our noise-refining model\nleverages efficient noise-space learning, achieving rapid convergence and\nstrong performance with just 50K text-image pairs. We validate its\neffectiveness across diverse metrics and analyze how refined noise can\neliminate the need for guidance. See our project page:\nhttps://cvlab-kaist.github.io/NoiseRefine/.\n","authors":["Donghoon Ahn","Jiwon Kang","Sanghyun Lee","Jaewon Min","Minjae Kim","Wooseok Jang","Hyoungwon Cho","Sayak Paul","SeonHwa Kim","Eunju Cha","Kyong Hwan Jin","Seungryong Kim"],"pdf_url":"https://arxiv.org/pdf/2412.03895v1.pdf","comment":"Project page: https://cvlab-kaist.github.io/NoiseRefine/"},{"id":"http://arxiv.org/abs/2412.03894v1","updated":"2024-12-05T06:05:12Z","published":"2024-12-05T06:05:12Z","title":"Machine Learning-based Android Intrusion Detection System","summary":" The android operating system is being installed in most of the smart devices.\nThe introduction of intrusions in such operating systems is rising at a\ntremendous rate. With the introduction of such malicious data streams, the\nsmart devices are being subjected to various attacks like Phishing, Spyware,\nSMS Fraud, Bots and Banking-Trojans and many such. The application of machine\nlearning classification algorithms for the security of android APK files is\nused in this paper. Each apk data stream was marked to be either malicious or\nnon malicious on the basis of different parameters. 
The machine learning\nclassification techniques are then used to classify whether the newly installed\napplications' signature falls within the malicious or non-malicious domain. If\nit falls within the malicious category, appropriate action can be taken, and\nthe Android operating system can be shielded against illegal activities.\n","authors":["Madiha Tahreem","Ifrah Andleeb","Bilal Zahid Hussain","Arsalan Hameed"],"pdf_url":"https://arxiv.org/pdf/2412.03894v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03893v1","updated":"2024-12-05T06:03:09Z","published":"2024-12-05T06:03:09Z","title":"Dual-Branch Subpixel-Guided Network for Hyperspectral Image\n Classification","summary":" Deep learning (DL) has been widely applied into hyperspectral image (HSI)\nclassification owing to its promising feature learning and representation\ncapabilities. However, limited by the spatial resolution of sensors, existing\nDL-based classification approaches mainly focus on pixel-level spectral and\nspatial information extraction through complex network architecture design,\nwhile ignoring the existence of mixed pixels in actual scenarios. To tackle\nthis difficulty, we propose a novel dual-branch subpixel-guided network for HSI\nclassification, called DSNet, which automatically integrates subpixel\ninformation and convolutional class features by introducing a deep autoencoder\nunmixing architecture to enhance classification performance. DSNet is capable\nof fully considering physically nonlinear properties within subpixels and\nadaptively generating diagnostic abundances in an unsupervised manner to\nachieve more reliable decision boundaries for class label distributions. The\nsubpixel fusion module is designed to ensure high-quality information fusion\nacross pixel and subpixel features, further promoting stable joint\nclassification. 
Experimental results on three benchmark datasets demonstrate\nthe effectiveness and superiority of DSNet compared with state-of-the-art\nDL-based HSI classification approaches. The codes will be available at\nhttps://github.com/hanzhu97702/DSNet, contributing to the remote sensing\ncommunity.\n","authors":["Zhu Han","Jin Yang","Lianru Gao","Zhiqiang Zeng","Bing Zhang","Jocelyn Chanussot"],"pdf_url":"https://arxiv.org/pdf/2412.03893v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.04180v3","updated":"2024-12-05T06:01:33Z","published":"2023-05-07T03:39:31Z","title":"Train a Real-world Local Path Planner in One Hour via Partially\n Decoupled Reinforcement Learning and Vectorized Diversity","summary":" Deep Reinforcement Learning (DRL) has exhibited efficacy in resolving the\nLocal Path Planning (LPP) problem. However, such application in the real world\nis immensely limited due to the deficient training efficiency and\ngeneralization capability of DRL. To alleviate these two issues, a solution\nnamed Color is proposed, which consists of an Actor-Sharer-Learner (ASL)\ntraining framework and a mobile robot-oriented simulator Sparrow. Specifically,\nthe ASL intends to improve the training efficiency of DRL algorithms. It\nemploys a Vectorized Data Collection (VDC) mode to expedite data acquisition,\ndecouples the data collection from model optimization by multithreading, and\npartially connects the two procedures by harnessing a Time Feedback Mechanism\n(TFM) to evade data underuse or overuse. Meanwhile, the Sparrow simulator\nutilizes a 2D grid-based world, simplified kinematics, and conversion-free data\nflow to achieve a lightweight design. The lightness facilitates vectorized\ndiversity, allowing diversified simulation setups across extensive copies of\nthe vectorized environments, resulting in a notable enhancement in the\ngeneralization capability of the DRL algorithm being trained. 
Comprehensive\nexperiments, comprising 57 DRL benchmark environments, 32 simulated and 36\nreal-world LPP scenarios, have been conducted to corroborate the superiority of\nour method in terms of efficiency and generalization. The code and the video of\nthis paper are accessible at https://github.com/XinJingHao/Color.\n","authors":["Jinghao Xin","Jinwoo Kim","Zhi Li","Ning Li"],"pdf_url":"https://arxiv.org/pdf/2305.04180v3.pdf","comment":"36 pages"},{"id":"http://arxiv.org/abs/2412.02372v2","updated":"2024-12-05T06:00:34Z","published":"2024-12-03T10:58:34Z","title":"HERO: Hint-Based Efficient and Reliable Query Optimizer","summary":" We propose a novel model for learned query optimization which provides query\nhints leading to better execution plans. The model addresses the three key\nchallenges in learned hint-based query optimization: reliable hint\nrecommendation (ensuring non-degradation of query latency), efficient hint\nexploration, and fast inference. We provide an in-depth analysis of existing\nNN-based approaches to hint-based optimization and experimentally confirm the\nnamed challenges for them. Our alternative solution consists of a new inference\nschema based on an ensemble of context-aware models and a graph storage for\nreliable hint suggestion and fast inference, and a budget-controlled training\nprocedure with a local search algorithm that solves the issue of exponential\nsearch space exploration. In experiments on standard benchmarks, our model\ndemonstrates optimization capability close to the best achievable with\ncoarse-grained hints. Controlling the degree of parallelism (query dop) in\naddition to operator-related hints enables our model to achieve 3x latency\nimprovement on JOB benchmark which sets a new standard for optimization. 
Our\nmodel is interpretable and easy to debug, which is particularly important for\ndeployment in production.\n","authors":["Sergey Zinchenko","Sergey Iazov"],"pdf_url":"https://arxiv.org/pdf/2412.02372v2.pdf","comment":"Submitted to VLDB 2025; 13 pages; 13 figures"},{"id":"http://arxiv.org/abs/2109.13479v3","updated":"2024-12-05T05:50:39Z","published":"2021-09-28T04:31:23Z","title":"Knowledge Transfer based Evolutionary Deep Neural Network for\n Intelligent Fault Diagnosis","summary":" A fault diagnosis with commendable accuracy is essential for the reliability\nof industrial machines. Two main challenges affect the design of\nhigh-performing intelligent systems: (i) the selection of a suitable model and\n(ii) domain adaptation if there is a continuous change in operating conditions.\nTherefore, we propose an evolutionary Net2Net transformation (EvoN2N) that\nfinds the best suitable DNN architecture with limited availability of labeled\ndata samples. Net2Net transformation-based quick learning algorithm has been\nused in the evolutionary framework of Non-dominated sorting genetic algorithm\nII to obtain the best DNN architecture. Net2Net transformation-based quick\nlearning algorithm uses the concept of knowledge transfer from one generation\nto the next for faster fitness evaluation. The proposed framework can obtain\nthe best model for intelligent fault diagnosis without a long and\ntime-consuming search process. The proposed framework has been validated on the\nCase Western Reserve University dataset, the Paderborn University dataset, and\nthe gearbox fault detection dataset under different operating conditions. The\nbest models obtained are capable of demonstrating an excellent diagnostic\nperformance and classification accuracy of almost up to 100\\% for most of the\noperating conditions.\n","authors":["Arun K. Sharma","Nishchal K. 
Verma"],"pdf_url":"https://arxiv.org/pdf/2109.13479v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03886v1","updated":"2024-12-05T05:39:03Z","published":"2024-12-05T05:39:03Z","title":"Uniform Discretized Integrated Gradients: An effective attribution based\n method for explaining large language models","summary":" Integrated Gradients is a well-known technique for explaining deep learning\nmodels. It calculates feature importance scores by employing a gradient based\napproach computing gradients of the model output with respect to input features\nand accumulating them along a linear path. While this works well for continuous\nfeatures spaces, it may not be the most optimal way to deal with discrete\nspaces like word embeddings. For interpreting LLMs (Large Language Models),\nthere exists a need for a non-linear path where intermediate points, whose\ngradients are to be computed, lie close to actual words in the embedding space.\nIn this paper, we propose a method called Uniform Discretized Integrated\nGradients (UDIG) based on a new interpolation strategy where we choose a\nfavorable nonlinear path for computing attribution scores suitable for\npredictive language models. We evaluate our method on two types of NLP tasks-\nSentiment Classification and Question Answering against three metrics viz Log\nodds, Comprehensiveness and Sufficiency. 
For sentiment classification, we have\nused the SST2, IMDb and Rotten Tomatoes datasets for benchmarking and for\nQuestion Answering, we have used the fine-tuned BERT model on SQuAD dataset.\nOur approach outperforms the existing methods in almost all the metrics.\n","authors":["Swarnava Sinha Roy","Ayan Kundu"],"pdf_url":"https://arxiv.org/pdf/2412.03886v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08188v4","updated":"2024-12-05T05:37:46Z","published":"2024-08-15T14:46:13Z","title":"Nl2Hltl2Plan: Scaling Up Natural Language Understanding for Multi-Robots\n Through Hierarchical Temporal Logic Task Representation","summary":" To enable non-experts to specify long-horizon, multi-robot collaborative\ntasks, language models are increasingly used to translate natural language\ncommands into formal specifications. However, because translation can occur in\nmultiple ways, such translations may lack accuracy or lead to inefficient\nmulti-robot planning. Our key insight is that concise hierarchical\nspecifications can simplify planning while remaining straightforward to derive\nfrom human instructions. We propose Nl2Hltl2Plan, a framework that translates\nnatural language commands into hierarchical Linear Temporal Logic (LTL) and\nsolves the corresponding planning problem. The translation involves two steps\nleveraging Large Language Models (LLMs). First, an LLM transforms instructions\ninto a Hierarchical Task Tree, capturing logical and temporal relations. Next,\na fine-tuned LLM converts sub-tasks into flat LTL formulas, which are\naggregated into hierarchical specifications, with the lowest level\ncorresponding to ordered robot actions. These specifications are then used with\noff-the-shelf planners. Our Nl2Hltl2Plan demonstrates the potential of LLMs in\nhierarchical reasoning for multi-robot task planning. 
Evaluations in simulation\nand real-world experiments with human participants show that Nl2Hltl2Plan\noutperforms existing methods, handling more complex instructions while\nachieving higher success rates and lower costs in task allocation and planning.\nAdditional details are available at https://nl2hltl2plan.github.io .\n","authors":["Shaojun Xu","Xusheng Luo","Yutong Huang","Letian Leng","Ruixuan Liu","Changliu Liu"],"pdf_url":"https://arxiv.org/pdf/2408.08188v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.09459v2","updated":"2024-12-05T05:37:23Z","published":"2024-05-15T15:52:27Z","title":"Fourier Boundary Features Network with Wider Catchers for Glass\n Segmentation","summary":" Glass largely blurs the boundary between the real world and the reflection.\nThe special transmittance and reflectance quality have confused the semantic\ntasks related to machine vision. Therefore, how to clear the boundary built by\nglass, and avoid over-capturing features as false positive information in deep\nstructure, matters for constraining the segmentation of reflection surface and\npenetrating glass. We proposed the Fourier Boundary Features Network with Wider\nCatchers (FBWC), which might be the first attempt to utilize sufficiently wide\nhorizontal shallow branches without vertical deepening for guiding the fine\ngranularity segmentation boundary through primary glass semantic information.\nSpecifically, we designed the Wider Coarse-Catchers (WCC) for anchoring large\narea segmentation and reducing excessive extraction from a structural\nperspective. We embed fine-grained features by Cross Transpose Attention (CTA),\nwhich is introduced to avoid the incomplete area within the boundary caused by\nreflection noise. For excavating glass features and balancing high-low layers\ncontext, a learnable Fourier Convolution Controller (FCC) is proposed to\nregulate information integration robustly. 
The proposed method has been\nvalidated on three different public glass segmentation datasets. Experimental\nresults reveal that the proposed method yields better segmentation performance\ncompared with the state-of-the-art (SOTA) methods in glass image segmentation.\n","authors":["Xiaolin Qin","Jiacen Liu","Qianlei Wang","Shaolin Zhang","Fei Zhu","Zhang Yi"],"pdf_url":"https://arxiv.org/pdf/2405.09459v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03884v1","updated":"2024-12-05T05:30:10Z","published":"2024-12-05T05:30:10Z","title":"A Unified Framework for Evaluating the Effectiveness and Enhancing the\n Transparency of Explainable AI Methods in Real-World Applications","summary":" The rapid advancement of deep learning has resulted in substantial\nadvancements in AI-driven applications; however, the \"black box\" characteristic\nof these models frequently constrains their interpretability, transparency, and\nreliability. Explainable artificial intelligence (XAI) seeks to elucidate AI\ndecision-making processes, guaranteeing that explanations faithfully represent\nthe model's rationale and correspond with human comprehension. Despite\ncomprehensive research in XAI, a significant gap persists in standardized\nprocedures for assessing the efficacy and transparency of XAI techniques across\nmany real-world applications. This study presents a unified XAI evaluation\nframework incorporating extensive quantitative and qualitative criteria to\nsystematically evaluate the correctness, interpretability, robustness,\nfairness, and completeness of explanations generated by AI models. The\nframework prioritizes user-centric and domain-specific adaptations, hence\nimproving the usability and reliability of AI models in essential domains. To\naddress deficiencies in existing evaluation processes, we suggest defined\nbenchmarks and a systematic evaluation pipeline that includes data loading,\nexplanation development, and thorough method assessment. 
The suggested\nframework's relevance and variety are evidenced by case studies in healthcare,\nfinance, agriculture, and autonomous systems. These provide a solid basis for\nthe equitable and dependable assessment of XAI methodologies. This paradigm\nenhances XAI research by offering a systematic, flexible, and pragmatic method\nto guarantee transparency and accountability in AI systems across many\nreal-world contexts.\n","authors":["Md. Ariful Islam","M. F. Mridha","Md Abrar Jahin","Nilanjan Dey"],"pdf_url":"https://arxiv.org/pdf/2412.03884v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03881v1","updated":"2024-12-05T05:29:19Z","published":"2024-12-05T05:29:19Z","title":"Weak-to-Strong Generalization Through the Data-Centric Lens","summary":" The weak-to-strong generalization phenomenon is the driver for important\nmachine learning applications including highly data-efficient learning and,\nmost recently, performing superalignment. While decades of research have\nresulted in numerous algorithms that produce strong empirical performance,\nunderstanding what aspects of data enable weak-to-strong generalization has\nbeen understudied. We propose a simple data-centric mechanism that\ncharacterizes weak-to-strong generalization: the overlap density. Intuitively,\ngeneralization tracks the number of points that contain overlaps, i.e., both\neasy patterns (learnable by a weak model) and challenging patterns (only\nlearnable by a stronger model), as with such points, weak predictions can be\nused to learn challenging patterns by stronger models. We provide a practical\noverlap detection algorithm to find such points in datasets and leverage them\nto learn, among multiple sources of data, which to query when seeking to\nmaximize overlap density and thereby enhance weak-to-strong generalization. We\npresent a theoretical result showing that the generalization benefit is a\nfunction of the overlap density and a regret bound for our data selection\nalgorithm. 
Empirically, we validate the mechanism and the overlap detection\nalgorithm on a wide array of settings.\n","authors":["Changho Shin","John Cooper","Frederic Sala"],"pdf_url":"https://arxiv.org/pdf/2412.03881v1.pdf","comment":"39 pages"},{"id":"http://arxiv.org/abs/2412.03877v1","updated":"2024-12-05T05:18:09Z","published":"2024-12-05T05:18:09Z","title":"AyutthayaAlpha: A Thai-Latin Script Transliteration Transformer","summary":" This study introduces AyutthayaAlpha, an advanced transformer-based machine\nlearning model designed for the transliteration of Thai proper names into Latin\nscript. Our system achieves state-of-the-art performance with 82.32%\nfirst-token accuracy and 95.24% first-three-token accuracy, while maintaining a\nlow character error rate of 0.0047. The complexity of Thai phonology, including\ntonal features and vowel length distinctions, presents significant challenges\nfor accurate transliteration, which we address through a novel two-model\napproach: AyutthayaAlpha-Small, based on the ByT5 architecture, and\nAyutthayaAlpha-VerySmall, a computationally efficient variant that unexpectedly\noutperforms its larger counterpart. Our research combines linguistic rules with\ndeep learning, training on a carefully curated dataset of 1.2 million\nThai-Latin name pairs, augmented through strategic upsampling to 2.7 million\nexamples. Extensive evaluations against existing transliteration methods and\nhuman expert benchmarks demonstrate that AyutthayaAlpha not only achieves\nsuperior accuracy but also effectively captures personal and cultural\npreferences in name romanization. The system's practical applications extend to\ncross-lingual information retrieval, international data standardization, and\nidentity verification systems, with particular relevance for government\ndatabases, academic institutions, and global business operations. 
This work\nrepresents a significant advance in bridging linguistic gaps between Thai and\nLatin scripts, while respecting the cultural and personal dimensions of name\ntransliteration.\n","authors":["Davor Lauc","Attapol Rutherford","Weerin Wongwarawipatr"],"pdf_url":"https://arxiv.org/pdf/2412.03877v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.01129v2","updated":"2024-12-05T05:05:01Z","published":"2024-12-02T05:09:56Z","title":"RILQ: Rank-Insensitive LoRA-based Quantization Error Compensation for\n Boosting 2-bit Large Language Model Accuracy","summary":" Low-rank adaptation (LoRA) has become the dominant method for\nparameter-efficient LLM fine-tuning, with LoRA-based quantization error\ncompensation (LQEC) emerging as a powerful tool for recovering accuracy in\ncompressed LLMs. However, LQEC has underperformed in sub-4-bit scenarios, with\nno prior investigation into understanding this limitation. We propose RILQ\n(Rank-Insensitive LoRA-based Quantization Error Compensation) to understand\nfundamental limitation and boost 2-bit LLM accuracy. Based on rank analysis\nrevealing model-wise activation discrepancy loss's rank-insensitive nature,\nRILQ employs this loss to adjust adapters cooperatively across layers, enabling\nrobust error compensation with low-rank adapters. Evaluations on LLaMA-2 and\nLLaMA-3 demonstrate RILQ's consistent improvements in 2-bit quantized inference\nacross various state-of-the-art quantizers and enhanced accuracy in\ntask-specific fine-tuning. 
RILQ maintains computational efficiency comparable\nto existing LoRA methods, enabling adapter-merged weight-quantized LLM\ninference with significantly enhanced accuracy, making it a promising approach\nfor boosting 2-bit LLM performance.\n","authors":["Geonho Lee","Janghwan Lee","Sukjin Hong","Minsoo Kim","Euijai Ahn","Du-Seong Chang","Jungwook Choi"],"pdf_url":"https://arxiv.org/pdf/2412.01129v2.pdf","comment":"The typo in Table 4 has been corrected"},{"id":"http://arxiv.org/abs/2412.03873v1","updated":"2024-12-05T05:04:29Z","published":"2024-12-05T05:04:29Z","title":"Fine-Grained Sentiment Analysis of Electric Vehicle User Reviews: A\n Bidirectional LSTM Approach to Capturing Emotional Intensity in Chinese Text","summary":" The rapid expansion of the electric vehicle (EV) industry has highlighted the\nimportance of user feedback in improving product design and charging\ninfrastructure. Traditional sentiment analysis methods often oversimplify the\ncomplexity of user emotions, limiting their effectiveness in capturing nuanced\nsentiments and emotional intensities. This study proposes a Bidirectional Long\nShort-Term Memory (Bi-LSTM) network-based sentiment scoring model to analyze\nuser reviews of EV charging infrastructure. By assigning sentiment scores\nranging from 0 to 5, the model provides a fine-grained understanding of\nemotional expression. Leveraging a dataset of 43,678 reviews from PC Auto, the\nstudy employs rigorous data cleaning and preprocessing, including tokenization\nand stop word removal, to optimize input for deep learning. The Bi-LSTM model\ndemonstrates significant improvements over traditional approaches like SnowNLP\nacross key evaluation metrics, including Mean Squared Error (MSE), Mean\nAbsolute Error (MAE), and Explained Variance Score (EVS). 
These results\nhighlight the model's superior capability to capture nuanced sentiment\ndynamics, offering valuable insights for targeted product and service\nenhancements in the EV ecosystem.\n","authors":["Shuhao Chen","Chengyi Tu"],"pdf_url":"https://arxiv.org/pdf/2412.03873v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.04767v2","updated":"2024-12-05T09:07:23Z","published":"2022-08-09T13:23:29Z","title":"Combining Stochastic Defenses to Resist Gradient Inversion: An Ablation\n Study","summary":" Gradient Inversion (GI) attacks are a ubiquitous threat in Federated Learning\n(FL) as they exploit gradient leakage to reconstruct supposedly private\ntraining data. Common defense mechanisms such as Differential Privacy (DP) or\nstochastic Privacy Modules (PMs) introduce randomness during gradient\ncomputation to prevent such attacks. However, we pose that if an attacker\neffectively mimics a client's stochastic gradient computation, the attacker can\ncircumvent the defense and reconstruct clients' private training data. This\npaper introduces several targeted GI attacks that leverage this principle to\nbypass common defense mechanisms. As a result, we demonstrate that no\nindividual defense provides sufficient privacy protection. To address this\nissue, we propose to combine multiple defenses. We conduct an extensive\nablation study to evaluate the influence of various combinations of defenses on\nprivacy protection and model utility. We observe that only the combination of\nDP and a stochastic PM was sufficient to decrease the Attack Success Rate (ASR)\nfrom 100% to 0%, thus preserving privacy. 
Moreover, we found that this\ncombination of defenses consistently achieves the best trade-off between\nprivacy and model utility.\n","authors":["Daniel Scheliga","Patrick Mäder","Marco Seeland"],"pdf_url":"https://arxiv.org/pdf/2208.04767v2.pdf","comment":"This version represents a comprehensive rework of the initial study,\n including substantial updates to the methodology, analysis, and conclusions.\n 26 pages, 2 figures, 5 tables"}],"Genomics":[{"id":"http://arxiv.org/abs/2407.16940v2","updated":"2024-12-05T13:30:16Z","published":"2024-07-24T02:20:29Z","title":"GV-Rep: A Large-Scale Dataset for Genetic Variant Representation\n Learning","summary":" Genetic variants (GVs) are defined as differences in the DNA sequences among\nindividuals and play a crucial role in diagnosing and treating genetic\ndiseases. The rapid decrease in next generation sequencing cost has led to an\nexponential increase in patient-level GV data. This growth poses a challenge\nfor clinicians who must efficiently prioritize patient-specific GVs and\nintegrate them with existing genomic databases to inform patient management. To\naddressing the interpretation of GVs, genomic foundation models (GFMs) have\nemerged. However, these models lack standardized performance assessments,\nleading to considerable variability in model evaluations. This poses the\nquestion: How effectively do deep learning methods classify unknown GVs and\nalign them with clinically-verified GVs? We argue that representation learning,\nwhich transforms raw data into meaningful feature spaces, is an effective\napproach for addressing both indexing and classification challenges. We\nintroduce a large-scale Genetic Variant dataset, named GV-Rep, featuring\nvariable-length contexts and detailed annotations, designed for deep learning\nmodels to learn GV representations across various traits, diseases, tissue\ntypes, and experimental contexts. 
Our contributions are three-fold: (i)\nConstruction of a comprehensive dataset with 7 million records, each labeled\nwith characteristics of the corresponding variants, alongside additional data\nfrom 17,548 gene knockout tests across 1,107 cell types, 1,808 variant\ncombinations, and 156 unique clinically verified GVs from real-world patients.\n(ii) Analysis of the structure and properties of the dataset. (iii)\nExperimentation of the dataset with pre-trained GFMs. The results show a\nsignificant gap between GFMs current capabilities and accurate GV\nrepresentation. We hope this dataset will help advance genomic deep learning to\nbridge this gap.\n","authors":["Zehui Li","Vallijah Subasri","Guy-Bart Stan","Yiren Zhao","Bo Wang"],"pdf_url":"https://arxiv.org/pdf/2407.16940v2.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2412.02882v2","updated":"2024-12-05T07:11:03Z","published":"2024-12-03T22:34:38Z","title":"iSEEtree: interactive explorer for hierarchical data","summary":" $\\textbf{Motivation:}$ Hierarchical data structures are prevalent across\nseveral fields of research, as they represent an organised and efficient\napproach to study complex interconnected systems. Their significance is\nparticularly evident in microbiome analysis, where microbial communities are\nclassified at various taxonomic levels along the phylogenetic tree. In light of\nthis trend, the R/Bioconductor community has established a reproducible\nanalytical framework for hierarchical data, which relies on the highly generic\nand optimised TreeSummarizedExperiment data container. 
However, using this\nframework requires basic proficiency in programming.\n $\\textbf{Results:}$ To reduce the entry requirements, we developed iSEEtree,\nan R shiny app which provides a visual interface for the analysis and\nexploration of TreeSummarizedExperiment objects, thereby expanding the\ninteractive graphics capabilities of related work to hierarchical structures.\nThis way, users can interactively explore several aspects of their data without\nthe need for extensive knowledge of R programming. We describe how iSEEtree\nenables the exploration of hierarchical multi-table data and demonstrate its\nfunctionality with applications to microbiome analysis.\n $\\textbf{Availability and Implementation:}$ iSEEtree was implemented in the R\nprogramming language and is available on Bioconductor at\nhttps://bioconductor.org/packages/iSEEtree under an Artistic 2.0 license.\n $\\textbf{Contact:}$ giulio.benedetti@utu.fi or leo.lahti@utu.fi.\n","authors":["Giulio Benedetti","Ely Seraidarian","Theotime Pralas","Akewak Jeba","Tuomas Borman","Leo Lahti"],"pdf_url":"https://arxiv.org/pdf/2412.02882v2.pdf","comment":"4 pages, 1 figure"},{"id":"http://arxiv.org/abs/2309.11087v6","updated":"2024-12-05T06:21:03Z","published":"2023-09-20T06:30:39Z","title":"Embed-Search-Align: DNA Sequence Alignment using Transformer Models","summary":" DNA sequence alignment involves assigning short DNA reads to the most\nprobable locations on an extensive reference genome. This process is crucial\nfor various genomic analyses, including variant calling, transcriptomics, and\nepigenomics. Conventional methods, refined over decades, tackle this challenge\nin 2 steps: genome indexing followed by efficient search to locate likely\npositions for given reads. 
Building on the success of Large Language Models in\nencoding text into embeddings, where the distance metric captures semantic\nsimilarity, recent efforts have explored whether the same Transformer\narchitecture can produce embeddings for DNA sequences. Such models have shown\nearly promise in classifying short DNA sequences, such as detecting\ncoding/non-coding regions, and enhancer, promoter sequences. However,\nperformance at sequence classification tasks does not translate to sequence\nalignment, where it is necessary to search across the genome to align each\nread, a significantly longer-range task. We bridge this gap by framing the\nSequence Alignment task for Transformer models as an \"Embed-Search-Align\" task.\nIn this framework, a novel Reference-Free DNA Embedding model generates\nembeddings of reads and reference fragments, which are projected into a shared\nvector space where the read-fragment distance is used as a surrogate for\nalignment. Technical contributions include: (1) Contrastive loss for\nself-supervised training of DNA sequence representations, facilitating rich\nreference-free, sequence-level embeddings, and (2) a DNA vector store to enable\nsearch across fragments on a global scale. DNA-ESA is 99% accurate when\naligning 250-length reads onto a human genome (3gb), rivaling conventional\nmethods such as Bowtie and BWA-Mem. DNA-ESA exceeds the performance of 6\nTransformer model baselines such as Nucleotide Transformer, Hyena-DNA, and\nshows task transfer across chromosomes and species.\n","authors":["Pavan Holur","K. C. Enevoldsen","Shreyas Rajesh","Lajoyce Mboning","Thalia Georgiou","Louis-S. 
Bouchard","Matteo Pellegrini","Vwani Roychowdhury"],"pdf_url":"https://arxiv.org/pdf/2309.11087v6.pdf","comment":"12 pages, Tables 7, Figures 6"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2412.04467v1","updated":"2024-12-05T18:59:53Z","published":"2024-12-05T18:59:53Z","title":"VisionZip: Longer is Better but Not Necessary in Vision Language Models","summary":" Recent advancements in vision-language models have enhanced performance by\nincreasing the length of visual tokens, making them much longer than text\ntokens and significantly raising computational costs. However, we observe that\nthe visual tokens generated by popular vision encoders, such as CLIP and\nSigLIP, contain significant redundancy. To address this, we introduce\nVisionZip, a simple yet effective method that selects a set of informative\ntokens for input to the language model, reducing visual token redundancy and\nimproving efficiency while maintaining model performance. The proposed\nVisionZip can be widely applied to image and video understanding tasks and is\nwell-suited for multi-turn dialogues in real-world scenarios, where previous\nmethods tend to underperform. Experimental results show that VisionZip\noutperforms the previous state-of-the-art method by at least 5% performance\ngains across nearly all settings. Moreover, our method significantly enhances\nmodel inference speed, improving the prefilling time by 8x and enabling the\nLLaVA-Next 13B model to infer faster than the LLaVA-Next 7B model while\nachieving better results. Furthermore, we analyze the causes of this redundancy\nand encourage the community to focus on extracting better visual features\nrather than merely increasing token length. 
Our code is available at\nhttps://github.com/dvlab-research/VisionZip .\n","authors":["Senqiao Yang","Yukang Chen","Zhuotao Tian","Chengyao Wang","Jingyao Li","Bei Yu","Jiaya Jia"],"pdf_url":"https://arxiv.org/pdf/2412.04467v1.pdf","comment":"2 columns, 28 pages, 15 figures, 18 tables"},{"id":"http://arxiv.org/abs/2412.04455v1","updated":"2024-12-05T18:58:27Z","published":"2024-12-05T18:58:27Z","title":"Code-as-Monitor: Constraint-aware Visual Programming for Reactive and\n Proactive Robotic Failure Detection","summary":" Automatic detection and prevention of open-set failures are crucial in\nclosed-loop robotic systems. Recent studies often struggle to simultaneously\nidentify unexpected failures reactively after they occur and prevent\nforeseeable ones proactively. To this end, we propose Code-as-Monitor (CaM), a\nnovel paradigm leveraging the vision-language model (VLM) for both open-set\nreactive and proactive failure detection. The core of our method is to\nformulate both tasks as a unified set of spatio-temporal constraint\nsatisfaction problems and use VLM-generated code to evaluate them for real-time\nmonitoring. To enhance the accuracy and efficiency of monitoring, we further\nintroduce constraint elements that abstract constraint-related entities or\ntheir parts into compact geometric elements. This approach offers greater\ngenerality, simplifies tracking, and facilitates constraint-aware visual\nprogramming by leveraging these elements as visual prompts. Experiments show\nthat CaM achieves a 28.7% higher success rate and reduces execution time by\n31.8% under severe disturbances compared to baselines across three simulators\nand a real-world setting. 
Moreover, CaM can be integrated with open-loop\ncontrol policies to form closed-loop systems, enabling long-horizon tasks in\ncluttered scenes with dynamic environments.\n","authors":["Enshen Zhou","Qi Su","Cheng Chi","Zhizheng Zhang","Zhongyuan Wang","Tiejun Huang","Lu Sheng","He Wang"],"pdf_url":"https://arxiv.org/pdf/2412.04455v1.pdf","comment":"Project page: https://zhoues.github.io/Code-as-Monitor/"},{"id":"http://arxiv.org/abs/2412.04445v1","updated":"2024-12-05T18:57:04Z","published":"2024-12-05T18:57:04Z","title":"Moto: Latent Motion Token as the Bridging Language for Robot\n Manipulation","summary":" Recent developments in Large Language Models pre-trained on extensive corpora\nhave shown significant success in various natural language processing tasks\nwith minimal fine-tuning. This success offers new promise for robotics, which\nhas long been constrained by the high cost of action-labeled data. We ask:\ngiven the abundant video data containing interaction-related knowledge\navailable as a rich \"corpus\", can a similar generative pre-training approach be\neffectively applied to enhance robot learning? The key challenge is to identify\nan effective representation for autoregressive pre-training that benefits robot\nmanipulation tasks. Inspired by the way humans learn new skills through\nobserving dynamic environments, we propose that effective robotic learning\nshould emphasize motion-related knowledge, which is closely tied to low-level\nactions and is hardware-agnostic, facilitating the transfer of learned motions\nto actual robot actions. To this end, we introduce Moto, which converts video\ncontent into latent Motion Token sequences by a Latent Motion Tokenizer,\nlearning a bridging \"language\" of motion from videos in an unsupervised manner.\nWe pre-train Moto-GPT through motion token autoregression, enabling it to\ncapture diverse visual motion knowledge. 
After pre-training, Moto-GPT\ndemonstrates the promising ability to produce semantically interpretable motion\ntokens, predict plausible motion trajectories, and assess trajectory\nrationality through output likelihood. To transfer learned motion priors to\nreal robot actions, we implement a co-fine-tuning strategy that seamlessly\nbridges latent motion token prediction and real robot control. Extensive\nexperiments show that the fine-tuned Moto-GPT exhibits superior robustness and\nefficiency on robot manipulation benchmarks, underscoring its effectiveness in\ntransferring knowledge from video data to downstream visual manipulation tasks.\n","authors":["Yi Chen","Yuying Ge","Yizhuo Li","Yixiao Ge","Mingyu Ding","Ying Shan","Xihui Liu"],"pdf_url":"https://arxiv.org/pdf/2412.04445v1.pdf","comment":"Project released at: https://chenyi99.github.io/moto/"},{"id":"http://arxiv.org/abs/2409.03669v2","updated":"2024-12-05T18:56:04Z","published":"2024-09-05T16:23:07Z","title":"A method to benchmark high-dimensional process drift detection","summary":" Process curves are multivariate finite time series data coming from\nmanufacturing processes. This paper studies machine learning that detect drifts\nin process curve datasets. A theoretic framework to synthetically generate\nprocess curves in a controlled way is introduced in order to benchmark machine\nlearning algorithms for process drift detection. 
An evaluation score, called\nthe temporal area under the curve, is introduced, which allows to quantify how\nwell machine learning models unveil curves belonging to drift segments.\nFinally, a benchmark study comparing popular machine learning approaches on\nsynthetic data generated with the introduced framework is presented that shows\nthat existing algorithms often struggle with datasets containing multiple drift\nsegments.\n","authors":["Edgar Wolf","Tobias Windisch"],"pdf_url":"https://arxiv.org/pdf/2409.03669v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17728v3","updated":"2024-12-05T18:55:44Z","published":"2024-03-26T14:17:01Z","title":"Masked Autoencoders are PDE Learners","summary":" Neural solvers for partial differential equations (PDEs) have great potential\nto generate fast and accurate physics solutions, yet their practicality is\ncurrently limited by their generalizability. PDEs evolve over broad scales and\nexhibit diverse behaviors; predicting these phenomena will require learning\nrepresentations across a wide variety of inputs which may encompass different\ncoefficients, boundary conditions, resolutions, or even equations. As a step\ntowards generalizable PDE modeling, we adapt masked pretraining for physics\nproblems. Through self-supervised learning across PDEs, masked autoencoders can\nconsolidate heterogeneous physics to learn rich latent representations. We show\nthat learned representations can generalize to a limited set of unseen\nequations or parameters and are meaningful enough to regress PDE coefficients\nor the classify PDE features. Furthermore, conditioning neural solvers on\nlearned latent representations can improve time-stepping and super-resolution\nperformance across a variety of coefficients, discretizations, or boundary\nconditions, as well as on certain unseen PDEs. 
We hope that masked pretraining\ncan emerge as a unifying method across large, unlabeled, and heterogeneous\ndatasets to learn latent physics at scale.\n","authors":["Anthony Zhou","Amir Barati Farimani"],"pdf_url":"https://arxiv.org/pdf/2403.17728v3.pdf","comment":"29 pages, 9 figures"},{"id":"http://arxiv.org/abs/2412.04429v1","updated":"2024-12-05T18:52:00Z","published":"2024-12-05T18:52:00Z","title":"Grounding Descriptions in Images informs Zero-Shot Visual Recognition","summary":" Vision-language models (VLMs) like CLIP have been cherished for their ability\nto perform zero-shot visual recognition on open-vocabulary concepts. This is\nachieved by selecting the object category whose textual representation bears\nthe highest similarity with the query image. While successful in some domains,\nthis method struggles with identifying fine-grained entities as well as\ngeneralizing to unseen concepts that are not captured by the training\ndistribution. Recent works attempt to mitigate these challenges by integrating\ncategory descriptions at test time, albeit yielding modest improvements. We\nattribute these limited gains to a fundamental misalignment between image and\ndescription representations, which is rooted in the pretraining structure of\nCLIP. In this paper, we propose GRAIN, a new pretraining strategy aimed at\naligning representations at both fine and coarse levels simultaneously. Our\napproach learns to jointly ground textual descriptions in image regions along\nwith aligning overarching captions with global image representations. To drive\nthis pre-training, we leverage frozen Multimodal Large Language Models (MLLMs)\nto derive large-scale synthetic annotations. We demonstrate the enhanced\nzero-shot performance of our model compared to current state-of-the art methods\nacross 11 diverse image classification datasets. 
Additionally, we introduce\nProducts-2023, a newly curated, manually labeled dataset featuring novel\nconcepts, and showcase our model's ability to recognize these concepts by\nbenchmarking on it. Significant improvements achieved by our model on other\ndownstream tasks like retrieval further highlight the superior quality of\nrepresentations learned by our approach. Code available at\nhttps://github.com/shaunak27/grain-clip .\n","authors":["Shaunak Halbe","Junjiao Tian","K J Joseph","James Seale Smith","Katherine Stevo","Vineeth N Balasubramanian","Zsolt Kira"],"pdf_url":"https://arxiv.org/pdf/2412.04429v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04426v1","updated":"2024-12-05T18:51:18Z","published":"2024-12-05T18:51:18Z","title":"Marvel: Accelerating Safe Online Reinforcement Learning with Finetuned\n Offline Policy","summary":" The high costs and risks involved in extensive environment interactions\nhinder the practical application of current online safe reinforcement learning\n(RL) methods. While offline safe RL addresses this by learning policies from\nstatic datasets, the performance therein is usually limited due to reliance on\ndata quality and challenges with out-of-distribution (OOD) actions. Inspired by\nrecent successes in offline-to-online (O2O) RL, it is crucial to explore\nwhether offline safe RL can be leveraged to facilitate faster and safer online\npolicy learning, a direction that has yet to be fully investigated. To fill\nthis gap, we first demonstrate that naively applying existing O2O algorithms\nfrom standard RL would not work well in the safe RL setting due to two unique\nchallenges: \\emph{erroneous Q-estimations}, resulted from offline-online\nobjective mismatch and offline cost sparsity, and \\emph{Lagrangian mismatch},\nresulted from difficulties in aligning Lagrange multipliers between offline and\nonline policies. 
To address these challenges, we introduce \\textbf{Marvel}, a\nnovel framework for O2O safe RL, comprising two key components that work in\nconcert: \\emph{Value Pre-Alignment} to align the Q-functions with the\nunderlying truth before online learning, and \\emph{Adaptive PID Control} to\neffectively adjust the Lagrange multipliers during online finetuning. Extensive\nexperiments demonstrate that Marvel significantly outperforms existing\nbaselines in both reward maximization and safety constraint satisfaction. By\nintroducing the first policy-finetuning based framework for O2O safe RL, which\nis compatible with many offline and online safe RL methods, our work has the\ngreat potential to advance the field towards more efficient and practical safe\nRL solutions.\n","authors":["Keru Chen","Honghao Wei","Zhigang Deng","Sen Lin"],"pdf_url":"https://arxiv.org/pdf/2412.04426v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04425v1","updated":"2024-12-05T18:51:10Z","published":"2024-12-05T18:51:10Z","title":"CA-SSLR: Condition-Aware Self-Supervised Learning Representation for\n Generalized Speech Processing","summary":" We introduce Condition-Aware Self-Supervised Learning Representation\n(CA-SSLR), a generalist conditioning model broadly applicable to various\nspeech-processing tasks. Compared to standard fine-tuning methods that optimize\nfor downstream models, CA-SSLR integrates language and speaker embeddings from\nearlier layers, making the SSL model aware of the current language and speaker\ncontext. This approach reduces the reliance on input audio features while\npreserving the integrity of the base SSLR. CA-SSLR improves the model's\ncapabilities and demonstrates its generality on unseen tasks with minimal\ntask-specific tuning. Our method employs linear modulation to dynamically\nadjust internal representations, enabling fine-grained adaptability without\nsignificantly altering the original model behavior. 
Experiments show that\nCA-SSLR reduces the number of trainable parameters, mitigates overfitting, and\nexcels in under-resourced and unseen tasks. Specifically, CA-SSLR achieves a\n10% relative reduction in LID errors, a 37% improvement in ASR CER on the\nML-SUPERB benchmark, and a 27% decrease in SV EER on VoxCeleb-1, demonstrating\nits effectiveness.\n","authors":["Yen-Ju Lu","Jing Liu","Thomas Thebaud","Laureano Moro-Velazquez","Ariya Rastrow","Najim Dehak","Jesus Villalba"],"pdf_url":"https://arxiv.org/pdf/2412.04425v1.pdf","comment":"38th Conference on Neural Information Processing Systems (NeurIPS\n 2024)"},{"id":"http://arxiv.org/abs/2403.07384v2","updated":"2024-12-05T18:47:47Z","published":"2024-03-12T07:45:33Z","title":"SmallToLarge (S2L): Scalable Data Selection for Fine-tuning Large\n Language Models by Summarizing Training Trajectories of Small Models","summary":" Despite the effectiveness of data selection for large language models (LLMs)\nduring pretraining and instruction fine-tuning phases, improving data\nefficiency in supervised fine-tuning (SFT) for specialized domains poses\nsignificant challenges due to the complexity of fine-tuning data. To bridge\nthis gap, we introduce an effective and scalable data selection method for SFT,\nSmallToLarge (S2L), which leverages training trajectories from small models to\nguide the data selection for larger models. We demonstrate through extensive\nexperiments that S2L significantly improves data efficiency in SFT for\nmathematical problem-solving, reducing the training data to just 11% of the\noriginal MathInstruct dataset (Yue et al., 2023) to match full dataset\nperformance while outperforming state-of-the-art data selection algorithms by\nan average of 4.7% across 6 in- and out-domain evaluation datasets. Remarkably,\nselecting only 50K data for SFT, S2L achieves a 32.7% accuracy on the most\nchallenging MATH (Hendrycks et al., 2021) benchmark, improving Phi-2 (Li et\nal., 2023b) by 16.6%. 
In clinical text summarization on the MIMIC-III dataset\n(Johnson et al., 2016), S2L again outperforms training on the full dataset\nusing only 50% of the data. Notably, S2L can perform data selection using a\nreference model 40x smaller than the target model, proportionally reducing the\ncost of data selection.\n","authors":["Yu Yang","Siddhartha Mishra","Jeffrey N Chiang","Baharan Mirzasoleiman"],"pdf_url":"https://arxiv.org/pdf/2403.07384v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.01339v2","updated":"2024-12-05T18:43:25Z","published":"2024-12-02T10:06:57Z","title":"Negative Token Merging: Image-based Adversarial Feature Guidance","summary":" Text-based adversarial guidance using a negative prompt has emerged as a\nwidely adopted approach to steer diffusion models away from producing undesired\nconcepts. While useful, performing adversarial guidance using text alone can be\ninsufficient to capture complex visual concepts or avoid specific visual\nelements like copyrighted characters. In this paper, for the first time we\nexplore an alternate modality in this direction by performing adversarial\nguidance directly using visual features from a reference image or other images\nin a batch. We introduce negative token merging (NegToMe), a simple but\neffective training-free approach which performs adversarial guidance through\nimages by selectively pushing apart matching visual features between reference\nand generated images during the reverse diffusion process. By simply adjusting\nthe used reference, NegToMe enables a diverse range of applications. Notably,\nwhen using other images in same batch as reference, we find that NegToMe\nsignificantly enhances output diversity (e.g., racial, gender, visual) by\nguiding features of each image away from others. Similarly, when used w.r.t.\ncopyrighted reference images, NegToMe reduces visual similarity to copyrighted\ncontent by 34.57%. 
NegToMe is simple to implement using just few-lines of code,\nuses only marginally higher (<4%) inference time and is compatible with\ndifferent diffusion architectures, including those like Flux, which don't\nnatively support the use of a negative prompt. Code is available at\nhttps://negtome.github.io\n","authors":["Jaskirat Singh","Lindsey Li","Weijia Shi","Ranjay Krishna","Yejin Choi","Pang Wei Koh","Michael F. Cohen","Stephen Gould","Liang Zheng","Luke Zettlemoyer"],"pdf_url":"https://arxiv.org/pdf/2412.01339v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04416v1","updated":"2024-12-05T18:42:29Z","published":"2024-12-05T18:42:29Z","title":"FedDUAL: A Dual-Strategy with Adaptive Loss and Dynamic Aggregation for\n Mitigating Data Heterogeneity in Federated Learning","summary":" Federated Learning (FL) marks a transformative approach to distributed model\ntraining by combining locally optimized models from various clients into a\nunified global model. While FL preserves data privacy by eliminating\ncentralized storage, it encounters significant challenges such as performance\ndegradation, slower convergence, and reduced robustness of the global model due\nto the heterogeneity in client data distributions. Among the various forms of\ndata heterogeneity, label skew emerges as a particularly formidable and\nprevalent issue, especially in domains such as image classification. To address\nthese challenges, we begin with comprehensive experiments to pinpoint the\nunderlying issues in the FL training process. Based on our findings, we then\nintroduce an innovative dual-strategy approach designed to effectively resolve\nthese issues. First, we introduce an adaptive loss function for client-side\ntraining, meticulously crafted to preserve previously acquired knowledge while\nmaintaining an optimal equilibrium between local optimization and global model\ncoherence. Secondly, we develop a dynamic aggregation strategy for aggregating\nclient models at the server. 
This approach adapts to each client's unique\nlearning patterns, effectively addressing the challenges of diverse data across\nthe network. Our comprehensive evaluation, conducted across three diverse\nreal-world datasets, coupled with theoretical convergence guarantees,\ndemonstrates the superior efficacy of our method compared to several\nestablished state-of-the-art approaches.\n","authors":["Pranab Sahoo","Ashutosh Tripathi","Sriparna Saha","Samrat Mondal"],"pdf_url":"https://arxiv.org/pdf/2412.04416v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12924v3","updated":"2024-12-05T18:35:26Z","published":"2024-09-04T03:17:19Z","title":"WaveletGPT: Wavelets Meet Large Language Models","summary":" Large Language Models (LLMs) have ushered in a new wave of artificial\nintelligence advancements impacting every scientific field and discipline. They\nare trained on a simple objective: to predict the next token given the previous\ncontext. We live in a world where most of the data around us, e.g., text,\naudio, and music, has a multi-scale structure associated with it. This paper\ninfuses LLMs with traditional signal processing ideas, namely wavelets, during\npre-training to take advantage of the structure. Without adding \\textbf{any\nextra parameters} to a GPT-style LLM architecture, we achieve the same\npre-training performance almost twice as fast in text, raw audio, and symbolic\nmusic. This is achieved by imposing a structure on intermediate embeddings.\nWhen trained for the same number of training steps, we achieve significant\ngains in performance, which is comparable to pre-training a larger neural\narchitecture. Our architecture allows every next token prediction access to\nintermediate embeddings at different temporal resolutions in every Transformer\ndecoder block. This work will hopefully pave the way for incorporating\nmulti-rate signal processing ideas into traditional LLM pre-training. 
Further,\nwe showcase pushing model performance by improving internal structure instead\nof just going after scale.\n","authors":["Prateek Verma"],"pdf_url":"https://arxiv.org/pdf/2409.12924v3.pdf","comment":"16 pages, 4 figures"},{"id":"http://arxiv.org/abs/2412.04413v1","updated":"2024-12-05T18:33:59Z","published":"2024-12-05T18:33:59Z","title":"Efficient Task Grouping Through Samplewise Optimisation Landscape\n Analysis","summary":" Shared training approaches, such as multi-task learning (MTL) and\ngradient-based meta-learning, are widely used in various machine learning\napplications, but they often suffer from negative transfer, leading to\nperformance degradation in specific tasks. While several optimisation\ntechniques have been developed to mitigate this issue for pre-selected task\ncohorts, identifying optimal task combinations for joint learning - known as\ntask grouping - remains underexplored and computationally challenging due to\nthe exponential growth in task combinations and the need for extensive training\nand evaluation cycles. This paper introduces an efficient task grouping\nframework designed to reduce these overwhelming computational demands of the\nexisting methods. The proposed framework infers pairwise task similarities\nthrough a sample-wise optimisation landscape analysis, eliminating the need for\nthe shared model training required to infer task similarities in existing\nmethods. With task similarities acquired, a graph-based clustering algorithm is\nemployed to pinpoint near-optimal task groups, providing an approximate yet\nefficient and effective solution to the originally NP-hard problem. Empirical\nassessments conducted on 8 different datasets highlight the effectiveness of\nthe proposed framework, revealing a five-fold speed enhancement compared to\nprevious state-of-the-art methods. 
Moreover, the framework consistently\ndemonstrates comparable performance, confirming its remarkable efficiency and\neffectiveness in task grouping.\n","authors":["Anshul Thakur","Yichen Huang","Soheila Molaei","Yujiang Wang","David A. Clifton"],"pdf_url":"https://arxiv.org/pdf/2412.04413v1.pdf","comment":"Under review at IEEE Transactions on Pattern Analysis and Machine\n Intelligence"},{"id":"http://arxiv.org/abs/2412.04409v1","updated":"2024-12-05T18:31:14Z","published":"2024-12-05T18:31:14Z","title":"Stabilizing and Solving Inverse Problems using Data and Machine Learning","summary":" We consider an inverse problem involving the reconstruction of the solution\nto a nonlinear partial differential equation (PDE) with unknown boundary\nconditions. Instead of direct boundary data, we are provided with a large\ndataset of boundary observations for typical solutions (collective data) and a\nbulk measurement of a specific realization. To leverage this collective data,\nwe first compress the boundary data using proper orthogonal decomposition (POD)\nin a linear expansion. Next, we identify a possible nonlinear low-dimensional\nstructure in the expansion coefficients using an auto-encoder, which provides a\nparametrization of the dataset in a lower-dimensional latent space. We then\ntrain a neural network to map the latent variables representing the boundary\ndata to the solution of the PDE. Finally, we solve the inverse problem by\noptimizing a data-fitting term over the latent space.\n We analyze the underlying stabilized finite element method in the linear\nsetting and establish optimal error estimates in the $H^1$ and $L^2$-norms. The\nnonlinear problem is then studied numerically, demonstrating the effectiveness\nof our approach.\n","authors":["Erik Burman","Mats G. 
Larson","Karl Larsson","Carl Lundholm"],"pdf_url":"https://arxiv.org/pdf/2412.04409v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04408v1","updated":"2024-12-05T18:27:09Z","published":"2024-12-05T18:27:09Z","title":"Providing Differential Privacy for Federated Learning Over Wireless: A\n Cross-layer Framework","summary":" Federated Learning (FL) is a distributed machine learning framework that\ninherently allows edge devices to maintain their local training data, thus\nproviding some level of privacy. However, FL's model updates still pose a risk\nof privacy leakage, which must be mitigated. Over-the-air FL (OTA-FL) is an\nadapted FL design for wireless edge networks that leverages the natural\nsuperposition property of the wireless medium. We propose a wireless physical\nlayer (PHY) design for OTA-FL which improves differential privacy (DP) through\na decentralized, dynamic power control that utilizes both inherent Gaussian\nnoise in the wireless channel and a cooperative jammer (CJ) for additional\nartificial noise generation when higher privacy levels are required. Although\nprimarily implemented within the Upcycled-FL framework, where a\nresource-efficient method with first-order approximations is used at every even\niteration to decrease the required information from clients, our power control\nstrategy is applicable to any FL framework, including FedAvg and FedProx as\nshown in the paper. This adaptation showcases the flexibility and effectiveness\nof our design across different learning algorithms while maintaining a strong\nemphasis on privacy. Our design removes the need for client-side artificial\nnoise injection for DP, utilizing a cooperative jammer to enhance privacy\nwithout affecting transmission efficiency for higher privacy demands. Privacy\nanalysis is provided using the Moments Accountant method. 
We perform a\nconvergence analysis for non-convex objectives to tackle heterogeneous data\ndistributions, highlighting the inherent trade-offs between privacy and\naccuracy. Numerical results show that our approach with various FL algorithms\noutperforms the state-of-the-art under the same DP conditions on the non-i.i.d.\nFEMNIST dataset, and highlight the cooperative jammer's effectiveness in\nensuring strict privacy.\n","authors":["Jiayu Mao","Tongxin Yin","Aylin Yener","Mingyan Liu"],"pdf_url":"https://arxiv.org/pdf/2412.04408v1.pdf","comment":"submitted for an IEEE publication"},{"id":"http://arxiv.org/abs/2412.04404v1","updated":"2024-12-05T18:23:44Z","published":"2024-12-05T18:23:44Z","title":"Federated Automated Feature Engineering","summary":" Automated feature engineering (AutoFE) is used to automatically create new\nfeatures from original features to improve predictive performance without\nneeding significant human intervention and expertise. Many algorithms exist for\nAutoFE, but very few approaches exist for the federated learning (FL) setting\nwhere data is gathered across many clients and is not shared between clients or\na central server. We introduce AutoFE algorithms for the horizontal, vertical,\nand hybrid FL settings, which differ in how the data is gathered across\nclients. 
To the best of our knowledge, we are the first to develop AutoFE\nalgorithms for the horizontal and hybrid FL cases, and we show that the\ndownstream model performance of federated AutoFE is similar to the case where\ndata is held centrally and AutoFE is performed centrally.\n","authors":["Tom Overman","Diego Klabjan"],"pdf_url":"https://arxiv.org/pdf/2412.04404v1.pdf","comment":"Preliminary Work"},{"id":"http://arxiv.org/abs/2311.10162v3","updated":"2024-12-05T18:16:10Z","published":"2023-11-16T19:34:18Z","title":"Learning to Reconstruct Accelerated MRI Through K-space Cold Diffusion\n without Noise","summary":" Deep learning-based MRI reconstruction models have achieved superior\nperformance these days. Most recently, diffusion models have shown remarkable\nperformance in image generation, in-painting, super-resolution, image editing\nand more. As a generalized diffusion model, cold diffusion further broadens the\nscope and considers models built around arbitrary image transformations such as\nblurring, down-sampling, etc. In this paper, we propose a k-space cold\ndiffusion model that performs image degradation and restoration in k-space\nwithout the need for Gaussian noise. We provide comparisons with multiple deep\nlearning-based MRI reconstruction models and perform tests on a well-known\nlarge open-source MRI dataset. Our results show that this novel way of\nperforming degradation can generate high-quality reconstruction images for\naccelerated MRI.\n","authors":["Guoyao Shen","Mengyu Li","Chad W. Farris","Stephan Anderson","Xin Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.10162v3.pdf","comment":"21 pages, 5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2308.10968v2","updated":"2024-12-05T18:07:33Z","published":"2023-08-21T18:26:35Z","title":"Regularization by Neural Style Transfer for MRI Field-Transfer\n Reconstruction with Limited Data","summary":" Recent advances in MRI reconstruction have achieved remarkable success with\ndeep learning-based models. 
However, most methods depend on large-scale,\ntask-specific datasets, leaving reconstruction in data-limited settings as a\ncritical but underexplored challenge. Regularization by denoising (RED) is a\ngeneral pipeline that incorporates a denoiser as a prior for image\nreconstruction, showing promising results in various image processing tasks,\nincluding denoising, deblurring, and super-resolution. In this work, we propose\na regularization by neural style transfer (RNST) method to further leverage the\npriors from the neural transfer and denoising engine. RNST effectively\nreconstructs high-quality images from noisy, low-quality inputs across varying\nimage styles, even with limited data. We validate RNST on clinical MRI scans,\ndemonstrating its ability to significantly improve image quality. These\nfindings underline the potential of RNST for MRI field-transfer reconstruction\nand its promise in addressing reconstruction tasks in data-constrained\nscenarios.\n","authors":["Guoyao Shen","Yancheng Zhu","Mengyu Li","Ryan McNaughton","Hernan Jara","Sean B. Andersson","Chad W. Farris","Stephan Anderson","Xin Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.10968v2.pdf","comment":"31 pages, 9 figures, 3 tables, 1 algorithm chart"},{"id":"http://arxiv.org/abs/2412.04392v1","updated":"2024-12-05T18:06:09Z","published":"2024-12-05T18:06:09Z","title":"Asynchronous Batch Bayesian Optimization with Pipelining Evaluations for\n Experimental Resource$\\unicode{x2013}$constrained Conditions","summary":" Bayesian optimization is efficient even with a small amount of data and is\nused in engineering and in science, including biology and chemistry. In\nBayesian optimization, a parameterized model with an uncertainty is fitted to\nexplain the experimental data, and then the model suggests parameters that\nwould most likely improve the results. Batch Bayesian optimization reduces the\nprocessing time of optimization by parallelizing experiments. 
However, batch\nBayesian optimization cannot be applied if the number of parallelized\nexperiments is limited by the cost or scarcity of equipment; in such cases,\nsequential methods require an unrealistic amount of time. In this study, we\ndeveloped pipelining Bayesian optimization (PipeBO) to reduce the processing\ntime of optimization even with a limited number of parallel experiments. PipeBO\nwas inspired by the pipelining of central processing unit architecture, which\ndivides computational tasks into multiple processes. PipeBO was designed to\nachieve experiment parallelization by overlapping various processes of the\nexperiments. PipeBO uses the results of completed experiments to update the\nparameters of running parallelized experiments. Using the Black-Box\nOptimization Benchmarking, which consists of 24 benchmark functions, we\ncompared PipeBO with the sequential Bayesian optimization methods. PipeBO\nreduced the average processing time of optimization to about 56% for the\nexperiments that consisted of two processes or even less for those with more\nprocesses for 20 out of the 24 functions. Overall, PipeBO parallelizes Bayesian\noptimization in the resource-constrained settings so that efficient\noptimization can be achieved.\n","authors":["Yujin Taguchi","Yusuke Shibuya","Yusuke Hiki","Takashi Morikura","Takahiro G. Yamada","Akira Funahashi"],"pdf_url":"https://arxiv.org/pdf/2412.04392v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04384v1","updated":"2024-12-05T17:59:58Z","published":"2024-12-05T17:59:58Z","title":"Probabilistic Gaussian Superposition for Efficient 3D Occupancy\n Prediction","summary":" 3D semantic occupancy prediction is an important task for robust\nvision-centric autonomous driving, which predicts fine-grained geometry and\nsemantics of the surrounding scene. Most existing methods leverage dense\ngrid-based scene representations, overlooking the spatial sparsity of the\ndriving scenes. 
Although 3D semantic Gaussian serves as an object-centric\nsparse alternative, most of the Gaussians still describe the empty region with\nlow efficiency. To address this, we propose a probabilistic Gaussian\nsuperposition model which interprets each Gaussian as a probability\ndistribution of its neighborhood being occupied and conforms to probabilistic\nmultiplication to derive the overall geometry. Furthermore, we adopt the exact\nGaussian mixture model for semantics calculation to avoid unnecessary\noverlapping of Gaussians. To effectively initialize Gaussians in non-empty\nregion, we design a distribution-based initialization module which learns the\npixel-aligned occupancy distribution instead of the depth of surfaces. We\nconduct extensive experiments on nuScenes and KITTI-360 datasets and our\nGaussianFormer-2 achieves state-of-the-art performance with high efficiency.\nCode: https://github.com/huang-yh/GaussianFormer.\n","authors":["Yuanhui Huang","Amonnut Thammatadatrakoon","Wenzhao Zheng","Yunpeng Zhang","Dalong Du","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2412.04384v1.pdf","comment":"Code is available at: https://github.com/huang-yh/GaussianFormer"},{"id":"http://arxiv.org/abs/2412.04380v1","updated":"2024-12-05T17:57:09Z","published":"2024-12-05T17:57:09Z","title":"EmbodiedOcc: Embodied 3D Occupancy Prediction for Vision-based Online\n Scene Understanding","summary":" 3D occupancy prediction provides a comprehensive description of the\nsurrounding scenes and has become an essential task for 3D perception. Most\nexisting methods focus on offline perception from one or a few views and cannot\nbe applied to embodied agents which demands to gradually perceive the scene\nthrough progressive embodied exploration. In this paper, we formulate an\nembodied 3D occupancy prediction task to target this practical scenario and\npropose a Gaussian-based EmbodiedOcc framework to accomplish it. 
We initialize\nthe global scene with uniform 3D semantic Gaussians and progressively update\nlocal regions observed by the embodied agent. For each update, we extract\nsemantic and structural features from the observed image and efficiently\nincorporate them via deformable cross-attention to refine the regional\nGaussians. Finally, we employ Gaussian-to-voxel splatting to obtain the global\n3D occupancy from the updated 3D Gaussians. Our EmbodiedOcc assumes an unknown\n(i.e., uniformly distributed) environment and maintains an explicit global\nmemory of it with 3D Gaussians. It gradually gains knowledge through local\nrefinement of regional Gaussians, which is consistent with how humans\nunderstand new scenes through embodied exploration. We reorganize an\nEmbodiedOcc-ScanNet benchmark based on local annotations to facilitate the\nevaluation of the embodied 3D occupancy prediction task. Experiments\ndemonstrate that our EmbodiedOcc outperforms existing local prediction methods\nand accomplishes the embodied occupancy prediction with high accuracy and\nstrong expandability. Our code is available at:\nhttps://github.com/YkiWu/EmbodiedOcc.\n","authors":["Yuqi Wu","Wenzhao Zheng","Sicheng Zuo","Yuanhui Huang","Jie Zhou","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2412.04380v1.pdf","comment":"Code: https://github.com/YkiWu/EmbodiedOcc"},{"id":"http://arxiv.org/abs/2412.04377v1","updated":"2024-12-05T17:52:35Z","published":"2024-12-05T17:52:35Z","title":"A Hitchhiker's Guide to Understanding Performances of Two-Class\n Classifiers","summary":" Properly understanding the performances of classifiers is essential in\nvarious scenarios. However, the literature often relies only on one or two\nstandard scores to compare classifiers, which fails to capture the nuances of\napplication-specific requirements, potentially leading to suboptimal classifier\nselection. 
Recently, a paper on the foundations of the theory of\nperformance-based ranking introduced a tool, called the Tile, that organizes an\ninfinity of ranking scores into a 2D map. Thanks to the Tile, it is now\npossible to evaluate and compare classifiers efficiently, displaying all\npossible application-specific preferences instead of having to rely on a pair\nof scores. In this paper, we provide a first hitchhiker's guide for\nunderstanding the performances of two-class classifiers by presenting four\nscenarios, each showcasing a different user profile: a theoretical analyst, a\nmethod designer, a benchmarker, and an application developer. Particularly, we\nshow that we can provide different interpretative flavors that are adapted to\nthe user's needs by mapping different values on the Tile. As an illustration,\nwe leverage the newly introduced Tile tool and the different flavors to rank\nand analyze the performances of 74 state-of-the-art semantic segmentation\nmodels in two-class classification through the eyes of the four user profiles.\nThrough these user profiles, we demonstrate that the Tile effectively captures\nthe behavior of classifiers in a single visualization, while accommodating an\ninfinite number of ranking scores.\n","authors":["Anaïs Halin","Sébastien Piérard","Anthony Cioppa","Marc Van Droogenbroeck"],"pdf_url":"https://arxiv.org/pdf/2412.04377v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.11224v2","updated":"2024-12-05T17:44:09Z","published":"2024-11-18T01:27:44Z","title":"Don't Be So Positive: Negative Step Sizes in Second-Order Methods","summary":" The value of second-order methods lies in the use of curvature information.\nYet, this information is costly to extract and once obtained, valuable negative\ncurvature information is often discarded so that the method is globally\nconvergent. This limits the effectiveness of second-order methods in modern\nmachine learning. 
In this paper, we show that second-order and\nsecond-order-like methods are promising optimizers for neural networks provided\nthat we add one ingredient: negative step sizes. We show that under very\ngeneral conditions, methods that produce ascent directions are globally\nconvergent when combined with a Wolfe line search that allows both positive and\nnegative step sizes. We experimentally demonstrate that using negative step\nsizes is often more effective than common Hessian modification methods.\n","authors":["Betty Shea","Mark Schmidt"],"pdf_url":"https://arxiv.org/pdf/2411.11224v2.pdf","comment":"added affiliation and more references"},{"id":"http://arxiv.org/abs/2412.04368v1","updated":"2024-12-05T17:36:22Z","published":"2024-12-05T17:36:22Z","title":"Finer Behavioral Foundation Models via Auto-Regressive Features and\n Advantage Weighting","summary":" The forward-backward representation (FB) is a recently proposed framework\n(Touati et al., 2023; Touati & Ollivier, 2021) to train behavior foundation\nmodels (BFMs) that aim at providing zero-shot efficient policies for any new\ntask specified in a given reinforcement learning (RL) environment, without\ntraining for each new task. Here we address two core limitations of FB model\ntraining. First, FB, like all successor-feature-based methods, relies on a\nlinear encoding of tasks: at test time, each new reward function is linearly\nprojected onto a fixed set of pre-trained features. This limits expressivity as\nwell as precision of the task representation. We break the linearity limitation\nby introducing auto-regressive features for FB, which let finegrained task\nfeatures depend on coarser-grained task information. This can represent\narbitrary nonlinear task encodings, thus significantly increasing expressivity\nof the FB framework. 
Second, it is well-known that training RL agents from\noffline datasets often requires specific techniques.We show that FB works well\ntogether with such offline RL techniques, by adapting techniques from (Nair et\nal.,2020b; Cetin et al., 2024) for FB. This is necessary to get non-flatlining\nperformance in some datasets, such as DMC Humanoid. As a result, we produce\nefficient FB BFMs for a number of new environments. Notably, in the D4RL\nlocomotion benchmark, the generic FB agent matches the performance of standard\nsingle-task offline agents (IQL, XQL). In many setups, the offline techniques\nare needed to get any decent performance at all. The auto-regressive features\nhave a positive but moderate impact, concentrated on tasks requiring spatial\nprecision and task generalization beyond the behaviors represented in the\ntrainset.\n","authors":["Edoardo Cetin","Ahmed Touati","Yann Ollivier"],"pdf_url":"https://arxiv.org/pdf/2412.04368v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04367v1","updated":"2024-12-05T17:35:29Z","published":"2024-12-05T17:35:29Z","title":"Machine Theory of Mind for Autonomous Cyber-Defence","summary":" Intelligent autonomous agents hold much potential for the domain of\ncyber-security. However, due to many state-of-the-art approaches relying on\nuninterpretable black-box models, there is growing demand for methods that\noffer stakeholders clear and actionable insights into their latent beliefs and\nmotivations. To address this, we evaluate Theory of Mind (ToM) approaches for\nAutonomous Cyber Operations. Upon learning a robust prior, ToM models can\npredict an agent's goals, behaviours, and contextual beliefs given only a\nhandful of past behaviour observations. 
In this paper, we introduce a novel\nGraph Neural Network (GNN)-based ToM architecture tailored for cyber-defence,\nGraph-In, Graph-Out (GIGO)-ToM, which can accurately predict both the targets\nand attack trajectories of adversarial cyber agents over arbitrary computer\nnetwork topologies. To evaluate the latter, we propose a novel extension of the\nWasserstein distance for measuring the similarity of graph-based probability\ndistributions. Whereas the standard Wasserstein distance lacks a fixed\nreference scale, we introduce a graph-theoretic normalization factor that\nenables a standardized comparison between networks of different sizes. We\nfurnish this metric, which we term the Network Transport Distance (NTD), with a\nweighting function that emphasizes predictions according to custom node\nfeatures, allowing network operators to explore arbitrary strategic\nconsiderations. Benchmarked against a Graph-In, Dense-Out (GIDO)-ToM\narchitecture in an abstract cyber-defence environment, our empirical\nevaluations show that GIGO-ToM can accurately predict the goals and behaviours\nof various unseen cyber-attacking agents across a range of network topologies,\nas well as learn embeddings that can effectively characterize their policies.\n","authors":["Luke Swaby","Matthew Stewart","Daniel Harrold","Chris Willis","Gregory Palmer"],"pdf_url":"https://arxiv.org/pdf/2412.04367v1.pdf","comment":"29 pages, 17 figures, 12 tables"},{"id":"http://arxiv.org/abs/2401.01951v2","updated":"2024-12-05T17:31:43Z","published":"2024-01-03T19:27:20Z","title":"GeoPos: A Minimal Positional Encoding for Enhanced Fine-Grained Details\n in Image Synthesis Using Convolutional Neural Networks","summary":" The enduring inability of image generative models to recreate intricate\ngeometric features, such as those present in human hands and fingers has been\nan ongoing problem in image generation for nearly a decade. 
While strides have\nbeen made by increasing model sizes and diversifying training datasets, this\nissue remains prevalent across all models, from denoising diffusion models to\nGenerative Adversarial Networks (GAN), pointing to a fundamental shortcoming in\nthe underlying architectures. In this paper, we demonstrate how this problem\ncan be mitigated by augmenting convolution layers geometric capabilities\nthrough providing them with a single input channel incorporating the relative\nn-dimensional Cartesian coordinate system. We show this drastically improves\nquality of images generated by Diffusion Models, GANs, and Variational\nAutoEncoders (VAE).\n","authors":["Mehran Hosseini","Peyman Hosseini"],"pdf_url":"https://arxiv.org/pdf/2401.01951v2.pdf","comment":"Accepted at WACV 2025. Contains 19 pages, 15 figures, and 9 tables"},{"id":"http://arxiv.org/abs/2410.01910v2","updated":"2024-12-05T17:22:21Z","published":"2024-10-02T18:09:12Z","title":"Is uniform expressivity too restrictive? Towards efficient expressivity\n of graph neural networks","summary":" Uniform expressivity guarantees that a Graph Neural Network (GNN) can express\na query without the parameters depending on the size of the input graphs. This\nproperty is desirable in applications in order to have number of trainable\nparameters that is independent of the size of the input graphs. Uniform\nexpressivity of the two variable guarded fragment (GC2) of first order logic is\na well-celebrated result for Rectified Linear Unit (ReLU) GNNs [Barcelo & al.,\n2020]. In this article, we prove that uniform expressivity of GC2 queries is\nnot possible for GNNs with a wide class of Pfaffian activation functions\n(including the sigmoid and tanh), answering a question formulated by [Grohe,\n2021]. We also show that despite these limitations, many of those GNNs can\nstill efficiently express GC2 queries in a way that the number of parameters\nremains logarithmic on the maximal degree of the input graphs. 
Furthermore, we\ndemonstrate that a log-log dependency on the degree is achievable for a certain\nchoice of activation function. This shows that uniform expressivity can be\nsuccessfully relaxed by covering large graphs appearing in practical\napplications. Our experiments illustrates that our theoretical estimates hold\nin practice.\n","authors":["Sammy Khalife","Josué Tonelli-Cueto"],"pdf_url":"https://arxiv.org/pdf/2410.01910v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.13000v2","updated":"2024-12-05T17:19:12Z","published":"2024-09-19T15:38:21Z","title":"Introducing the Large Medical Model: State of the art healthcare cost\n and risk prediction with transformers trained on patient event sequences","summary":" With U.S. healthcare spending approaching $5T (NHE Fact Sheet 2024), and 25%\nof it estimated to be wasteful (Waste in the US the health care system:\nestimated costs and potential for savings, n.d.), the need to better predict\nrisk and optimal patient care is evermore important. This paper introduces the\nLarge Medical Model (LMM), a generative pre-trained transformer (GPT) designed\nto guide and predict the broad facets of patient care and healthcare\nadministration. The model is trained on medical event sequences from over 140M\nlongitudinal patient claims records with a specialized vocabulary built from\nmedical terminology systems and demonstrates a superior capability to forecast\nhealthcare costs and identify potential risk factors. Through experimentation\nand validation, we showcase the LMM's proficiency in not only in cost and risk\npredictions, but also in discerning intricate patterns within complex medical\nconditions and an ability to identify novel relationships in patient care. The\nLMM is able to improve both cost prediction by 14.1% over the best commercial\nmodels and chronic conditions prediction by 1.9% over the best transformer\nmodels in research predicting a broad set of conditions. 
The LMM is a\nsubstantial advancement in healthcare analytics, offering the potential to\nsignificantly enhance risk assessment, cost management, and personalized\nmedicine.\n","authors":["Ricky Sahu","Eric Marriott","Ethan Siegel","David Wagner","Flore Uzan","Troy Yang","Asim Javed"],"pdf_url":"https://arxiv.org/pdf/2409.13000v2.pdf","comment":"10 pages, 10 figures"},{"id":"http://arxiv.org/abs/2412.04358v1","updated":"2024-12-05T17:17:28Z","published":"2024-12-05T17:17:28Z","title":"Approximate Top-$k$ for Increased Parallelism","summary":" We present an evaluation of bucketed approximate top-$k$ algorithms.\nComputing top-$k$ exactly suffers from limited parallelism, because the $k$\nlargest values must be aggregated along the vector, thus is not well suited to\ncomputation on highly-parallel machine learning accelerators. By relaxing the\nrequirement that the top-$k$ is exact, bucketed algorithms can dramatically\nincrease the parallelism available by independently computing many smaller\ntop-$k$ operations. We explore the design choices of this class of algorithms\nusing both theoretical analysis and empirical evaluation on downstream tasks.\nOur motivating examples are sparsity algorithms for language models, which\noften use top-$k$ to select the most important parameters or activations. 
We\nalso release a fast bucketed top-$k$ implementation for PyTorch.\n","authors":["Oscar Key","Luka Ribar","Alberto Cattaneo","Luke Hudlass-Galley","Douglas Orr"],"pdf_url":"https://arxiv.org/pdf/2412.04358v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04354v1","updated":"2024-12-05T17:12:45Z","published":"2024-12-05T17:12:45Z","title":"Multi-Scale Node Embeddings for Graph Modeling and Generation","summary":" Lying at the interface between Network Science and Machine Learning, node\nembedding algorithms take a graph as input and encode its structure onto output\nvectors that represent nodes in an abstract geometric space, enabling various\nvector-based downstream tasks such as network modelling, data compression, link\nprediction, and community detection. Two apparently unrelated limitations\naffect these algorithms. On one hand, it is not clear what the basic operation\ndefining vector spaces, i.e. the vector sum, corresponds to in terms of the\noriginal nodes in the network. On the other hand, while the same input network\ncan be represented at multiple levels of resolution by coarse-graining the\nconstituent nodes into arbitrary block-nodes, the relationship between node\nembeddings obtained at different hierarchical levels is not understood. Here,\nbuilding on recent results in network renormalization theory, we address these\ntwo limitations at once and define a multiscale node embedding method that,\nupon arbitrary coarse-grainings, ensures statistical consistency of the\nembedding vector of a block-node with the sum of the embedding vectors of its\nconstituent nodes. We illustrate the power of this approach on two economic\nnetworks that can be naturally represented at multiple resolution levels:\nnamely, the international trade between (sets of) countries and the\ninput-output flows among (sets of) industries in the Netherlands. 
We confirm\nthe statistical consistency between networks retrieved from coarse-grained node\nvectors and networks retrieved from sums of fine-grained node vectors, a result\nthat cannot be achieved by alternative methods. Several key network properties,\nincluding a large number of triangles, are successfully replicated already from\nembeddings of very low dimensionality, allowing for the generation of faithful\nreplicas of the original networks at arbitrary resolution levels.\n","authors":["Riccardo Milocco","Fabian Jansen","Diego Garlaschelli"],"pdf_url":"https://arxiv.org/pdf/2412.04354v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04353v1","updated":"2024-12-05T17:12:35Z","published":"2024-12-05T17:12:35Z","title":"ActFusion: a Unified Diffusion Model for Action Segmentation and\n Anticipation","summary":" Temporal action segmentation and long-term action anticipation are two\npopular vision tasks for the temporal analysis of actions in videos. Despite\napparent relevance and potential complementarity, these two problems have been\ninvestigated as separate and distinct tasks. In this work, we tackle these two\nproblems, action segmentation and action anticipation, jointly using a unified\ndiffusion model dubbed ActFusion. The key idea to unification is to train the\nmodel to effectively handle both visible and invisible parts of the sequence in\nan integrated manner; the visible part is for temporal segmentation, and the\ninvisible part is for future anticipation. To this end, we introduce a new\nanticipative masking strategy during training in which a late part of the video\nframes is masked as invisible, and learnable tokens replace these frames to\nlearn to predict the invisible future. Experimental results demonstrate the\nbi-directional benefits between action segmentation and anticipation. 
ActFusion\nachieves the state-of-the-art performance across the standard benchmarks of 50\nSalads, Breakfast, and GTEA, outperforming task-specific models in both of the\ntwo tasks with a single unified model through joint learning.\n","authors":["Dayoung Gong","Suha Kwak","Minsu Cho"],"pdf_url":"https://arxiv.org/pdf/2412.04353v1.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2412.04346v1","updated":"2024-12-05T17:05:49Z","published":"2024-12-05T17:05:49Z","title":"Distributionally Robust Performative Prediction","summary":" Performative prediction aims to model scenarios where predictive outcomes\nsubsequently influence the very systems they target. The pursuit of a\nperformative optimum (PO) -- minimizing performative risk -- is generally\nreliant on modeling of the distribution map, which characterizes how a deployed\nML model alters the data distribution. Unfortunately, inevitable\nmisspecification of the distribution map can lead to a poor approximation of\nthe true PO. To address this issue, we introduce a novel framework of\ndistributionally robust performative prediction and study a new solution\nconcept termed as distributionally robust performative optimum (DRPO). We show\nprovable guarantees for DRPO as a robust approximation to the true PO when the\nnominal distribution map is different from the actual one. Moreover,\ndistributionally robust performative prediction can be reformulated as an\naugmented performative prediction problem, enabling efficient optimization. 
The\nexperimental results demonstrate that DRPO offers potential advantages over\ntraditional PO approach when the distribution map is misspecified at either\nmicro- or macro-level.\n","authors":["Songkai Xue","Yuekai Sun"],"pdf_url":"https://arxiv.org/pdf/2412.04346v1.pdf","comment":"In Proceedings of the 38th Conference on Neural Information\n Processing Systems (NeurIPS) 2024"},{"id":"http://arxiv.org/abs/2410.16340v3","updated":"2024-12-05T17:03:34Z","published":"2024-10-21T09:39:10Z","title":"Limit Theorems for Stochastic Gradient Descent with Infinite Variance","summary":" Stochastic gradient descent is a classic algorithm that has gained great\npopularity especially in the last decades as the most common approach for\ntraining models in machine learning. While the algorithm has been well-studied\nwhen stochastic gradients are assumed to have a finite variance, there is\nsignificantly less research addressing its theoretical properties in the case\nof infinite variance gradients. In this paper, we establish the asymptotic\nbehavior of stochastic gradient descent in the context of infinite variance\nstochastic gradients, assuming that the stochastic gradient is regular varying\nwith index $\\alpha\\in(1,2)$. The closest result in this context was established\nin 1969 , in the one-dimensional case and assuming that stochastic gradients\nbelong to a more restrictive class of distributions. We extend it to the\nmultidimensional case, covering a broader class of infinite variance\ndistributions. As we show, the asymptotic distribution of the stochastic\ngradient descent algorithm can be characterized as the stationary distribution\nof a suitably defined Ornstein-Uhlenbeck process driven by an appropriate\nstable L\\'evy process. 
Additionally, we explore the applications of these\nresults in linear regression and logistic regression models.\n","authors":["Jose Blanchet","Aleksandar Mijatović","Wenhao Yang"],"pdf_url":"https://arxiv.org/pdf/2410.16340v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04339v1","updated":"2024-12-05T16:58:45Z","published":"2024-12-05T16:58:45Z","title":"Likelihood-Scheduled Score-Based Generative Modeling for Fully 3D PET\n Image Reconstruction","summary":" Medical image reconstruction with pre-trained score-based generative models\n(SGMs) has advantages over other existing state-of-the-art deep-learned\nreconstruction methods, including improved resilience to different scanner\nsetups and advanced image distribution modeling. SGM-based reconstruction has\nrecently been applied to simulated positron emission tomography (PET) datasets,\nshowing improved contrast recovery for out-of-distribution lesions relative to\nthe state-of-the-art. However, existing methods for SGM-based reconstruction\nfrom PET data suffer from slow reconstruction, burdensome hyperparameter tuning\nand slice inconsistency effects (in 3D). In this work, we propose a practical\nmethodology for fully 3D reconstruction that accelerates reconstruction and\nreduces the number of critical hyperparameters by matching the likelihood of an\nSGM's reverse diffusion process to a current iterate of the maximum-likelihood\nexpectation maximization algorithm. Using the example of low-count\nreconstruction from simulated $[^{18}$F]DPA-714 datasets, we show our\nmethodology can match or improve on the NRMSE and SSIM of existing\nstate-of-the-art SGM-based PET reconstruction while reducing reconstruction\ntime and the need for hyperparameter tuning. 
We evaluate our methodology\nagainst state-of-the-art supervised and conventional reconstruction algorithms.\nFinally, we demonstrate a first-ever implementation of SGM-based reconstruction\nfor real 3D PET data, specifically $[^{18}$F]DPA-714 data, where we integrate\nperpendicular pre-trained SGMs to eliminate slice inconsistency issues.\n","authors":["George Webber","Yuya Mizuno","Oliver D. Howes","Alexander Hammers","Andrew P. King","Andrew J. Reader"],"pdf_url":"https://arxiv.org/pdf/2412.04339v1.pdf","comment":"11 pages, 12 figures. Submitted to Transactions on Medical Imaging"},{"id":"http://arxiv.org/abs/2412.04327v1","updated":"2024-12-05T16:42:45Z","published":"2024-12-05T16:42:45Z","title":"Action Mapping for Reinforcement Learning in Continuous Environments\n with Constraints","summary":" Deep reinforcement learning (DRL) has had success across various domains, but\napplying it to environments with constraints remains challenging due to poor\nsample efficiency and slow convergence. Recent literature explored\nincorporating model knowledge to mitigate these problems, particularly through\nthe use of models that assess the feasibility of proposed actions. However,\nintegrating feasibility models efficiently into DRL pipelines in environments\nwith continuous action spaces is non-trivial. We propose a novel DRL training\nstrategy utilizing action mapping that leverages feasibility models to\nstreamline the learning process. By decoupling the learning of feasible actions\nfrom policy optimization, action mapping allows DRL agents to focus on\nselecting the optimal action from a reduced feasible action set. We demonstrate\nthrough experiments that action mapping significantly improves training\nperformance in constrained environments with continuous action spaces,\nespecially with imperfect feasibility models.\n","authors":["Mirco Theile","Lukas Dirnberger","Raphael Trumpp","Marco Caccamo","Alberto L. 
Sangiovanni-Vincentelli"],"pdf_url":"https://arxiv.org/pdf/2412.04327v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04323v1","updated":"2024-12-05T16:39:01Z","published":"2024-12-05T16:39:01Z","title":"GRAM: Generalization in Deep RL with a Robust Adaptation Module","summary":" The reliable deployment of deep reinforcement learning in real-world settings\nrequires the ability to generalize across a variety of conditions, including\nboth in-distribution scenarios seen during training as well as novel\nout-of-distribution scenarios. In this work, we present a framework for\ndynamics generalization in deep reinforcement learning that unifies these two\ndistinct types of generalization within a single architecture. We introduce a\nrobust adaptation module that provides a mechanism for identifying and reacting\nto both in-distribution and out-of-distribution environment dynamics, along\nwith a joint training pipeline that combines the goals of in-distribution\nadaptation and out-of-distribution robustness. Our algorithm GRAM achieves\nstrong generalization performance across in-distribution and\nout-of-distribution scenarios upon deployment, which we demonstrate on a\nvariety of realistic simulated locomotion tasks with a quadruped robot.\n","authors":["James Queeney","Xiaoyi Cai","Mouhacine Benosman","Jonathan P. How"],"pdf_url":"https://arxiv.org/pdf/2412.04323v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02951v2","updated":"2024-12-05T16:35:46Z","published":"2023-10-04T16:41:36Z","title":"A Fisher-Rao gradient flow for entropy-regularised Markov decision\n processes in Polish spaces","summary":" We study the global convergence of a Fisher-Rao policy gradient flow for\ninfinite-horizon entropy-regularised Markov decision processes with Polish\nstate and action space. The flow is a continuous-time analogue of a policy\nmirror descent method. 
We establish the global well-posedness of the gradient\nflow and demonstrate its exponential convergence to the optimal policy.\nMoreover, we prove the flow is stable with respect to gradient evaluation,\noffering insights into the performance of a natural policy gradient flow with\nlog-linear policy parameterisation. To overcome challenges stemming from the\nlack of the convexity of the objective function and the discontinuity arising\nfrom the entropy regulariser, we leverage the performance difference lemma and\nthe duality relationship between the gradient and mirror descent flows. Our\nanalysis provides a theoretical foundation for developing various discrete\npolicy gradient algorithms.\n","authors":["Bekzhan Kerimkulov","James-Michael Leahy","David Siska","Lukasz Szpruch","Yufei Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.02951v2.pdf","comment":"add discretizations of gradient flow and their convergence analysis"},{"id":"http://arxiv.org/abs/2412.04319v1","updated":"2024-12-05T16:35:43Z","published":"2024-12-05T16:35:43Z","title":"Generative-Model-Based Fully 3D PET Image Reconstruction by Conditional\n Diffusion Sampling","summary":" Score-based generative models (SGMs) have recently shown promising results\nfor image reconstruction on simulated positron emission tomography (PET)\ndatasets. In this work we have developed and implemented practical methodology\nfor 3D image reconstruction with SGMs, and perform (to our knowledge) the first\nSGM-based reconstruction of real fully 3D PET data. We train an SGM on\nfull-count reference brain images, and extend methodology to allow SGM-based\nreconstructions at very low counts (1% of original, to simulate low-dose or\nshort-duration scanning). We then perform reconstructions for multiple\nindependent realisations of 1% count data, allowing us to analyse the bias and\nvariance characteristics of the method. 
We sample from the learned posterior\ndistribution of the generative algorithm to calculate uncertainty images for\nour reconstructions. We evaluate the method's performance on real full- and\nlow-count PET data and compare with conventional OSEM and MAP-EM baselines,\nshowing that our SGM-based low-count reconstructions match full-dose\nreconstructions more closely and in a bias-variance trade-off comparison, our\nSGM-reconstructed images have lower variance than existing baselines. Future\nwork will compare to supervised deep-learned methods, with other avenues for\ninvestigation including how data conditioning affects the SGM's posterior\ndistribution and the algorithm's performance with different tracers.\n","authors":["George Webber","Yuya Mizuno","Oliver D. Howes","Alexander Hammers","Andrew P. King","Andrew J. Reader"],"pdf_url":"https://arxiv.org/pdf/2412.04319v1.pdf","comment":"2 pages, 2 figures. Accepted for oral presentation at IEEE NSS MIC\n RTSD 2024 (submitted May 2024; accepted July 2024; presented Nov 2024)"},{"id":"http://arxiv.org/abs/2311.12068v3","updated":"2024-12-05T16:34:21Z","published":"2023-11-19T17:28:28Z","title":"Enhancing Novel Object Detection via Cooperative Foundational Models","summary":" In this work, we address the challenging and emergent problem of novel object\ndetection (NOD), focusing on the accurate detection of both known and novel\nobject categories during inference. Traditional object detection algorithms are\ninherently closed-set, limiting their capability to handle NOD. We present a\nnovel approach to transform existing closed-set detectors into open-set\ndetectors. This transformation is achieved by leveraging the complementary\nstrengths of pre-trained foundational models, specifically CLIP and SAM,\nthrough our cooperative mechanism. Furthermore, by integrating this mechanism\nwith state-of-the-art open-set detectors such as GDINO, we establish new\nbenchmarks in object detection performance. 
Our method achieves 17.42 mAP in\nnovel object detection and 42.08 mAP for known objects on the challenging LVIS\ndataset. Adapting our approach to the COCO OVD split, we surpass the current\nstate-of-the-art by a margin of 7.2 $ \\text{AP}_{50} $ for novel classes. Our\ncode is available at https://rohit901.github.io/coop-foundation-models/ .\n","authors":["Rohit Bharadwaj","Muzammal Naseer","Salman Khan","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2311.12068v3.pdf","comment":"Accepted at WACV 2025"},{"id":"http://arxiv.org/abs/2412.04309v1","updated":"2024-12-05T16:27:59Z","published":"2024-12-05T16:27:59Z","title":"The Tile: A 2D Map of Ranking Scores for Two-Class Classification","summary":" In the computer vision and machine learning communities, as well as in many\nother research domains, rigorous evaluation of any new method, including\nclassifiers, is essential. One key component of the evaluation process is the\nability to compare and rank methods. However, ranking classifiers and\naccurately comparing their performances, especially when taking\napplication-specific preferences into account, remains challenging. For\ninstance, commonly used evaluation tools like Receiver Operating Characteristic\n(ROC) and Precision/Recall (PR) spaces display performances based on two\nscores. Hence, they are inherently limited in their ability to compare\nclassifiers across a broader range of scores and lack the capability to\nestablish a clear ranking among classifiers. In this paper, we present a novel\nversatile tool, named the Tile, that organizes an infinity of ranking scores in\na single 2D map for two-class classifiers, including common evaluation scores\nsuch as the accuracy, the true positive rate, the positive predictive value,\nJaccard's coefficient, and all F-beta scores. 
Furthermore, we study the\nproperties of the underlying ranking scores, such as the influence of the\npriors or the correspondences with the ROC space, and depict how to\ncharacterize any other score by comparing them to the Tile. Overall, we\ndemonstrate that the Tile is a powerful tool that effectively captures all the\nrankings in a single visualization and allows interpreting them.\n","authors":["Sébastien Piérard","Anaïs Halin","Anthony Cioppa","Adrien Deliège","Marc Van Droogenbroeck"],"pdf_url":"https://arxiv.org/pdf/2412.04309v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.18825v2","updated":"2024-12-05T16:27:08Z","published":"2024-11-27T23:58:32Z","title":"ELEMENTAL: Interactive Learning from Demonstrations and Vision-Language\n Models for Reward Design in Robotics","summary":" Reinforcement learning (RL) has demonstrated compelling performance in\nrobotic tasks, but its success often hinges on the design of complex, ad hoc\nreward functions. Researchers have explored how Large Language Models (LLMs)\ncould enable non-expert users to specify reward functions more easily. However,\nLLMs struggle to balance the importance of different features, generalize\npoorly to out-of-distribution robotic tasks, and cannot represent the problem\nproperly with only text-based descriptions. To address these challenges, we\npropose ELEMENTAL (intEractive LEarning froM dEmoNstraTion And Language), a\nnovel framework that combines natural language guidance with visual user\ndemonstrations to align robot behavior with user intentions better. By\nincorporating visual inputs, ELEMENTAL overcomes the limitations of text-only\ntask specifications, while leveraging inverse reinforcement learning (IRL) to\nbalance feature weights and match the demonstrated behaviors optimally.\nELEMENTAL also introduces an iterative feedback-loop through self-reflection to\nimprove feature, reward, and policy learning. 
Our experiment results\ndemonstrate that ELEMENTAL outperforms prior work by 42.3% on task success, and\nachieves 41.3% better generalization in out-of-distribution tasks, highlighting\nits robustness in LfD.\n","authors":["Letian Chen","Matthew Gombolay"],"pdf_url":"https://arxiv.org/pdf/2411.18825v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04305v1","updated":"2024-12-05T16:26:31Z","published":"2024-12-05T16:26:31Z","title":"ALMA: Alignment with Minimal Annotation","summary":" Recent approaches to large language model (LLM) alignment typically require\nmillions of human annotations or rely on external aligned models for synthetic\ndata generation. This paper introduces ALMA: Alignment with Minimal Annotation,\ndemonstrating that effective alignment can be achieved using only 9,000 labeled\nexamples -- less than 1% of conventional approaches. ALMA generates large\namounts of high-quality synthetic alignment data through new techniques:\ndiverse prompt synthesis via few-shot learning, diverse response generation\nwith multiple model checkpoints, and judge (reward model) enhancement through\nscore aggregation and self-distillation. Using only a pretrained Llama3 base\nmodel, 5,000 SFT examples, and 4,000 judge annotations, ALMA achieves\nperformance close to Llama3-Instruct across diverse alignment benchmarks (e.g.,\n0.1% difference on AlpacaEval 2.0 score). These results are achieved with a\nmulti-round, self-bootstrapped data synthesis and training recipe that\ncontinues to improve for 10 rounds, surpassing the typical 3-round ceiling of\nprevious methods. 
These results suggest that base models already possess\nsufficient knowledge for effective alignment, and that synthetic data\ngeneration methods can expose it.\n","authors":["Michihiro Yasunaga","Leonid Shamis","Chunting Zhou","Andrew Cohen","Jason Weston","Luke Zettlemoyer","Marjan Ghazvininejad"],"pdf_url":"https://arxiv.org/pdf/2412.04305v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17978v2","updated":"2024-12-05T16:24:15Z","published":"2024-09-26T15:52:36Z","title":"HydraViT: Stacking Heads for a Scalable ViT","summary":" The architecture of Vision Transformers (ViTs), particularly the Multi-head\nAttention (MHA) mechanism, imposes substantial hardware demands. Deploying ViTs\non devices with varying constraints, such as mobile phones, requires multiple\nmodels of different sizes. However, this approach has limitations, such as\ntraining and storing each required model separately. This paper introduces\nHydraViT, a novel approach that addresses these limitations by stacking\nattention heads to achieve a scalable ViT. By repeatedly changing the size of\nthe embedded dimensions throughout each layer and their corresponding number of\nattention heads in MHA during training, HydraViT induces multiple subnetworks.\nThereby, HydraViT achieves adaptability across a wide spectrum of hardware\nenvironments while maintaining performance. Our experimental results\ndemonstrate the efficacy of HydraViT in achieving a scalable ViT with up to 10\nsubnetworks, covering a wide range of resource constraints. HydraViT achieves\nup to 5 p.p. more accuracy with the same GMACs and up to 7 p.p. more accuracy\nwith the same throughput on ImageNet-1K compared to the baselines, making it an\neffective solution for scenarios where hardware availability is diverse or\nvaries over time. 
Source code available at https://github.com/ds-kiel/HydraViT.\n","authors":["Janek Haberer","Ali Hojjat","Olaf Landsiedel"],"pdf_url":"https://arxiv.org/pdf/2409.17978v2.pdf","comment":"Accepted at NeurIPS'24, please cite the conference version"},{"id":"http://arxiv.org/abs/2412.04296v1","updated":"2024-12-05T16:15:32Z","published":"2024-12-05T16:15:32Z","title":"Structure-Aware Stylized Image Synthesis for Robust Medical Image\n Segmentation","summary":" Accurate medical image segmentation is essential for effective diagnosis and\ntreatment planning but is often challenged by domain shifts caused by\nvariations in imaging devices, acquisition conditions, and patient-specific\nattributes. Traditional domain generalization methods typically require\ninclusion of parts of the test domain within the training set, which is not\nalways feasible in clinical settings with limited diverse data. Additionally,\nalthough diffusion models have demonstrated strong capabilities in image\ngeneration and style transfer, they often fail to preserve the critical\nstructural information necessary for precise medical analysis. To address these\nissues, we propose a novel medical image segmentation method that combines\ndiffusion models and Structure-Preserving Network for structure-aware one-shot\nimage stylization. Our approach effectively mitigates domain shifts by\ntransforming images from various sources into a consistent style while\nmaintaining the location, size, and shape of lesions. This ensures robust and\naccurate segmentation even when the target domain is absent from the training\ndata. Experimental evaluations on colonoscopy polyp segmentation and skin\nlesion segmentation datasets show that our method enhances the robustness and\naccuracy of segmentation models, achieving superior performance metrics\ncompared to baseline models without style transfer. 
This structure-aware\nstylization framework offers a practical solution for improving medical image\nsegmentation across diverse domains, facilitating more reliable clinical\ndiagnoses.\n","authors":["Jie Bao","Zhixin Zhou","Wen Jung Li","Rui Luo"],"pdf_url":"https://arxiv.org/pdf/2412.04296v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04285v1","updated":"2024-12-05T16:06:23Z","published":"2024-12-05T16:06:23Z","title":"Deep Causal Inference for Point-referenced Spatial Data with Continuous\n Treatments","summary":" Causal reasoning is often challenging with spatial data, particularly when\nhandling high-dimensional inputs. To address this, we propose a neural network\n(NN) based framework integrated with an approximate Gaussian process to manage\nspatial interference and unobserved confounding. Additionally, we adopt a\ngeneralized propensity-score-based approach to address partially observed\noutcomes when estimating causal effects with continuous treatments. We evaluate\nour framework using synthetic, semi-synthetic, and real-world data inferred\nfrom satellite imagery. Our results demonstrate that NN-based models\nsignificantly outperform linear spatial regression models in estimating causal\neffects. Furthermore, in real-world case studies, NN-based models offer more\nreasonable predictions of causal effects, facilitating decision-making in\nrelevant applications.\n","authors":["Ziyang Jiang","Zach Calhoun","Yiling Liu","Lei Duan","David Carlson"],"pdf_url":"https://arxiv.org/pdf/2412.04285v1.pdf","comment":"16 pages, 4 figures, 5 tables"},{"id":"http://arxiv.org/abs/2411.15046v2","updated":"2024-12-05T16:04:02Z","published":"2024-11-22T16:31:36Z","title":"On Multi-Agent Inverse Reinforcement Learning","summary":" In multi-agent systems, the agent behavior is highly influenced by its\nutility function, as these utilities shape both individual goals as well as\ninteractions with the other agents. 
Inverse Reinforcement Learning (IRL) is a\nwell-established approach to inferring the utility function by observing an\nexpert behavior within a given environment. In this paper, we extend the IRL\nframework to the multi-agent setting, assuming to observe agents who are\nfollowing Nash Equilibrium (NE) policies. We theoretically investigate the set\nof utilities that explain the behavior of NE experts. Specifically, we provide\nan explicit characterization of the feasible reward set and analyze how errors\nin estimating the transition dynamics and expert behavior impact the recovered\nrewards. Building on these findings, we provide the first sample complexity\nanalysis for the multi-agent IRL problem. Finally, we provide a numerical\nevaluation of our theoretical results.\n","authors":["Till Freihaut","Giorgia Ramponi"],"pdf_url":"https://arxiv.org/pdf/2411.15046v2.pdf","comment":"Currently under review"},{"id":"http://arxiv.org/abs/2412.04274v1","updated":"2024-12-05T15:56:54Z","published":"2024-12-05T15:56:54Z","title":"Complexity of Vector-valued Prediction: From Linear Models to Stochastic\n Convex Optimization","summary":" We study the problem of learning vector-valued linear predictors: these are\nprediction rules parameterized by a matrix that maps an $m$-dimensional feature\nvector to a $k$-dimensional target. We focus on the fundamental case with a\nconvex and Lipschitz loss function, and show several new theoretical results\nthat shed light on the complexity of this problem and its connection to related\nlearning models. 
First, we give a tight characterization of the sample\ncomplexity of Empirical Risk Minimization (ERM) in this setting, establishing\nthat $\\smash{\\widetilde{\\Omega}}(k/\\epsilon^2)$ examples are necessary for ERM\nto reach $\\epsilon$ excess (population) risk; this provides for an exponential\nimprovement over recent results by Magen and Shamir (2023) in terms of the\ndependence on the target dimension $k$, and matches a classical upper bound due\nto Maurer (2016). Second, we present a black-box conversion from general\n$d$-dimensional Stochastic Convex Optimization (SCO) to vector-valued linear\nprediction, showing that any SCO problem can be embedded as a prediction\nproblem with $k=\\Theta(d)$ outputs. These results portray the setting of\nvector-valued linear prediction as bridging between two extensively studied yet\ndisparate learning models: linear models (corresponds to $k=1$) and general\n$d$-dimensional SCO (with $k=\\Theta(d)$).\n","authors":["Matan Schliserman","Tomer Koren"],"pdf_url":"https://arxiv.org/pdf/2412.04274v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04273v1","updated":"2024-12-05T15:55:23Z","published":"2024-12-05T15:55:23Z","title":"Reinforcement Learning from Wild Animal Videos","summary":" We propose to learn legged robot locomotion skills by watching thousands of\nwild animal videos from the internet, such as those featured in nature\ndocumentaries. Indeed, such videos offer a rich and diverse collection of\nplausible motion examples, which could inform how robots should move. To\nachieve this, we introduce Reinforcement Learning from Wild Animal Videos\n(RLWAV), a method to ground these motions into physical robots. We first train\na video classifier on a large-scale animal video dataset to recognize actions\nfrom RGB clips of animals in their natural habitats. 
We then train a\nmulti-skill policy to control a robot in a physics simulator, using the\nclassification score of a third-person camera capturing videos of the robot's\nmovements as a reward for reinforcement learning. Finally, we directly transfer\nthe learned policy to a real quadruped Solo. Remarkably, despite the extreme\ngap in both domain and embodiment between animals in the wild and robots, our\napproach enables the policy to learn diverse skills such as walking, jumping,\nand keeping still, without relying on reference trajectories nor skill-specific\nrewards.\n","authors":["Elliot Chane-Sane","Constant Roux","Olivier Stasse","Nicolas Mansard"],"pdf_url":"https://arxiv.org/pdf/2412.04273v1.pdf","comment":"Project website: https://elliotchanesane31.github.io/RLWAV/"},{"id":"http://arxiv.org/abs/2405.20331v2","updated":"2024-12-05T15:48:24Z","published":"2024-05-30T17:59:04Z","title":"CoSy: Evaluating Textual Explanations of Neurons","summary":" A crucial aspect of understanding the complex nature of Deep Neural Networks\n(DNNs) is the ability to explain learned concepts within their latent\nrepresentations. While methods exist to connect neurons to human-understandable\ntextual descriptions, evaluating the quality of these explanations is\nchallenging due to the lack of a unified quantitative approach. We introduce\nCoSy (Concept Synthesis), a novel, architecture-agnostic framework for\nevaluating textual explanations of latent neurons. Given textual explanations,\nour proposed framework uses a generative model conditioned on textual input to\ncreate data points representing the explanations. By comparing the neuron's\nresponse to these generated data points and control data points, we can\nestimate the quality of the explanation. 
We validate our framework through\nsanity checks and benchmark various neuron description methods for Computer\nVision tasks, revealing significant differences in quality.\n","authors":["Laura Kopf","Philine Lou Bommer","Anna Hedström","Sebastian Lapuschkin","Marina M. -C. Höhne","Kirill Bykov"],"pdf_url":"https://arxiv.org/pdf/2405.20331v2.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2411.02137v2","updated":"2024-12-05T15:46:44Z","published":"2024-11-04T14:50:15Z","title":"Finite-sample performance of the maximum likelihood estimator in\n logistic regression","summary":" Logistic regression is a classical model for describing the probabilistic\ndependence of binary responses to multivariate covariates. We consider the\npredictive performance of the maximum likelihood estimator (MLE) for logistic\nregression, assessed in terms of logistic risk. We consider two questions:\nfirst, that of the existence of the MLE (which occurs when the dataset is not\nlinearly separated), and second that of its accuracy when it exists. These\nproperties depend on both the dimension of covariates and on the signal\nstrength. In the case of Gaussian covariates and a well-specified logistic\nmodel, we obtain sharp non-asymptotic guarantees for the existence and excess\nlogistic risk of the MLE. We then generalize these results in two ways: first,\nto non-Gaussian covariates satisfying a certain two-dimensional margin\ncondition, and second to the general case of statistical learning with a\npossibly misspecified logistic model. Finally, we consider the case of a\nBernoulli design, where the behavior of the MLE is highly sensitive to the\nparameter direction.\n","authors":["Hugo Chardon","Matthieu Lerasle","Jaouad Mourtada"],"pdf_url":"https://arxiv.org/pdf/2411.02137v2.pdf","comment":"Simplified some statements and added a proof sketch in Sec. 
4"},{"id":"http://arxiv.org/abs/2412.04262v1","updated":"2024-12-05T15:42:59Z","published":"2024-12-05T15:42:59Z","title":"SynFinTabs: A Dataset of Synthetic Financial Tables for Information and\n Table Extraction","summary":" Table extraction from document images is a challenging AI problem, and\nlabelled data for many content domains is difficult to come by. Existing table\nextraction datasets often focus on scientific tables due to the vast amount of\nacademic articles that are readily available, along with their source code.\nHowever, there are significant layout and typographical differences between\ntables found across scientific, financial, and other domains. Current datasets\noften lack the words, and their positions, contained within the tables, instead\nrelying on unreliable OCR to extract these features for training modern machine\nlearning models on natural language processing tasks. Therefore, there is a\nneed for a more general method of obtaining labelled data. We present\nSynFinTabs, a large-scale, labelled dataset of synthetic financial tables. Our\nhope is that our method of generating these synthetic tables is transferable to\nother domains. To demonstrate the effectiveness of our dataset in training\nmodels to extract information from table images, we create FinTabQA, a layout\nlarge language model trained on an extractive question-answering task. We test\nour model using real-world financial tables and compare it to a\nstate-of-the-art generative model and discuss the results. 
We make the dataset,\nmodel, and dataset generation code publicly available.\n","authors":["Ethan Bradley","Muhammad Roman","Karen Rafferty","Barry Devereux"],"pdf_url":"https://arxiv.org/pdf/2412.04262v1.pdf","comment":"12 pages, 8 figures"},{"id":"http://arxiv.org/abs/2412.04259v1","updated":"2024-12-05T15:39:13Z","published":"2024-12-05T15:39:13Z","title":"SCADE: Scalable Command-line Anomaly Detection Engine","summary":" As command-line interfaces remain an integral part of high-computation\nenvironments, the risk of exploitation through stealthy, complex command-line\nabuse continues to grow. Conventional security solutions often struggle with\nthese command-line-based anomalies due to their context-specific nature and\nlack of labeled data, especially in detecting rare, malicious patterns amidst\nlegitimate, high-volume activity. This gap has left organizations vulnerable to\nsophisticated threats like Living-off-the-Land (LOL) attacks, where standard\ndetection tools frequently miss or misclassify anomalous command-line behavior.\nWe introduce Scalable Command-Line Anomaly Detection Engine (SCADE), who\naddresses these challenges by introducing a dual-layered detection framework\nthat combines a global statistical analysis with local context-specific anomaly\ndetection, innovatively using a novel ensemble of statistical models such as\nBM25 and Log Entropy, adapted for command-line data. The framework also\nfeatures a dynamic thresholding mechanism for adaptive anomaly detection,\nensuring high precision and recall even in environments with extremely high\nSignal-to-Noise Ratios (SNRs). Initial experimental results demonstrate the\neffectiveness of the framework, achieving above 98% SNR in identifying unusual\ncommand-line behavior while minimizing false positives. In this paper, we\npresent SCADE's core architecture, including its metadata-enriched approach to\nanomaly detection and the design choices behind its scalability for\nenterprise-level deployment. 
We argue that SCADE represents a significant\nadvancement in command-line anomaly detection, offering a robust, adaptive\nframework for security analysts and researchers seeking to enhance detection\naccuracy in high-computation environments.\n","authors":["Vaishali Vinay","Anjali Mangal"],"pdf_url":"https://arxiv.org/pdf/2412.04259v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.17010v2","updated":"2024-12-05T15:33:29Z","published":"2024-03-25T17:59:59Z","title":"Calib3D: Calibrating Model Preferences for Reliable 3D Scene\n Understanding","summary":" Safety-critical 3D scene understanding tasks necessitate not only accurate\nbut also confident predictions from 3D perception models. This study introduces\nCalib3D, a pioneering effort to benchmark and scrutinize the reliability of 3D\nscene understanding models from an uncertainty estimation viewpoint. We\ncomprehensively evaluate 28 state-of-the-art models across 10 diverse 3D\ndatasets, uncovering insightful phenomena that cope with both the aleatoric and\nepistemic uncertainties in 3D scene understanding. We discover that despite\nachieving impressive levels of accuracy, existing models frequently fail to\nprovide reliable uncertainty estimates -- a pitfall that critically undermines\ntheir applicability in safety-sensitive contexts. Through extensive analysis of\nkey factors such as network capacity, LiDAR representations, rasterization\nresolutions, and 3D data augmentation techniques, we correlate these aspects\ndirectly with the model calibration efficacy. Furthermore, we introduce DeptS,\na novel depth-aware scaling approach aimed at enhancing 3D model calibration.\nExtensive experiments across a wide range of configurations validate the\nsuperiority of our method. We hope this work could serve as a cornerstone for\nfostering reliable 3D scene understanding. 
Code and benchmark toolkit are\npublicly available.\n","authors":["Lingdong Kong","Xiang Xu","Jun Cen","Wenwei Zhang","Liang Pan","Kai Chen","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2403.17010v2.pdf","comment":"WACV 2025; 26 pages, 8 figures, 12 tables; Code at\n https://github.com/ldkong1205/Calib3D"},{"id":"http://arxiv.org/abs/2404.12294v3","updated":"2024-12-05T15:27:14Z","published":"2024-04-18T16:16:02Z","title":"Bayesian evidence estimation from posterior samples with normalizing\n flows","summary":" We propose a novel method ($floZ$), based on normalizing flows, to estimate\nthe Bayesian evidence (and its numerical uncertainty) from a pre-existing set\nof samples drawn from the unnormalized posterior distribution. We validate it\non distributions whose evidence is known analytically, up to 15 parameter space\ndimensions, and compare with two state-of-the-art techniques for estimating the\nevidence: nested sampling (which computes the evidence as its main target) and\na $k$-nearest-neighbors technique that produces evidence estimates from\nposterior samples. Provided representative samples from the target posterior\nare available, our method is more robust to posterior distributions with sharp\nfeatures, especially in higher dimensions. For a simple multivariate Gaussian,\nwe demonstrate its accuracy for up to 200 dimensions with $10^5$ posterior\nsamples. $floZ$ has wide applicability, e.g., to estimate evidence from\nvariational inference, Markov Chain Monte Carlo samples, or any other method\nthat delivers samples and their likelihood from the unnormalized posterior\ndensity. 
As a physical application, we use $floZ$ to compute the Bayes factor\nfor the presence of the first overtone in the ringdown signal of the\ngravitational wave data of GW150914, finding good agreement with nested\nsampling.\n","authors":["Rahul Srinivasan","Marco Crisostomi","Roberto Trotta","Enrico Barausse","Matteo Breschi"],"pdf_url":"https://arxiv.org/pdf/2404.12294v3.pdf","comment":"15 pages, 8 figures, 1 table"},{"id":"http://arxiv.org/abs/2412.04243v1","updated":"2024-12-05T15:25:51Z","published":"2024-12-05T15:25:51Z","title":"Quantifying the Limits of Segment Anything Model: Analyzing Challenges\n in Segmenting Tree-Like and Low-Contrast Structures","summary":" Segment Anything Model (SAM) has shown impressive performance in interactive\nand zero-shot segmentation across diverse domains, suggesting that they have\nlearned a general concept of \"objects\" from their large-scale training.\nHowever, we observed that SAM struggles with certain types of objects,\nparticularly those featuring dense, tree-like structures and low textural\ncontrast from their surroundings. These failure modes are critical for\nunderstanding its limitations in real-world use. In order to systematically\nexamine this issue, we propose metrics to quantify two key object\ncharacteristics: tree-likeness and textural separability. Through extensive\ncontrolled synthetic experiments and testing on real datasets, we demonstrate\nthat SAM's performance is noticeably correlated with these factors. We link\nthese behaviors under the concept of \"textural confusion\", where SAM\nmisinterprets local structure as global texture, leading to over-segmentation,\nor struggles to differentiate objects from similarly textured backgrounds.\nThese findings offer the first quantitative framework to model SAM's\nchallenges, providing valuable insights into its limitations and guiding future\nimprovements for vision foundation models.\n","authors":["Yixin Zhang","Nicholas Konz","Kevin Kramer","Maciej A. 
Mazurowski"],"pdf_url":"https://arxiv.org/pdf/2412.04243v1.pdf","comment":"Code: https://github.com/mazurowski-lab/SAM-TexturalConfusion-Metrics"},{"id":"http://arxiv.org/abs/2412.04242v1","updated":"2024-12-05T15:25:18Z","published":"2024-12-05T15:25:18Z","title":"LMDM:Latent Molecular Diffusion Model For 3D Molecule Generation","summary":" n this work, we propose a latent molecular diffusion model that can make the\ngenerated 3D molecules rich in diversity and maintain rich geometric features.\nThe model captures the information of the forces and local constraints between\natoms so that the generated molecules can maintain Euclidean transformation and\nhigh level of effectiveness and diversity. We also use the lowerrank manifold\nadvantage of the latent variables of the latent model to fuse the information\nof the forces between atoms to better maintain the geometric equivariant\nproperties of the molecules. Because there is no need to perform information\nfusion encoding in stages like traditional encoders and decoders, this reduces\nthe amount of calculation in the back-propagation process. The model keeps the\nforces and local constraints of particle bonds in the latent variable space,\nreducing the impact of underfitting on the surface of the network on the large\nposition drift of the particle geometry, so that our model can converge\nearlier. We introduce a distribution control variable in each backward step to\nstrengthen exploration and improve the diversity of generation. 
In the\nexperiment, the quality of the samples we generated and the convergence speed\nof the model have been significantly improved.\n","authors":["Xiang Chen"],"pdf_url":"https://arxiv.org/pdf/2412.04242v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2209.05710 by other authors"},{"id":"http://arxiv.org/abs/2410.14086v2","updated":"2024-12-05T15:24:33Z","published":"2024-10-17T23:37:34Z","title":"In-context learning and Occam's razor","summary":" A central goal of machine learning is generalization. While the No Free Lunch\nTheorem states that we cannot obtain theoretical guarantees for generalization\nwithout further assumptions, in practice we observe that simple models which\nexplain the training data generalize best: a principle called Occam's razor.\nDespite the need for simple models, most current approaches in machine learning\nonly minimize the training error, and at best indirectly promote simplicity\nthrough regularization or architecture design. Here, we draw a connection\nbetween Occam's razor and in-context learning: an emergent ability of certain\nsequence models like Transformers to learn at inference time from past\nobservations in a sequence. In particular, we show that the next-token\nprediction loss used to train in-context learners is directly equivalent to a\ndata compression technique called prequential coding, and that minimizing this\nloss amounts to jointly minimizing both the training error and the complexity\nof the model that was implicitly learned from context. Our theory and the\nempirical experiments we use to support it not only provide a normative account\nof in-context learning, but also elucidate the shortcomings of current\nin-context learning methods, suggesting ways in which they can be improved. 
We\nmake our code available at https://github.com/3rdCore/PrequentialCode.\n","authors":["Eric Elmoznino","Tom Marty","Tejas Kasetty","Leo Gagnon","Sarthak Mittal","Mahan Fathi","Dhanya Sridhar","Guillaume Lajoie"],"pdf_url":"https://arxiv.org/pdf/2410.14086v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.08339v3","updated":"2024-12-05T15:23:20Z","published":"2022-10-15T17:15:53Z","title":"Reachable Polyhedral Marching (RPM): An Exact Analysis Tool for\n Deep-Learned Control Systems","summary":" Neural networks are increasingly used in robotics as policies, state\ntransition models, state estimation models, or all of the above. With these\ncomponents being learned from data, it is important to be able to analyze what\nbehaviors were learned and how this affects closed-loop performance. In this\npaper we take steps toward this goal by developing methods for computing\ncontrol invariant sets and regions of attraction (ROAs) of dynamical systems\nrepresented as neural networks. We focus our attention on feedforward neural\nnetworks with the rectified linear unit (ReLU) activation, which are known to\nimplement continuous piecewise-affine (PWA) functions. We describe the\nReachable Polyhedral Marching (RPM) algorithm for enumerating the affine pieces\nof a neural network through an incremental connected walk. We then use this\nalgorithm to compute exact forward and backward reachable sets, from which we\nprovide methods for computing control invariant sets and ROAs. Our approach is\nunique in that we find these sets incrementally, without Lyapunov-based tools.\nIn our examples we demonstrate the ability of our approach to find non-convex\ncontrol invariant sets and ROAs on tasks with learned van der Pol oscillator\nand pendulum models. Further, we provide an accelerated algorithm for computing\nROAs that leverages the incremental and connected enumeration of affine regions\nthat RPM provides. We show this acceleration to lead to a 15x speedup in our\nexamples. 
Finally, we apply our methods to find a set of states that are\nstabilized by an image-based controller for an aircraft runway control problem.\n","authors":["Joseph A. Vincent","Mac Schwager"],"pdf_url":"https://arxiv.org/pdf/2210.08339v3.pdf","comment":"Submitted to IEEE Transactions on Neural Networks and Learning\n Systems. arXiv admin note: text overlap with arXiv:2011.11609"},{"id":"http://arxiv.org/abs/2410.14817v2","updated":"2024-12-05T15:20:28Z","published":"2024-10-18T18:37:27Z","title":"A Complexity-Based Theory of Compositionality","summary":" Compositionality is believed to be fundamental to intelligence. In humans, it\nunderlies the structure of thought, language, and higher-level reasoning. In\nAI, compositional representations can enable a powerful form of\nout-of-distribution generalization, in which a model systematically adapts to\nnovel combinations of known concepts. However, while we have strong intuitions\nabout what compositionality is, there currently exists no formal definition for\nit that is measurable and mathematical. Here, we propose such a definition,\nwhich we call representational compositionality, that accounts for and extends\nour intuitions about compositionality. The definition is conceptually simple,\nquantitative, grounded in algorithmic information theory, and applicable to any\nrepresentation. Intuitively, representational compositionality states that a\ncompositional representation satisfies three properties. First, it must be\nexpressive. Second, it must be possible to re-describe the representation as a\nfunction of discrete symbolic sequences with re-combinable parts, analogous to\nsentences in natural language. Third, the function that relates these symbolic\nsequences to the representation, analogous to semantics in natural language,\nmust be simple. 
Through experiments on both synthetic and real world data, we\nvalidate our definition of compositionality and show how it unifies disparate\nintuitions from across the literature in both AI and cognitive science. We also\nshow that representational compositionality, while theoretically intractable,\ncan be readily estimated using standard deep learning tools. Our definition has\nthe potential to inspire the design of novel, theoretically-driven models that\nbetter capture the mechanisms of compositional thought.\n","authors":["Eric Elmoznino","Thomas Jiralerspong","Yoshua Bengio","Guillaume Lajoie"],"pdf_url":"https://arxiv.org/pdf/2410.14817v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04236v1","updated":"2024-12-05T15:14:16Z","published":"2024-12-05T15:14:16Z","title":"A History of Philosophy in Colombia through Topic Modelling","summary":" Data-driven approaches to philosophy have emerged as a valuable tool for\nstudying the history of the discipline. However, most studies in this area have\nfocused on a limited number of journals from specific regions and subfields. We\nexpand the scope of this research by applying dynamic topic modelling\ntechniques to explore the history of philosophy in Colombia and Latin America.\nOur study examines the Colombian philosophy journal Ideas y Valores, founded in\n1951 and currently one of the most influential academic philosophy journals in\nthe region. By analyzing the evolution of topics across the journal's history,\nwe identify various trends and specific dynamics in philosophical discourse\nwithin the Colombian and Latin American context. Our findings reveal that the\nmost prominent topics are value theory (including ethics, political philosophy,\nand aesthetics), epistemology, and the philosophy of science. 
We also trace the\nevolution of articles focusing on the historical and interpretive aspects of\nphilosophical texts, and we note a notable emphasis on German philosophers such\nas Kant, Husserl, and Hegel on various topics throughout the journal's\nlifetime. Additionally, we investigate whether articles with a historical focus\nhave decreased over time due to editorial pressures. Our analysis suggests no\nsignificant decline in such articles. Finally, we propose ideas for extending\nthis research to other Latin American journals and suggest improvements for\nnatural language processing workflows in non-English languages.\n","authors":["Juan R. Loaiza","Miguel González-Duque"],"pdf_url":"https://arxiv.org/pdf/2412.04236v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04233v1","updated":"2024-12-05T15:09:51Z","published":"2024-12-05T15:09:51Z","title":"HyperMARL: Adaptive Hypernetworks for Multi-Agent RL","summary":" Balancing individual specialisation and shared behaviours is a critical\nchallenge in multi-agent reinforcement learning (MARL). Existing methods\ntypically focus on encouraging diversity or leveraging shared representations.\nFull parameter sharing (FuPS) improves sample efficiency but struggles to learn\ndiverse behaviours when required, while no parameter sharing (NoPS) enables\ndiversity but is computationally expensive and sample inefficient. To address\nthese challenges, we introduce HyperMARL, a novel approach using hypernetworks\nto balance efficiency and specialisation. HyperMARL generates agent-specific\nactor and critic parameters, enabling agents to adaptively exhibit diverse or\nhomogeneous behaviours as needed, without modifying the learning objective or\nrequiring prior knowledge of the optimal diversity. Furthermore, HyperMARL\ndecouples agent-specific and state-based gradients, which empirically\ncorrelates with reduced policy gradient variance, potentially offering insights\ninto its ability to capture diverse behaviours. 
Across MARL benchmarks\nrequiring homogeneous, heterogeneous, or mixed behaviours, HyperMARL\nconsistently matches or outperforms FuPS, NoPS, and diversity-focused methods,\nachieving NoPS-level diversity with a shared architecture. These results\nhighlight the potential of hypernetworks as a versatile approach to the\ntrade-off between specialisation and shared behaviours in MARL.\n","authors":["Kale-ab Abebe Tessera","Arrasy Rahman","Stefano V. Albrecht"],"pdf_url":"https://arxiv.org/pdf/2412.04233v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.05357v2","updated":"2024-12-05T15:08:56Z","published":"2024-10-07T15:55:55Z","title":"Model-GLUE: Democratized LLM Scaling for A Large Model Zoo in the Wild","summary":" As Large Language Models (LLMs) excel across tasks and specialized domains,\nscaling LLMs based on existing models has garnered significant attention, which\nfaces the challenge of decreasing performance when combining disparate models.\nVarious techniques have been proposed for the aggregation of pre-trained LLMs,\nincluding model merging, Mixture-of-Experts, and stacking. Despite their\nmerits, a comprehensive comparison and synergistic application of them to a\ndiverse model zoo is yet to be adequately addressed. In light of this research\ngap, this paper introduces Model-GLUE, a holistic LLM scaling guideline. First,\nour work starts with a benchmarking of existing LLM scaling techniques,\nespecially selective merging, and variants of mixture. Utilizing the insights\nfrom the benchmark results, we formulate an optimal strategy for the selection\nand aggregation of a heterogeneous model zoo characterizing different\narchitectures and initialization.Our methodology involves the clustering of\nmergeable models and optimal merging strategy selection, and the integration of\nclusters through a model mixture. 
Finally, evidenced by our experiments on a\ndiverse Llama-2-based model zoo, Model-GLUE shows an average performance\nenhancement of 5.61%, achieved without additional training. Codes are available\nat: https://github.com/Model-GLUE/Model-GLUE.\n","authors":["Xinyu Zhao","Guoheng Sun","Ruisi Cai","Yukun Zhou","Pingzhi Li","Peihao Wang","Bowen Tan","Yexiao He","Li Chen","Yi Liang","Beidi Chen","Binhang Yuan","Hongyi Wang","Ang Li","Zhangyang Wang","Tianlong Chen"],"pdf_url":"https://arxiv.org/pdf/2410.05357v2.pdf","comment":"24 pages, 4 figures, accepted to NeurIPS 2024 Datasets and Benchmarks\n Track"},{"id":"http://arxiv.org/abs/2412.04227v1","updated":"2024-12-05T15:05:25Z","published":"2024-12-05T15:05:25Z","title":"Foundations of the Theory of Performance-Based Ranking","summary":" Ranking entities such as algorithms, devices, methods, or models based on\ntheir performances, while accounting for application-specific preferences, is a\nchallenge. To address this challenge, we establish the foundations of a\nuniversal theory for performance-based ranking. First, we introduce a rigorous\nframework built on top of both the probability and order theories. Our new\nframework encompasses the elements necessary to (1) manipulate performances as\nmathematical objects, (2) express which performances are worse than or\nequivalent to others, (3) model tasks through a variable called satisfaction,\n(4) consider properties of the evaluation, (5) define scores, and (6) specify\napplication-specific preferences through a variable called importance. On top\nof this framework, we propose the first axiomatic definition of performance\norderings and performance-based rankings. Then, we introduce a universal\nparametric family of scores, called ranking scores, that can be used to\nestablish rankings satisfying our axioms, while considering\napplication-specific preferences. 
Finally, we show, in the case of two-class\nclassification, that the family of ranking scores encompasses well-known\nperformance scores, including the accuracy, the true positive rate (recall,\nsensitivity), the true negative rate (specificity), the positive predictive\nvalue (precision), and F1. However, we also show that some other scores\ncommonly used to compare classifiers are unsuitable to derive performance\norderings satisfying the axioms. Therefore, this paper provides the computer\nvision and machine learning communities with a rigorous framework for\nevaluating and ranking entities.\n","authors":["Sébastien Piérard","Anaïs Halin","Anthony Cioppa","Adrien Deliège","Marc Van Droogenbroeck"],"pdf_url":"https://arxiv.org/pdf/2412.04227v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03441v2","updated":"2024-12-05T15:03:26Z","published":"2024-12-04T16:30:03Z","title":"PBP: Post-training Backdoor Purification for Malware Classifiers","summary":" In recent years, the rise of machine learning (ML) in cybersecurity has\nbrought new challenges, including the increasing threat of backdoor poisoning\nattacks on ML malware classifiers. For instance, adversaries could inject\nmalicious samples into public malware repositories, contaminating the training\ndata and potentially misclassifying malware by the ML model. Current\ncountermeasures predominantly focus on detecting poisoned samples by leveraging\ndisagreements within the outputs of a diverse set of ensemble models on\ntraining data points. However, these methods are not suitable for scenarios\nwhere Machine Learning-as-a-Service (MLaaS) is used or when users aim to remove\nbackdoors from a model after it has been trained. Addressing this scenario, we\nintroduce PBP, a post-training defense for malware classifiers that mitigates\nvarious types of backdoor embeddings without assuming any specific backdoor\nembedding mechanism. 
Our method exploits the influence of backdoor attacks on\nthe activation distribution of neural networks, independent of the\ntrigger-embedding method. In the presence of a backdoor attack, the activation\ndistribution of each layer is distorted into a mixture of distributions. By\nregulating the statistics of the batch normalization layers, we can guide a\nbackdoored model to perform similarly to a clean one. Our method demonstrates\nsubstantial advantages over several state-of-the-art methods, as evidenced by\nexperiments on two datasets, two types of backdoor methods, and various attack\nconfigurations. Notably, our approach requires only a small portion of the\ntraining data -- only 1\\% -- to purify the backdoor and reduce the attack\nsuccess rate from 100\\% to almost 0\\%, a 100-fold improvement over the baseline\nmethods. Our code is available at\n\\url{https://github.com/judydnguyen/pbp-backdoor-purification-official}.\n","authors":["Dung Thuy Nguyen","Ngoc N. Tran","Taylor T. Johnson","Kevin Leach"],"pdf_url":"https://arxiv.org/pdf/2412.03441v2.pdf","comment":"Accepted at NDSS 2025"},{"id":"http://arxiv.org/abs/2410.03960v2","updated":"2024-12-05T14:56:56Z","published":"2024-10-04T22:45:26Z","title":"SwiftKV: Fast Prefill-Optimized Inference with Knowledge-Preserving\n Model Transformation","summary":" LLM inference for popular enterprise use cases, such as summarization, RAG,\nand code-generation, typically observes orders of magnitude longer prompt\nlengths than generation lengths. This characteristic leads to high cost of\nprefill and increased response latency. In this paper, we present SwiftKV, a\nnovel model transformation and distillation procedure specifically designed to\nreduce the time and cost of processing prompt tokens while preserving high\nquality of generated tokens. 
SwiftKV combines three key mechanisms: i)\nSingleInputKV, which prefills later layers' KV cache using a much earlier\nlayer's output, allowing prompt tokens to skip much of the model computation,\nii) AcrossKV, which merges the KV caches of neighboring layers to reduce the\nmemory footprint and support larger batch size for higher throughput, and iii)\na knowledge-preserving distillation procedure that can adapt existing LLMs for\nSwiftKV with minimal accuracy impact and low compute and data requirement. For\nLlama-3.1-8B and 70B, SwiftKV reduces the compute requirement of prefill by 50%\nand the memory requirement of the KV cache by 62.5% while incurring minimum\nquality degradation across a wide range of tasks. In the end-to-end inference\nserving using an optimized vLLM implementation, SwiftKV realizes up to 2x\nhigher aggregate throughput and 60% lower time per output token. It can achieve\na staggering 560 TFlops/GPU of normalized inference throughput, which\ntranslates to 16K tokens/s for Llama-3.1-70B in 16-bit precision on 4x H100\nGPUs. Our training, inference, and model implementations are open-sourced and\ncan be found through\nhttps://huggingface.co/collections/Snowflake/swiftkv-models-674f7d7474eb789e185d31cb.\n","authors":["Aurick Qiao","Zhewei Yao","Samyam Rajbhandari","Yuxiong He"],"pdf_url":"https://arxiv.org/pdf/2410.03960v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.06740v4","updated":"2024-12-05T14:56:30Z","published":"2024-11-11T06:25:13Z","title":"Dockformer: A transformer-based molecular docking paradigm for\n large-scale virtual screening","summary":" Molecular docking is a crucial step in drug development, which enables the\nvirtual screening of compound libraries to identify potential ligands that\ntarget proteins of interest. However, the computational complexity of\ntraditional docking models increases as the size of the compound library\nincreases. 
Recently, deep learning algorithms can provide data-driven research\nand development models to increase the speed of the docking process.\nUnfortunately, few models can achieve superior screening performance compared\nto that of traditional models. Therefore, a novel deep learning-based docking\napproach named Dockformer is introduced in this study. Dockformer leverages\nmultimodal information to capture the geometric topology and structural\nknowledge of molecules and can directly generate binding conformations with the\ncorresponding confidence measures in an end-to-end manner. The experimental\nresults show that Dockformer achieves success rates of 90.53% and 82.71% on the\nPDBbind core set and PoseBusters benchmarks, respectively, and more than a\n100-fold increase in the inference process speed, outperforming almost all\nstate-of-the-art docking methods. In addition, the ability of Dockformer to\nidentify the main protease inhibitors of coronaviruses is demonstrated in a\nreal-world virtual screening scenario. Considering its high docking accuracy\nand screening efficiency, Dockformer can be regarded as a powerful and robust\ntool in the field of drug design.\n","authors":["Zhangfan Yang","Junkai Ji","Shan He","Jianqiang Li","Tiantian He","Ruibin Bai","Zexuan Zhu","Yew Soon Ong"],"pdf_url":"https://arxiv.org/pdf/2411.06740v4.pdf","comment":"15 pages, 10 figures"},{"id":"http://arxiv.org/abs/2412.04213v1","updated":"2024-12-05T14:47:38Z","published":"2024-12-05T14:47:38Z","title":"Physics-informed Deep Learning for Muscle Force Prediction with\n Unlabeled sEMG Signals","summary":" Computational biomechanical analysis plays a pivotal role in understanding\nand improving human movements and physical functions. Although physics-based\nmodeling methods can interpret the dynamic interaction between the neural drive\nto muscle dynamics and joint kinematics, they suffer from high computational\nlatency. 
In recent years, data-driven methods have emerged as a promising\nalternative due to their fast execution speed, but label information is still\nrequired during training, which is not easy to acquire in practice. To tackle\nthese issues, this paper presents a novel physics-informed deep learning method\nto predict muscle forces without any label information during model training.\nIn addition, the proposed method could also identify personalized muscle-tendon\nparameters. To achieve this, the Hill muscle model-based forward dynamics is\nembedded into the deep neural network as the additional loss to further\nregulate the behavior of the deep neural network. Experimental validations on\nthe wrist joint from six healthy subjects are performed, and a fully connected\nneural network (FNN) is selected to implement the proposed method. The\npredicted results of muscle forces show comparable or even lower root mean\nsquare error (RMSE) and higher coefficient of determination compared with\nbaseline methods, which have to use the labeled surface electromyography (sEMG)\nsignals, and it can also identify muscle-tendon parameters accurately,\ndemonstrating the effectiveness of the proposed physics-informed deep learning\nmethod.\n","authors":["Shuhao Ma","Jie Zhang","Chaoyang Shi","Pei Di","Ian D. Robertson","Zhi-Qiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.04213v1.pdf","comment":"11pages, 8 figures, journal"},{"id":"http://arxiv.org/abs/2410.19725v2","updated":"2024-12-05T14:34:08Z","published":"2024-10-25T17:52:53Z","title":"On the Benefits of Active Data Collection in Operator Learning","summary":" We investigate active data collection strategies for operator learning when\nthe target operator is linear and the input functions are drawn from a\nmean-zero stochastic process with continuous covariance kernels. With an active\ndata collection strategy, we establish an error convergence rate in terms of\nthe decay rate of the eigenvalues of the covariance kernel. 
Thus, with\nsufficiently rapid eigenvalue decay of the covariance kernels, arbitrarily fast\nerror convergence rates can be achieved. This contrasts with the passive\n(i.i.d.) data collection strategies, where the convergence rate is never faster\nthan $\\sim n^{-1}$. In fact, for our setting, we establish a\n\\emph{non-vanishing} lower bound for any passive data collection strategy,\nregardless of the eigenvalues decay rate of the covariance kernel. Overall, our\nresults show the benefit of active over passive data collection strategies in\noperator learning.\n","authors":["Unique Subedi","Ambuj Tewari"],"pdf_url":"https://arxiv.org/pdf/2410.19725v2.pdf","comment":"Added experiments"},{"id":"http://arxiv.org/abs/2403.10182v4","updated":"2024-12-05T14:30:41Z","published":"2024-03-15T10:38:48Z","title":"Fast and reliable uncertainty quantification with neural network\n ensembles for industrial image classification","summary":" Image classification with neural networks (NNs) is widely used in industrial\nprocesses, situations where the model likely encounters unknown objects during\ndeployment, i.e., out-of-distribution (OOD) data. Worryingly, NNs tend to make\nconfident yet incorrect predictions when confronted with OOD data. To increase\nthe models' reliability, they should quantify the uncertainty in their own\npredictions, communicating when the output should (not) be trusted. Deep\nensembles, composed of multiple independent NNs, have been shown to perform\nstrongly but are computationally expensive. Recent research has proposed more\nefficient NN ensembles, namely the snapshot, batch, and multi-input\nmulti-output ensemble. This study investigates the predictive and uncertainty\nperformance of efficient NN ensembles in the context of image classification\nfor industrial processes. 
It is the first to provide a comprehensive comparison\nand it proposes a novel Diversity Quality metric to quantify the ensembles'\nperformance on the in-distribution and OOD sets in one single metric. The\nresults highlight the batch ensemble as a cost-effective and competitive\nalternative to the deep ensemble. It matches the deep ensemble in both\nuncertainty and accuracy while exhibiting considerable savings in training\ntime, test time, and memory storage.\n","authors":["Arthur Thuy","Dries F. Benoit"],"pdf_url":"https://arxiv.org/pdf/2403.10182v4.pdf","comment":"Submitted to Annals of Operations Research"},{"id":"http://arxiv.org/abs/2412.04190v1","updated":"2024-12-05T14:30:18Z","published":"2024-12-05T14:30:18Z","title":"Directed Structural Adaptation to Overcome Statistical Conflicts and\n Enable Continual Learning","summary":" Adaptive networks today rely on overparameterized fixed topologies that\ncannot break through the statistical conflicts they encounter in the data they\nare exposed to, and are prone to \"catastrophic forgetting\" as the network\nattempts to reuse the existing structures to learn new task. We propose a\nstructural adaptation method, DIRAD, that can complexify as needed and in a\ndirected manner without being limited by statistical conflicts within a\ndataset. We then extend this method and present the PREVAL framework, designed\nto prevent \"catastrophic forgetting\" in continual learning by detection of new\ndata and assigning encountered data to suitable models adapted to process them,\nwithout needing task labels anywhere in the workflow. 
We show the reliability\nof the DIRAD in growing a network with high performance and orders-of-magnitude\nsimpler than fixed topology networks; and demonstrate the proof-of-concept\noperation of PREVAL, in which continual adaptation to new tasks is observed\nwhile being able to detect and discern previously-encountered tasks.\n","authors":["Zeki Doruk Erden","Boi Faltings"],"pdf_url":"https://arxiv.org/pdf/2412.04190v1.pdf","comment":"Presented in Deployable AI (DAI) workshop at AAAI-2024"},{"id":"http://arxiv.org/abs/2409.17146v2","updated":"2024-12-05T14:28:40Z","published":"2024-09-25T17:59:51Z","title":"Molmo and PixMo: Open Weights and Open Data for State-of-the-Art\n Vision-Language Models","summary":" Today's most advanced vision-language models (VLMs) remain proprietary. The\nstrongest open-weight models rely heavily on synthetic data from proprietary\nVLMs to achieve good performance, effectively distilling these closed VLMs into\nopen ones. As a result, the community has been missing foundational knowledge\nabout how to build performant VLMs from scratch. We present Molmo, a new family\nof VLMs that are state-of-the-art in their class of openness. Our key\ncontribution is a collection of new datasets called PixMo, including a dataset\nof highly detailed image captions for pre-training, a free-form image Q&A\ndataset for fine-tuning, and an innovative 2D pointing dataset, all collected\nwithout the use of external VLMs. The success of our approach relies on careful\nmodeling choices, a well-tuned training pipeline, and, most critically, the\nquality of our newly collected datasets. Our best-in-class 72B model not only\noutperforms others in the class of open weight and data models, but also\noutperforms larger proprietary models including Claude 3.5 Sonnet, and Gemini\n1.5 Pro and Flash, second only to GPT-4o based on both academic benchmarks and\non a large human evaluation. 
Our model weights, new datasets, and source code\nare available at https://molmo.allenai.org/blog.\n","authors":["Matt Deitke","Christopher Clark","Sangho Lee","Rohun Tripathi","Yue Yang","Jae Sung Park","Mohammadreza Salehi","Niklas Muennighoff","Kyle Lo","Luca Soldaini","Jiasen Lu","Taira Anderson","Erin Bransom","Kiana Ehsani","Huong Ngo","YenSung Chen","Ajay Patel","Mark Yatskar","Chris Callison-Burch","Andrew Head","Rose Hendrix","Favyen Bastani","Eli VanderBilt","Nathan Lambert","Yvonne Chou","Arnavi Chheda","Jenna Sparks","Sam Skjonsberg","Michael Schmitz","Aaron Sarnat","Byron Bischoff","Pete Walsh","Chris Newell","Piper Wolters","Tanmay Gupta","Kuo-Hao Zeng","Jon Borchardt","Dirk Groeneveld","Crystal Nam","Sophie Lebrecht","Caitlin Wittlif","Carissa Schoenick","Oscar Michel","Ranjay Krishna","Luca Weihs","Noah A. Smith","Hannaneh Hajishirzi","Ross Girshick","Ali Farhadi","Aniruddha Kembhavi"],"pdf_url":"https://arxiv.org/pdf/2409.17146v2.pdf","comment":"Updated with ablations and more technical details"},{"id":"http://arxiv.org/abs/2412.04183v1","updated":"2024-12-05T14:21:18Z","published":"2024-12-05T14:21:18Z","title":"Linear Discriminant Analysis in Credit Scoring: A Transparent Hybrid\n Model Approach","summary":" The development of computing has made credit scoring approaches possible,\nwith various machine learning (ML) and deep learning (DL) techniques becoming\nmore and more valuable. While complex models yield more accurate predictions,\ntheir interpretability is often weakened, which is a concern for credit scoring\nthat places importance on decision fairness. As features of the dataset are a\ncrucial factor for the credit scoring system, we implement Linear Discriminant\nAnalysis (LDA) as a feature reduction technique, which reduces the burden of\nthe models complexity. We compared 6 different machine learning models, 1 deep\nlearning model, and a hybrid model with and without using LDA. 
From the result,\nwe have found our hybrid model, XG-DNN, outperformed other models with the\nhighest accuracy of 99.45% and a 99% F1 score with LDA. Lastly, to interpret\nmodel decisions, we have applied 2 different explainable AI techniques named\nLIME (local) and Morris Sensitivity Analysis (global). Through this research,\nwe showed how feature reduction techniques can be used without affecting the\nperformance and explainability of the model, which can be very useful in\nresource-constrained settings to optimize the computational workload.\n","authors":["Md Shihab Reza","Monirul Islam Mahmud","Ifti Azad Abeer","Nova Ahmed"],"pdf_url":"https://arxiv.org/pdf/2412.04183v1.pdf","comment":"Accepted on International Conference on Computer and Information\n Technology (ICCIT) 2024"},{"id":"http://arxiv.org/abs/2412.04180v1","updated":"2024-12-05T14:19:59Z","published":"2024-12-05T14:19:59Z","title":"SKIM: Any-bit Quantization Pushing The Limits of Post-Training\n Quantization","summary":" Large Language Models (LLMs) exhibit impressive performance across various\ntasks, but deploying them for inference poses challenges. Their high resource\ndemands often necessitate complex, costly multi-GPU pipelines, or the use of\nsmaller, less capable models. While quantization offers a promising solution\nutilizing lower precision for model storage, existing methods frequently\nexperience significant performance drops at lower precision levels.\nAdditionally, they typically provide only a limited set of solutions at\nspecific bit levels, many of which are extensively manually tuned. To address\nthese challenges, we propose a new method called SKIM: Scaled K-means\nclustering wIth Mixed precision. Our approach introduces two novel techniques:\n1. A greedy algorithm to solve approximately optimal bit allocation across\nweight channels, and 2. A trainable scaling vector for non-differentiable\nK-means clustering. 
These techniques substantially improve performance and can\nbe adapted to any given bit. Notably, in terms of model perplexity, our method\nnarrows the gap between 3-bit quantized LLaMA models and their full precision\ncounterparts by 16.3% on average.\n","authors":["Runsheng Bai","Qiang Liu","Bo Liu"],"pdf_url":"https://arxiv.org/pdf/2412.04180v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04178v1","updated":"2024-12-05T14:18:50Z","published":"2024-12-05T14:18:50Z","title":"Multi-Layer Privacy-Preserving Record Linkage with Clerical Review based\n on gradual information disclosure","summary":" Privacy-Preserving Record linkage (PPRL) is an essential component in data\nintegration tasks of sensitive information. The linkage quality determines the\nusability of combined datasets and (machine learning) applications based on\nthem. We present a novel privacy-preserving protocol that integrates clerical\nreview in PPRL using a multi-layer active learning process. Uncertain match\ncandidates are reviewed on several layers by human and non-human oracles to\nreduce the amount of disclosed information per record and in total. Predictions\nare propagated back to update previous layers, resulting in an improved linkage\nperformance for non-reviewed candidates as well. The data owners remain in\ncontrol of the amount of information they share for each record. Therefore, our\napproach follows need-to-know and data sovereignty principles. 
The experimental\nevaluation on real-world datasets shows considerable linkage quality\nimprovements with limited labeling effort and privacy risks.\n","authors":["Florens Rohde","Victor Christen","Martin Franke","Erhard Rahm"],"pdf_url":"https://arxiv.org/pdf/2412.04178v1.pdf","comment":"Accepted at 21st Conference on Database Systems for Business,\n Technology and Web (BTW)"},{"id":"http://arxiv.org/abs/2412.04177v1","updated":"2024-12-05T14:17:16Z","published":"2024-12-05T14:17:16Z","title":"Fixed-Mean Gaussian Processes for Post-hoc Bayesian Deep Learning","summary":" Recently, there has been an increasing interest in performing post-hoc\nuncertainty estimation about the predictions of pre-trained deep neural\nnetworks (DNNs). Given a pre-trained DNN via back-propagation, these methods\nenhance the original network by adding output confidence measures, such as\nerror bars, without compromising its initial accuracy. In this context, we\nintroduce a novel family of sparse variational Gaussian processes (GPs), where\nthe posterior mean is fixed to any continuous function when using a universal\nkernel. Specifically, we fix the mean of this GP to the output of the\npre-trained DNN, allowing our approach to effectively fit the GP's predictive\nvariances to estimate the DNN prediction uncertainty. Our approach leverages\nvariational inference (VI) for efficient stochastic optimization, with training\ncosts that remain independent of the number of training points, scaling\nefficiently to large datasets such as ImageNet. The proposed method, called\nfixed mean GP (FMGP), is architecture-agnostic, relying solely on the\npre-trained model's outputs to adjust the predictive variances. Experimental\nresults demonstrate that FMGP improves both uncertainty estimation and\ncomputational efficiency when compared to state-of-the-art methods.\n","authors":["Luis A. 
Ortega","Simón Rodríguez-Santana","Daniel Hernández-Lobato"],"pdf_url":"https://arxiv.org/pdf/2412.04177v1.pdf","comment":"12 pages, 6 figures and 2 tables. Submitted to IEEE TRANSACTIONS ON\n PATTERN ANALYSIS AND MACHINE INTELLIGENCE"},{"id":"http://arxiv.org/abs/2411.16105v2","updated":"2024-12-05T14:16:57Z","published":"2024-11-25T05:32:34Z","title":"Adaptive Circuit Behavior and Generalization in Mechanistic\n Interpretability","summary":" Mechanistic interpretability aims to understand the inner workings of large\nneural networks by identifying circuits, or minimal subgraphs within the model\nthat implement algorithms responsible for performing specific tasks. These\ncircuits are typically discovered and analyzed using a narrowly defined prompt\nformat. However, given the abilities of large language models (LLMs) to\ngeneralize across various prompt formats for the same task, it remains unclear\nhow well these circuits generalize. For instance, it is unclear whether the\nmodels generalization results from reusing the same circuit components, the\ncomponents behaving differently, or the use of entirely different components.\nIn this paper, we investigate the generality of the indirect object\nidentification (IOI) circuit in GPT-2 small, which is well-studied and believed\nto implement a simple, interpretable algorithm. We evaluate its performance on\nprompt variants that challenge the assumptions of this algorithm. Our findings\nreveal that the circuit generalizes surprisingly well, reusing all of its\ncomponents and mechanisms while only adding additional input edges. Notably,\nthe circuit generalizes even to prompt variants where the original algorithm\nshould fail; we discover a mechanism that explains this which we term S2\nHacking. 
Our findings indicate that circuits within LLMs may be more flexible\nand general than previously recognized, underscoring the importance of studying\ncircuit generalization to better understand the broader capabilities of these\nmodels.\n","authors":["Jatin Nainani","Sankaran Vaidyanathan","AJ Yeung","Kartik Gupta","David Jensen"],"pdf_url":"https://arxiv.org/pdf/2411.16105v2.pdf","comment":"10 pages, 8 figures"},{"id":"http://arxiv.org/abs/2412.04166v1","updated":"2024-12-05T14:03:16Z","published":"2024-12-05T14:03:16Z","title":"An In-Depth Examination of Risk Assessment in Multi-Class Classification\n Algorithms","summary":" Advanced classification algorithms are being increasingly used in\nsafety-critical applications like health-care, engineering, etc. In such\napplications, miss-classifications made by ML algorithms can result in\nsubstantial financial or health-related losses. To better anticipate and\nprepare for such losses, the algorithm user seeks an estimate for the\nprobability that the algorithm miss-classifies a sample. We refer to this task\nas the risk-assessment. For a variety of models and datasets, we numerically\nanalyze the performance of different methods in solving the risk-assessment\nproblem. We consider two solution strategies: a) calibration techniques that\ncalibrate the output probabilities of classification models to provide accurate\nprobability outputs; and b) a novel approach based upon the prediction interval\ngeneration technique of conformal prediction. Our conformal prediction based\napproach is model and data-distribution agnostic, simple to implement, and\nprovides reasonable results for a variety of use-cases. 
We compare the\ndifferent methods on a broad variety of models and datasets.\n","authors":["Disha Ghandwani","Neeraj Sarna","Yuanyuan Li","Yang Lin"],"pdf_url":"https://arxiv.org/pdf/2412.04166v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04163v1","updated":"2024-12-05T13:54:53Z","published":"2024-12-05T13:54:53Z","title":"On the Lack of Robustness of Binary Function Similarity Systems","summary":" Binary function similarity, which often relies on learning-based algorithms\nto identify what functions in a pool are most similar to a given query\nfunction, is a sought-after topic in different communities, including machine\nlearning, software engineering, and security. Its importance stems from the\nimpact it has in facilitating several crucial tasks, from reverse engineering\nand malware analysis to automated vulnerability detection. Whereas recent work\ncast light around performance on this long-studied problem, the research\nlandscape remains largely lackluster in understanding the resiliency of the\nstate-of-the-art machine learning models against adversarial attacks. As\nsecurity requires to reason about adversaries, in this work we assess the\nrobustness of such models through a simple yet effective black-box greedy\nattack, which modifies the topology and the content of the control flow of the\nattacked functions. 
We demonstrate that this attack is successful in\ncompromising all the models, achieving average attack success rates of 57.06%\nand 95.81% depending on the problem settings (targeted and untargeted attacks).\nOur findings are insightful: top performance on clean data does not necessarily\nrelate to top robustness properties, which explicitly highlights\nperformance-robustness trade-offs one should consider when deploying such\nmodels, calling for further research.\n","authors":["Gianluca Capozzi","Tong Tang","Jie Wan","Ziqi Yang","Daniele Cono D'Elia","Giuseppe Antonio Di Luna","Lorenzo Cavallaro","Leonardo Querzoni"],"pdf_url":"https://arxiv.org/pdf/2412.04163v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2109.06181v2","updated":"2024-12-05T13:50:59Z","published":"2021-09-13T16:06:10Z","title":"When Stability meets Sufficiency: Informative Explanations that do not\n Overwhelm","summary":" Recent studies evaluating various criteria for explainable artificial\nintelligence (XAI) suggest that fidelity, stability, and comprehensibility are\namong the most important metrics considered by users of AI across a diverse\ncollection of usage contexts. We consider these criteria as applied to\nfeature-based attribution methods, which are amongst the most prevalent in XAI\nliterature. Going beyond standard correlation, methods have been proposed that\nhighlight what should be minimally sufficient to justify the classification of\nan input (viz. pertinent positives). 
While minimal sufficiency is an attractive\nproperty akin to comprehensibility, the resulting explanations are often too\nsparse for a human to understand and evaluate the local behavior of the model.\nTo overcome these limitations, we incorporate the criteria of stability and\nfidelity and propose a novel method called Path-Sufficient Explanations Method\n(PSEM) that outputs a sequence of stable and sufficient explanations for a\ngiven input of strictly decreasing size (or value) -- from original input to a\nminimally sufficient explanation -- which can be thought to trace the local\nboundary of the model in a stable manner, thus providing better intuition about\nthe local model behavior for the specific input. We validate these claims, both\nqualitatively and quantitatively, with experiments that show the benefit of\nPSEM across three modalities (image, tabular and text) as well as versus other\npath explanations. A user study depicts the strength of the method in\ncommunicating the local behavior, where (many) users are able to correctly\ndetermine the prediction made by a model.\n","authors":["Ronny Luss","Amit Dhurandhar"],"pdf_url":"https://arxiv.org/pdf/2109.06181v2.pdf","comment":"Published at TMLR"},{"id":"http://arxiv.org/abs/2412.04158v1","updated":"2024-12-05T13:46:55Z","published":"2024-12-05T13:46:55Z","title":"LossVal: Efficient Data Valuation for Neural Networks","summary":" Assessing the importance of individual training samples is a key challenge in\nmachine learning. Traditional approaches retrain models with and without\nspecific samples, which is computationally expensive and ignores dependencies\nbetween data points. We introduce LossVal, an efficient data valuation method\nthat computes importance scores during neural network training by embedding a\nself-weighting mechanism into loss functions like cross-entropy and mean\nsquared error. LossVal reduces computational costs, making it suitable for\nlarge datasets and practical applications. 
Experiments on classification and\nregression tasks across multiple datasets show that LossVal effectively\nidentifies noisy samples and is able to distinguish helpful from harmful\nsamples. We examine the gradient calculation of LossVal to highlight its\nadvantages. The source code is available at:\nhttps://github.com/twibiral/LossVal\n","authors":["Tim Wibiral","Mohamed Karim Belaid","Maximilian Rabus","Ansgar Scherp"],"pdf_url":"https://arxiv.org/pdf/2412.04158v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04157v1","updated":"2024-12-05T13:45:35Z","published":"2024-12-05T13:45:35Z","title":"Non-Asymptotic Bounds for Closed-Loop Identification of Unstable\n Nonlinear Stochastic Systems","summary":" We consider the problem of least squares parameter estimation from\nsingle-trajectory data for discrete-time, unstable, closed-loop nonlinear\nstochastic systems, with linearly parameterised uncertainty. Assuming a region\nof the state space produces informative data, and the system is\nsub-exponentially unstable, we establish non-asymptotic guarantees on the\nestimation error at times where the state trajectory evolves in this region. If\nthe whole state space is informative, high probability guarantees on the error\nhold for all times. Examples are provided where our results are useful for\nanalysis, but existing results are not.\n","authors":["Seth Siriya","Jingge Zhu","Dragan Nešić","Ye Pu"],"pdf_url":"https://arxiv.org/pdf/2412.04157v1.pdf","comment":"21 pages, 2 figures"},{"id":"http://arxiv.org/abs/2407.17449v3","updated":"2024-12-05T13:37:51Z","published":"2024-07-24T17:30:21Z","title":"Looking at Model Debiasing through the Lens of Anomaly Detection","summary":" It is widely recognized that deep neural networks are sensitive to bias in\nthe data. This means that during training these models are likely to learn\nspurious correlations between data and labels, resulting in limited\ngeneralization abilities and low performance. 
In this context, model debiasing\napproaches can be devised aiming at reducing the model's dependency on such\nunwanted correlations, either leveraging the knowledge of bias information or\nnot. In this work, we focus on the latter and more realistic scenario, showing\nthe importance of accurately predicting the bias-conflicting and bias-aligned\nsamples to obtain compelling performance in bias mitigation. On this ground, we\npropose to conceive the problem of model bias from an out-of-distribution\nperspective, introducing a new bias identification method based on anomaly\ndetection. We claim that when data is mostly biased, bias-conflicting samples\ncan be regarded as outliers with respect to the bias-aligned distribution in\nthe feature space of a biased model, thus allowing for precisely detecting them\nwith an anomaly detection method. Coupling the proposed bias identification\napproach with bias-conflicting data upsampling and augmentation in a two-step\nstrategy, we reach state-of-the-art performance on synthetic and real benchmark\ndatasets. Ultimately, our proposed approach shows that the data bias issue does\nnot necessarily require complex debiasing methods, given that an accurate bias\nidentification procedure is defined. Source code is available at\nhttps://github.com/Malga-Vision/MoDAD\n","authors":["Vito Paolo Pastore","Massimiliano Ciranni","Davide Marinelli","Francesca Odone","Vittorio Murino"],"pdf_url":"https://arxiv.org/pdf/2407.17449v3.pdf","comment":"13 pages, 8 figures; Accepted at IEEE/CVF Winter Conference on\n Applications of Computer Vision (WACV) 2025"},{"id":"http://arxiv.org/abs/2407.16940v2","updated":"2024-12-05T13:30:16Z","published":"2024-07-24T02:20:29Z","title":"GV-Rep: A Large-Scale Dataset for Genetic Variant Representation\n Learning","summary":" Genetic variants (GVs) are defined as differences in the DNA sequences among\nindividuals and play a crucial role in diagnosing and treating genetic\ndiseases. 
The rapid decrease in next generation sequencing cost has led to an\nexponential increase in patient-level GV data. This growth poses a challenge\nfor clinicians who must efficiently prioritize patient-specific GVs and\nintegrate them with existing genomic databases to inform patient management. To\naddressing the interpretation of GVs, genomic foundation models (GFMs) have\nemerged. However, these models lack standardized performance assessments,\nleading to considerable variability in model evaluations. This poses the\nquestion: How effectively do deep learning methods classify unknown GVs and\nalign them with clinically-verified GVs? We argue that representation learning,\nwhich transforms raw data into meaningful feature spaces, is an effective\napproach for addressing both indexing and classification challenges. We\nintroduce a large-scale Genetic Variant dataset, named GV-Rep, featuring\nvariable-length contexts and detailed annotations, designed for deep learning\nmodels to learn GV representations across various traits, diseases, tissue\ntypes, and experimental contexts. Our contributions are three-fold: (i)\nConstruction of a comprehensive dataset with 7 million records, each labeled\nwith characteristics of the corresponding variants, alongside additional data\nfrom 17,548 gene knockout tests across 1,107 cell types, 1,808 variant\ncombinations, and 156 unique clinically verified GVs from real-world patients.\n(ii) Analysis of the structure and properties of the dataset. (iii)\nExperimentation of the dataset with pre-trained GFMs. The results show a\nsignificant gap between GFMs current capabilities and accurate GV\nrepresentation. 
We hope this dataset will help advance genomic deep learning to\nbridge this gap.\n","authors":["Zehui Li","Vallijah Subasri","Guy-Bart Stan","Yiren Zhao","Bo Wang"],"pdf_url":"https://arxiv.org/pdf/2407.16940v2.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2412.03417v2","updated":"2024-12-05T13:22:28Z","published":"2024-12-04T15:53:45Z","title":"Learning Semantic Association Rules from Internet of Things Data","summary":" Association Rule Mining (ARM) is the task of discovering commonalities in\ndata in the form of logical implications. ARM is used in the Internet of Things\n(IoT) for different tasks including monitoring and decision-making. However,\nexisting methods give limited consideration to IoT-specific requirements such\nas heterogeneity and volume. Furthermore, they do not utilize important static\ndomain-specific description data about IoT systems, which is increasingly\nrepresented as knowledge graphs. In this paper, we propose a novel ARM pipeline\nfor IoT data that utilizes both dynamic sensor data and static IoT system\nmetadata. Furthermore, we propose an Autoencoder-based Neurosymbolic ARM method\n(Aerial) as part of the pipeline to address the high volume of IoT data and\nreduce the total number of rules that are resource-intensive to process. Aerial\nlearns a neural representation of a given data and extracts association rules\nfrom this representation by exploiting the reconstruction (decoding) mechanism\nof an autoencoder. 
Extensive evaluations on 3 IoT datasets from 2 domains show\nthat ARM on both static and dynamic IoT data results in more generically\napplicable rules while Aerial can learn a more concise set of high-quality\nassociation rules than the state-of-the-art with full coverage over the\ndatasets.\n","authors":["Erkan Karabulut","Paul Groth","Victoria Degeler"],"pdf_url":"https://arxiv.org/pdf/2412.03417v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04147v1","updated":"2024-12-05T13:19:34Z","published":"2024-12-05T13:19:34Z","title":"MultiTASC++: A Continuously Adaptive Scheduler for Edge-Based\n Multi-Device Cascade Inference","summary":" Cascade systems, consisting of a lightweight model processing all samples and\na heavier, high-accuracy model refining challenging samples, have become a\nwidely-adopted distributed inference approach to achieving high accuracy and\nmaintaining a low computational burden for mobile and IoT devices. As\nintelligent indoor environments, like smart homes, continue to expand, a new\nscenario emerges, the multi-device cascade. In this setting, multiple diverse\ndevices simultaneously utilize a shared heavy model hosted on a server, often\nsituated within or close to the consumer environment. This work introduces\nMultiTASC++, a continuously adaptive multi-tenancy-aware scheduler that\ndynamically controls the forwarding decision functions of devices to optimize\nsystem throughput while maintaining high accuracy and low latency. Through\nextensive experimentation in diverse device environments and with varying\nserver-side models, we demonstrate the scheduler's efficacy in consistently\nmaintaining a targeted satisfaction rate while providing the highest available\naccuracy across different device tiers and workloads of up to 100 devices. 
This\ndemonstrates its scalability and efficiency in addressing the unique challenges\nof collaborative DNN inference in dynamic and diverse IoT environments.\n","authors":["Sokratis Nikolaidis","Stylianos I. Venieris","Iakovos S. Venieris"],"pdf_url":"https://arxiv.org/pdf/2412.04147v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.14123v2","updated":"2024-12-05T13:15:34Z","published":"2024-02-21T20:43:49Z","title":"DeiSAM: Segment Anything with Deictic Prompting","summary":" Large-scale, pre-trained neural networks have demonstrated strong\ncapabilities in various tasks, including zero-shot image segmentation. To\nidentify concrete objects in complex scenes, humans instinctively rely on\ndeictic descriptions in natural language, i.e., referring to something\ndepending on the context such as \"The object that is on the desk and behind the\ncup.\". However, deep learning approaches cannot reliably interpret such deictic\nrepresentations due to their lack of reasoning capabilities in complex\nscenarios. To remedy this issue, we propose DeiSAM -- a combination of large\npre-trained neural networks with differentiable logic reasoners -- for deictic\npromptable segmentation. Given a complex, textual segmentation description,\nDeiSAM leverages Large Language Models (LLMs) to generate first-order logic\nrules and performs differentiable forward reasoning on generated scene graphs.\nSubsequently, DeiSAM segments objects by matching them to the logically\ninferred image regions. As part of our evaluation, we propose the Deictic\nVisual Genome (DeiVG) dataset, containing paired visual input and complex,\ndeictic textual prompts. 
Our empirical results demonstrate that DeiSAM is a\nsubstantial improvement over purely data-driven baselines for deictic\npromptable segmentation.\n","authors":["Hikaru Shindo","Manuel Brack","Gopika Sudhakaran","Devendra Singh Dhami","Patrick Schramowski","Kristian Kersting"],"pdf_url":"https://arxiv.org/pdf/2402.14123v2.pdf","comment":"Published as a conference paper at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2412.04140v1","updated":"2024-12-05T13:07:24Z","published":"2024-12-05T13:07:24Z","title":"Understanding Memorization in Generative Models via Sharpness in\n Probability Landscapes","summary":" In this paper, we introduce a geometric framework to analyze memorization in\ndiffusion models using the eigenvalues of the Hessian of the log probability\ndensity. We propose that memorization arises from isolated points in the\nlearned probability distribution, characterized by sharpness in the probability\nlandscape, as indicated by large negative eigenvalues of the Hessian. Through\nexperiments on various datasets, we demonstrate that these eigenvalues\neffectively detect and quantify memorization. Our approach provides a clear\nunderstanding of memorization in diffusion models and lays the groundwork for\ndeveloping strategies to ensure secure and reliable generative models\n","authors":["Dongjae Jeon","Dueun Kim","Albert No"],"pdf_url":"https://arxiv.org/pdf/2412.04140v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04137v1","updated":"2024-12-05T13:04:10Z","published":"2024-12-05T13:04:10Z","title":"Text Change Detection in Multilingual Documents Using Image Comparison","summary":" Document comparison typically relies on optical character recognition (OCR)\nas its core technology. However, OCR requires the selection of appropriate\nlanguage models for each document and the performance of multilingual or hybrid\nmodels remains limited. 
To overcome these challenges, we propose text change\ndetection (TCD) using an image comparison model tailored for multilingual\ndocuments. Unlike OCR-based approaches, our method employs word-level text\nimage-to-image comparison to detect changes. Our model generates bidirectional\nchange segmentation maps between the source and target documents. To enhance\nperformance without requiring explicit text alignment or scaling preprocessing,\nwe employ correlations among multi-scale attention features. We also construct\na benchmark dataset comprising actual printed and scanned word pairs in various\nlanguages to evaluate our model. We validate our approach using our benchmark\ndataset and public benchmarks Distorted Document Images and the LRDE Document\nBinarization Dataset. We compare our model against state-of-the-art semantic\nsegmentation and change detection models, as well as to conventional OCR-based\nmodels.\n","authors":["Doyoung Park","Naresh Reddy Yarram","Sunjin Kim","Minkyu Kim","Seongho Cho","Taehee Lee"],"pdf_url":"https://arxiv.org/pdf/2412.04137v1.pdf","comment":"15pages, 11figures 6tables, wacv2025 accepted"},{"id":"http://arxiv.org/abs/2405.13888v2","updated":"2024-12-05T13:03:55Z","published":"2024-05-22T18:00:41Z","title":"Marrying Causal Representation Learning with Dynamical Systems for\n Science","summary":" Causal representation learning promises to extend causal models to hidden\ncausal variables from raw entangled measurements. However, most progress has\nfocused on proving identifiability results in different settings, and we are\nnot aware of any successful real-world application. At the same time, the field\nof dynamical systems benefited from deep learning and scaled to countless\napplications but does not allow parameter identification. In this paper, we\ndraw a clear connection between the two and their key assumptions, allowing us\nto apply identifiable methods developed in causal representation learning to\ndynamical systems. 
At the same time, we can leverage scalable differentiable\nsolvers developed for differential equations to build models that are both\nidentifiable and practical. Overall, we learn explicitly controllable models\nthat isolate the trajectory-specific parameters for further downstream tasks\nsuch as out-of-distribution classification or treatment effect estimation. We\nexperiment with a wind simulator with partially known factors of variation. We\nalso apply the resulting model to real-world climate data and successfully\nanswer downstream causal questions in line with existing literature on climate\nchange.\n","authors":["Dingling Yao","Caroline Muller","Francesco Locatello"],"pdf_url":"https://arxiv.org/pdf/2405.13888v2.pdf","comment":"NeurIPS 2024 Camera Ready"},{"id":"http://arxiv.org/abs/2411.02785v2","updated":"2024-12-05T12:58:44Z","published":"2024-11-05T03:51:13Z","title":"Stochastic Monkeys at Play: Random Augmentations Cheaply Break LLM\n Safety Alignment","summary":" Safety alignment of Large Language Models (LLMs) has recently become a\ncritical objective of model developers. In response, a growing body of work has\nbeen investigating how safety alignment can be bypassed through various\njailbreaking methods, such as adversarial attacks. However, these jailbreak\nmethods can be rather costly or involve a non-trivial amount of creativity and\neffort, introducing the assumption that malicious users are high-resource or\nsophisticated. In this paper, we study how simple random augmentations to the\ninput prompt affect safety alignment effectiveness in state-of-the-art LLMs,\nsuch as Llama 3 and Qwen 2. We perform an in-depth evaluation of 17 different\nmodels and investigate the intersection of safety under random augmentations\nwith multiple dimensions: augmentation type, model size, quantization,\nfine-tuning-based defenses, and decoding strategies (e.g., sampling\ntemperature). 
We show that low-resource and unsophisticated attackers, i.e.\n$\\textit{stochastic monkeys}$, can significantly improve their chances of\nbypassing alignment with just 25 random augmentations per prompt. Source code\nand data: https://github.com/uiuc-focal-lab/stochastic-monkeys/\n","authors":["Jason Vega","Junsheng Huang","Gaokai Zhang","Hangoo Kang","Minjia Zhang","Gagandeep Singh"],"pdf_url":"https://arxiv.org/pdf/2411.02785v2.pdf","comment":"v2: Updated with changes from peer review rebuttal. v1: Version under\n peer review"},{"id":"http://arxiv.org/abs/2412.04134v1","updated":"2024-12-05T12:58:30Z","published":"2024-12-05T12:58:30Z","title":"Compositional Generative Multiphysics and Multi-component Simulation","summary":" Multiphysics simulation, which models the interactions between multiple\nphysical processes, and multi-component simulation of complex structures are\ncritical in fields like nuclear and aerospace engineering. Previous studies\noften rely on numerical solvers or machine learning-based surrogate models to\nsolve or accelerate these simulations. However, multiphysics simulations\ntypically require integrating multiple specialized solvers-each responsible for\nevolving a specific physical process-into a coupled program, which introduces\nsignificant development challenges. Furthermore, no universal algorithm exists\nfor multi-component simulations, which adds to the complexity. Here we propose\ncompositional Multiphysics and Multi-component Simulation with Diffusion models\n(MultiSimDiff) to overcome these challenges. During diffusion-based training,\nMultiSimDiff learns energy functions modeling the conditional probability of\none physical process/component conditioned on other processes/components. In\ninference, MultiSimDiff generates coupled multiphysics solutions and\nmulti-component structures by sampling from the joint probability distribution,\nachieved by composing the learned energy functions in a structured way. 
We test\nour method in three tasks. In the reaction-diffusion and nuclear thermal\ncoupling problems, MultiSimDiff successfully predicts the coupling solution\nusing decoupled data, while the surrogate model fails in the more complex\nsecond problem. For the thermal and mechanical analysis of the prismatic fuel\nelement, MultiSimDiff trained for single component prediction accurately\npredicts a larger structure with 64 components, reducing the relative error by\n40.3% compared to the surrogate model.\n","authors":["Tao Zhang","Zhenhai Liu","Feipeng Qi","Yongjun Jiao","Tailin Wu"],"pdf_url":"https://arxiv.org/pdf/2412.04134v1.pdf","comment":"30pages,13 figures"},{"id":"http://arxiv.org/abs/2412.04121v1","updated":"2024-12-05T12:46:18Z","published":"2024-12-05T12:46:18Z","title":"DeepFEA: Deep Learning for Prediction of Transient Finite Element\n Analysis Solutions","summary":" Finite Element Analysis (FEA) is a powerful but computationally intensive\nmethod for simulating physical phenomena. Recent advancements in machine\nlearning have led to surrogate models capable of accelerating FEA. Yet there\nare still limitations in developing surrogates of transient FEA models that can\nsimultaneously predict the solutions for both nodes and elements with\napplicability on both the 2D and 3D domains. Motivated by this research gap,\nthis study proposes DeepFEA, a deep learning-based framework that leverages a\nmultilayer Convolutional Long Short-Term Memory (ConvLSTM) network branching\ninto two parallel convolutional neural networks to predict the solutions for\nboth nodes and elements of FEA models. The proposed network is optimized using\na novel adaptive learning algorithm, called Node-Element Loss Optimization\n(NELO). NELO minimizes the error occurring at both branches of the network\nenabling the prediction of solutions for transient FEA simulations. 
The\nexperimental evaluation of DeepFEA is performed on three datasets in the\ncontext of structural mechanics, generated to serve as publicly available\nreference datasets. The results show that DeepFEA can achieve less than 3%\nnormalized mean and root mean squared error for 2D and 3D simulation scenarios,\nand inference times that are two orders of magnitude faster than FEA. In\ncontrast, relevant state-of-the-art methods face challenges with\nmulti-dimensional output and dynamic input prediction. Furthermore, DeepFEA's\nrobustness was demonstrated in a real-life biomedical scenario, confirming its\nsuitability for accurate and efficient predictions of FEA simulations.\n","authors":["Georgios Triantafyllou","Panagiotis G. Kalozoumis","George Dimas","Dimitris K. Iakovidis"],"pdf_url":"https://arxiv.org/pdf/2412.04121v1.pdf","comment":"This work has been submitted to a journal for possible publication"},{"id":"http://arxiv.org/abs/2409.19214v2","updated":"2024-12-05T12:45:09Z","published":"2024-09-28T02:45:14Z","title":"Group Distributionally Robust Optimization can Suppress Class Imbalance\n Effect in Network Traffic Classification","summary":" Internet services have led to the eruption of network traffic, and machine\nlearning on these Internet data has become an indispensable tool, especially\nwhen the application is risk-sensitive. This paper focuses on network traffic\nclassification in the presence of class imbalance, which fundamentally and\nubiquitously exists in Internet data analysis. This existence of class\nimbalance mostly drifts the optimal decision boundary, resulting in a less\noptimal solution for machine learning models. To alleviate the effect, we\npropose to design strategies for alleviating the class imbalance through the\nlens of group distributionally robust optimization. Our approach iteratively\nupdates the non-parametric weights for separate classes and optimizes the\nlearning model by minimizing reweighted losses. 
We interpret the optimization\nprocess from a Stackelberg game and perform extensive experiments on typical\nbenchmarks. Results show that our approach can not only suppress the negative\neffect of class imbalance but also improve the comprehensive performance in\nprediction.\n","authors":["Wumei Du","Dong Liang","Yiqin Lv","Xingxing Liang","Guanlin Wu","Qi Wang","Zheng Xie"],"pdf_url":"https://arxiv.org/pdf/2409.19214v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.08020v2","updated":"2024-12-05T12:40:16Z","published":"2024-10-10T15:17:49Z","title":"Efficiently Learning at Test-Time: Active Fine-Tuning of LLMs","summary":" Recent efforts in fine-tuning language models often rely on automatic data\nselection, commonly using Nearest Neighbors retrieval from large datasets.\nHowever, we theoretically show that this approach tends to select redundant\ndata, limiting its effectiveness or even hurting performance. To address this,\nwe introduce SIFT, a data selection algorithm designed to reduce uncertainty\nabout the model's response given a prompt, which unifies ideas from retrieval\nand active learning. Whereas Nearest Neighbor retrieval typically fails in the\npresence of information duplication, SIFT accounts for information duplication\nand optimizes the overall information gain of the selected examples. We focus\nour evaluations on fine-tuning at test-time for prompt-specific language\nmodeling on the Pile dataset, and show that SIFT consistently outperforms\nNearest Neighbor retrieval, with minimal computational overhead. Moreover, we\nshow that our uncertainty estimates can predict the performance gain of\ntest-time fine-tuning, and use this to develop an adaptive algorithm that\ninvests test-time compute proportional to realized performance gains. 
We\nprovide the $\\texttt{activeft}$ (Active Fine-Tuning) library which can be used\nas a drop-in replacement for Nearest Neighbor retrieval.\n","authors":["Jonas Hübotter","Sascha Bongni","Ido Hakimi","Andreas Krause"],"pdf_url":"https://arxiv.org/pdf/2410.08020v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.02865v2","updated":"2024-12-05T12:38:58Z","published":"2024-12-03T22:00:12Z","title":"Memory-efficient Continual Learning with Neural Collapse Contrastive","summary":" Contrastive learning has significantly improved representation quality,\nenhancing knowledge transfer across tasks in continual learning (CL). However,\ncatastrophic forgetting remains a key challenge, as contrastive based methods\nprimarily focus on \"soft relationships\" or \"softness\" between samples, which\nshift with changing data distributions and lead to representation overlap\nacross tasks. Recently, the newly identified Neural Collapse phenomenon has\nshown promise in CL by focusing on \"hard relationships\" or \"hardness\" between\nsamples and fixed prototypes. However, this approach overlooks \"softness\",\ncrucial for capturing intra-class variability, and this rigid focus can also\npull old class representations toward current ones, increasing forgetting.\nBuilding on these insights, we propose Focal Neural Collapse Contrastive\n(FNC2), a novel representation learning loss that effectively balances both\nsoft and hard relationships. Additionally, we introduce the Hardness-Softness\nDistillation (HSD) loss to progressively preserve the knowledge gained from\nthese relationships across tasks. Our method outperforms state-of-the-art\napproaches, particularly in minimizing memory reliance. 
Remarkably, even\nwithout the use of memory, our approach rivals rehearsal-based methods,\noffering a compelling solution for data privacy concerns.\n","authors":["Trung-Anh Dang","Vincent Nguyen","Ngoc-Son Vu","Christel Vrain"],"pdf_url":"https://arxiv.org/pdf/2412.02865v2.pdf","comment":"Accepted at WACV 2025"},{"id":"http://arxiv.org/abs/2412.04100v1","updated":"2024-12-05T12:10:42Z","published":"2024-12-05T12:10:42Z","title":"Missing Melodies: AI Music Generation and its \"Nearly\" Complete Omission\n of the Global South","summary":" Recent advances in generative AI have sparked renewed interest and expanded\npossibilities for music generation. However, the performance and versatility of\nthese systems across musical genres are heavily influenced by the availability\nof training data. We conducted an extensive analysis of over one million hours\nof audio datasets used in AI music generation research and manually reviewed\nmore than 200 papers from eleven prominent AI and music conferences and\norganizations (AAAI, ACM, EUSIPCO, EURASIP, ICASSP, ICML, IJCAI, ISMIR,\nNeurIPS, NIME, SMC) to identify a critical gap in the fair representation and\ninclusion of the musical genres of the Global South in AI research. Our\nfindings reveal a stark imbalance: approximately 86% of the total dataset hours\nand over 93% of researchers focus primarily on music from the Global North.\nHowever, around 40% of these datasets include some form of non-Western music,\ngenres from the Global South account for only 14.6% of the data. Furthermore,\napproximately 51% of the papers surveyed concentrate on symbolic music\ngeneration, a method that often fails to capture the cultural nuances inherent\nin music from regions such as South Asia, the Middle East, and Africa. As AI\nincreasingly shapes the creation and dissemination of music, the significant\nunderrepresentation of music genres in datasets and research presents a serious\nthreat to global musical diversity. 
We also propose some important steps to\nmitigate these risks and foster a more inclusive future for AI-driven music\ngeneration.\n","authors":["Atharva Mehta","Shivam Chauhan","Monojit Choudhury"],"pdf_url":"https://arxiv.org/pdf/2412.04100v1.pdf","comment":"Submitted to CACM, 12 pages, 2 figures"},{"id":"http://arxiv.org/abs/2412.04095v1","updated":"2024-12-05T12:01:20Z","published":"2024-12-05T12:01:20Z","title":"HyperFLINT: Hypernetwork-based Flow Estimation and Temporal\n Interpolation for Scientific Ensemble Visualization","summary":" We present HyperFLINT (Hypernetwork-based FLow estimation and temporal\nINTerpolation), a novel deep learning-based approach for estimating flow\nfields, temporally interpolating scalar fields, and facilitating parameter\nspace exploration in spatio-temporal scientific ensemble data. This work\naddresses the critical need to explicitly incorporate ensemble parameters into\nthe learning process, as traditional methods often neglect these, limiting\ntheir ability to adapt to diverse simulation settings and provide meaningful\ninsights into the data dynamics. HyperFLINT introduces a hypernetwork to\naccount for simulation parameters, enabling it to generate accurate\ninterpolations and flow fields for each timestep by dynamically adapting to\nvarying conditions, thereby outperforming existing parameter-agnostic\napproaches. The architecture features modular neural blocks with convolutional\nand deconvolutional layers, supported by a hypernetwork that generates weights\nfor the main network, allowing the model to better capture intricate simulation\ndynamics. 
A series of experiments demonstrates HyperFLINT's significantly\nimproved performance in flow field estimation and temporal interpolation, as\nwell as its potential in enabling parameter space exploration, offering\nvaluable insights into complex scientific ensembles.\n","authors":["Hamid Gadirov","Qi Wu","David Bauer","Kwan-Liu Ma","Jos Roerdink","Steffen Frey"],"pdf_url":"https://arxiv.org/pdf/2412.04095v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12562v2","updated":"2024-12-05T11:57:19Z","published":"2024-03-19T09:17:18Z","title":"PePR: Performance Per Resource Unit as a Metric to Promote Small-Scale\n Deep Learning in Medical Image Analysis","summary":" The recent advances in deep learning (DL) have been accelerated by access to\nlarge-scale data and compute. These large-scale resources have been used to\ntrain progressively larger models which are resource intensive in terms of\ncompute, data, energy, and carbon emissions. These costs are becoming a new\ntype of entry barrier to researchers and practitioners with limited access to\nresources at such scale, particularly in the Global South. In this work, we\ntake a comprehensive look at the landscape of existing DL models for medical\nimage analysis tasks and demonstrate their usefulness in settings where\nresources are limited. To account for the resource consumption of DL models, we\nintroduce a novel measure to estimate the performance per resource unit, which\nwe call the PePR score. Using a diverse family of 131 unique DL architectures\n(spanning 1M to 130M trainable parameters) and three medical image datasets, we\ncapture trends about the performance-resource trade-offs. In applications like\nmedical image analysis, we argue that small-scale, specialized models are\nbetter than striving for large-scale models. 
Furthermore, we show that using\nexisting pretrained models that are fine-tuned on new data can significantly\nreduce the computational resources and data required compared to training\nmodels from scratch. We hope this work will encourage the community to focus on\nimproving AI equity by developing methods and models with smaller resource\nfootprints.\n","authors":["Raghavendra Selvan","Bob Pepin","Christian Igel","Gabrielle Samuel","Erik B Dam"],"pdf_url":"https://arxiv.org/pdf/2403.12562v2.pdf","comment":"Accepted to be published at the Northern Lights Deep Learning\n Conference (NLDL), 2025. Source code available at\n https://github.com/saintslab/PePR"},{"id":"http://arxiv.org/abs/2412.02482v2","updated":"2024-12-05T11:50:40Z","published":"2024-12-03T14:45:46Z","title":"What should a neuron aim for? Designing local objective functions based\n on information theory","summary":" In modern deep neural networks, the learning dynamics of the individual\nneurons is often obscure, as the networks are trained via global optimization.\nConversely, biological systems build on self-organized, local learning,\nachieving robustness and efficiency with limited global information. We here\nshow how self-organization between individual artificial neurons can be\nachieved by designing abstract bio-inspired local learning goals. These goals\nare parameterized using a recent extension of information theory, Partial\nInformation Decomposition (PID), which decomposes the information that a set of\ninformation sources holds about an outcome into unique, redundant and\nsynergistic contributions. Our framework enables neurons to locally shape the\nintegration of information from various input classes, i.e. feedforward,\nfeedback, and lateral, by selecting which of the three inputs should contribute\nuniquely, redundantly or synergistically to the output. 
This selection is\nexpressed as a weighted sum of PID terms, which, for a given problem, can be\ndirectly derived from intuitive reasoning or via numerical optimization,\noffering a window into understanding task-relevant local information\nprocessing. Achieving neuron-level interpretability while enabling strong\nperformance using local learning, our work advances a principled\ninformation-theoretic foundation for local learning strategies.\n","authors":["Andreas C. Schneider","Valentin Neuhaus","David A. Ehrlich","Abdullah Makkeh","Alexander S. Ecker","Viola Priesemann","Michael Wibral"],"pdf_url":"https://arxiv.org/pdf/2412.02482v2.pdf","comment":"24 pages, 11 figures"},{"id":"http://arxiv.org/abs/2410.13569v2","updated":"2024-12-05T11:50:24Z","published":"2024-10-17T17:17:09Z","title":"Learning on Model Weights using Tree Experts","summary":" The increasing availability of public models begs the question: can we train\nneural networks that use other networks as input? Such models allow us to study\ndifferent aspects of a given neural network, for example, determining the\ncategories in a model's training dataset. However, machine learning on model\nweights is challenging as they often exhibit significant variation unrelated to\nthe models' semantic properties (nuisance variation). Here, we identify a key\nproperty of real-world models: most public models belong to a small set of\nModel Trees, where all models within a tree are fine-tuned from a common\nancestor (e.g., a foundation model). Importantly, we find that within each tree\nthere is less nuisance variation between models. Concretely, while learning\nacross Model Trees requires complex architectures, even a linear classifier\ntrained on a single model layer often works within trees. While effective,\nthese linear classifiers are computationally expensive, especially when dealing\nwith larger models that have many parameters. 
To address this, we introduce\nProbing Experts (ProbeX), a theoretically motivated and lightweight method.\nNotably, ProbeX is the first probing method specifically designed to learn from\nthe weights of a single hidden model layer. We demonstrate the effectiveness of\nProbeX by predicting the categories in a model's training dataset based only on\nits weights. Excitingly, ProbeX can also map the weights of Stable Diffusion\ninto a shared weight-language embedding space, enabling zero-shot model\nclassification.\n","authors":["Eliahu Horwitz","Bar Cavia","Jonathan Kahana","Yedid Hoshen"],"pdf_url":"https://arxiv.org/pdf/2410.13569v2.pdf","comment":"Project page: https://horwitz.ai/probex/"},{"id":"http://arxiv.org/abs/2406.11624v3","updated":"2024-12-05T11:47:49Z","published":"2024-06-17T15:07:55Z","title":"Words in Motion: Extracting Interpretable Control Vectors for Motion\n Transformers","summary":" Transformer-based models generate hidden states that are difficult to\ninterpret. In this work, we aim to interpret these hidden states and control\nthem at inference, with a focus on motion forecasting. We use linear probes to\nmeasure neural collapse towards interpretable motion features in hidden states.\nHigh probing accuracy implies meaningful directions and distances between\nhidden states of opposing features, which we use to fit interpretable control\nvectors for activation steering at inference. To optimize our control vectors,\nwe use sparse autoencoders with fully-connected, convolutional, MLPMixer layers\nand various activation functions. Notably, we show that enforcing sparsity in\nhidden states leads to a more linear relationship between control vector\ntemperatures and forecasts. Our approach enables mechanistic interpretability\nand zero-shot generalization to unseen dataset characteristics with negligible\ncomputational overhead. 
Our implementation is available at\nhttps://github.com/kit-mrt/future-motion\n","authors":["Omer Sahin Tas","Royden Wagner"],"pdf_url":"https://arxiv.org/pdf/2406.11624v3.pdf","comment":"Add autoencoders with convolutional, MLPMixer layers, and JumpReLU\n activations"},{"id":"http://arxiv.org/abs/2412.04082v1","updated":"2024-12-05T11:32:53Z","published":"2024-12-05T11:32:53Z","title":"Learnable Similarity and Dissimilarity Guided Symmetric Non-Negative\n Matrix Factorization","summary":" Symmetric nonnegative matrix factorization (SymNMF) is a powerful tool for\nclustering, which typically uses the $k$-nearest neighbor ($k$-NN) method to\nconstruct similarity matrix. However, $k$-NN may mislead clustering since the\nneighbors may belong to different clusters, and its reliability generally\ndecreases as $k$ grows. In this paper, we construct the similarity matrix as a\nweighted $k$-NN graph with learnable weight that reflects the reliability of\neach $k$-th NN. This approach reduces the search space of the similarity matrix\nlearning to $n - 1$ dimension, as opposed to the $\\mathcal{O}(n^2)$ dimension\nof existing methods, where $n$ represents the number of samples. Moreover, to\nobtain a discriminative similarity matrix, we introduce a dissimilarity matrix\nwith a dual structure of the similarity matrix, and propose a new form of\northogonality regularization with discussions on its geometric interpretation\nand numerical stability. An efficient alternative optimization algorithm is\ndesigned to solve the proposed model, with theoretically guarantee that the\nvariables converge to a stationary point that satisfies the KKT conditions. The\nadvantage of the proposed model is demonstrated by the comparison with nine\nstate-of-the-art clustering methods on eight datasets. 
The code is available at\n\\url{https://github.com/lwl-learning/LSDGSymNMF}.\n","authors":["Wenlong Lyu","Yuheng Jia"],"pdf_url":"https://arxiv.org/pdf/2412.04082v1.pdf","comment":"12 pages, 14 figures"},{"id":"http://arxiv.org/abs/2412.04081v1","updated":"2024-12-05T11:32:14Z","published":"2024-12-05T11:32:14Z","title":"Federated Learning in Mobile Networks: A Comprehensive Case Study on\n Traffic Forecasting","summary":" The increasing demand for efficient resource allocation in mobile networks\nhas catalyzed the exploration of innovative solutions that could enhance the\ntask of real-time cellular traffic prediction. Under these circumstances,\nfederated learning (FL) stands out as a distributed and privacy-preserving\nsolution to foster collaboration among different sites, thus enabling\nresponsive near-the-edge solutions. In this paper, we comprehensively study the\npotential benefits of FL in telecommunications through a case study on\nfederated traffic forecasting using real-world data from base stations (BSs) in\nBarcelona (Spain). Our study encompasses relevant aspects within the federated\nexperience, including model aggregation techniques, outlier management, the\nimpact of individual clients, personalized learning, and the integration of\nexogenous sources of data. The performed evaluation is based on both prediction\naccuracy and sustainability, thus showcasing the environmental impact of\nemployed FL algorithms in various settings. The findings from our study\nhighlight FL as a promising and robust solution for mobile traffic prediction,\nemphasizing its twin merits as a privacy-conscious and environmentally\nsustainable approach, while also demonstrating its capability to overcome data\nheterogeneity and ensure high-quality predictions, marking a significant stride\ntowards its integration in mobile traffic management systems.\n","authors":["Nikolaos Pavlidis","Vasileios Perifanis","Selim F. Yilmaz","Francesc Wilhelmi","Marco Miozzo","Pavlos S. 
Efraimidis","Remous-Aris Koutsiamanis","Pavol Mulinka","Paolo Dini"],"pdf_url":"https://arxiv.org/pdf/2412.04081v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.18245v2","updated":"2024-12-05T11:29:56Z","published":"2024-07-25T17:58:17Z","title":"VGGHeads: 3D Multi Head Alignment with a Large-Scale Synthetic Dataset","summary":" Human head detection, keypoint estimation, and 3D head model fitting are\nessential tasks with many applications. However, traditional real-world\ndatasets often suffer from bias, privacy, and ethical concerns, and they have\nbeen recorded in laboratory environments, which makes it difficult for trained\nmodels to generalize. Here, we introduce \\method -- a large-scale synthetic\ndataset generated with diffusion models for human head detection and 3D mesh\nestimation. Our dataset comprises over 1 million high-resolution images, each\nannotated with detailed 3D head meshes, facial landmarks, and bounding boxes.\nUsing this dataset, we introduce a new model architecture capable of\nsimultaneous head detection and head mesh reconstruction from a single image in\na single step. 
Through extensive experimental evaluations, we demonstrate that\nmodels trained on our synthetic data achieve strong performance on real images.\nFurthermore, the versatility of our dataset makes it applicable across a broad\nspectrum of tasks, offering a general and comprehensive representation of human\nheads.\n","authors":["Orest Kupyn","Eugene Khvedchenia","Christian Rupprecht"],"pdf_url":"https://arxiv.org/pdf/2407.18245v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04078v1","updated":"2024-12-05T11:24:27Z","published":"2024-12-05T11:24:27Z","title":"Towards Generalizable Autonomous Penetration Testing via Domain\n Randomization and Meta-Reinforcement Learning","summary":" With increasing numbers of vulnerabilities exposed on the internet,\nautonomous penetration testing (pentesting) has emerged as an emerging research\narea, while reinforcement learning (RL) is a natural fit for studying\nautonomous pentesting. Previous research in RL-based autonomous pentesting\nmainly focused on enhancing agents' learning efficacy within abstract simulated\ntraining environments. They overlooked the applicability and generalization\nrequirements of deploying agents' policies in real-world environments that\ndiffer substantially from their training settings. In contrast, for the first\ntime, we shift focus to the pentesting agents' ability to generalize across\nunseen real environments. For this purpose, we propose a Generalizable\nAutonomous Pentesting framework (namely GAP) for training agents capable of\ndrawing inferences from one to another -- a key requirement for the broad\napplication of autonomous pentesting and a hallmark of human intelligence. GAP\nintroduces a Real-to-Sim-to-Real pipeline with two key methods: domain\nrandomization and meta-RL learning. Specifically, we are among the first to\napply domain randomization in autonomous pentesting and propose a large\nlanguage model-powered domain randomization method for synthetic environment\ngeneration. 
We further apply meta-RL to improve the agents' generalization\nability in unseen environments by leveraging the synthetic environments. The\ncombination of these two methods can effectively bridge the generalization gap\nand improve policy adaptation performance. Experiments are conducted on various\nvulnerable virtual machines, with results showing that GAP can (a) enable\npolicy learning in unknown real environments, (b) achieve zero-shot policy\ntransfer in similar environments, and (c) realize rapid policy adaptation in\ndissimilar environments.\n","authors":["Shicheng Zhou","Jingju Liu","Yuliang Lu","Jiahai Yang","Yue Zhang","Jie Chen"],"pdf_url":"https://arxiv.org/pdf/2412.04078v1.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2412.04076v1","updated":"2024-12-05T11:17:03Z","published":"2024-12-05T11:17:03Z","title":"Distance-Adaptive Quaternion Knowledge Graph Embedding with\n Bidirectional Rotation","summary":" Quaternion contains one real part and three imaginary parts, which provided a\nmore expressive hypercomplex space for learning knowledge graph. Existing\nquaternion embedding models measure the plausibility of a triplet either\nthrough semantic matching or geometric distance scoring functions. However, it\nappears that semantic matching diminishes the separability of entities, while\nthe distance scoring function weakens the semantics of entities. To address\nthis issue, we propose a novel quaternion knowledge graph embedding model. Our\nmodel combines semantic matching with entity's geometric distance to better\nmeasure the plausibility of triplets. Specifically, in the quaternion space, we\nperform a right rotation on head entity and a reverse rotation on tail entity\nto learn rich semantic features. Then, we utilize distance adaptive\ntranslations to learn geometric distance between entities. 
Furthermore, we\nprovide mathematical proofs to demonstrate our model can handle complex logical\nrelationships. Extensive experimental results and analyses show our model\nsignificantly outperforms previous models on well-known knowledge graph\ncompletion benchmark datasets. Our code is available at\nhttps://github.com/llqy123/DaBR.\n","authors":["Weihua Wang","Qiuyu Liang","Feilong Bao","Guanglai Gao"],"pdf_url":"https://arxiv.org/pdf/2412.04076v1.pdf","comment":"Accepted by COLING 2025"},{"id":"http://arxiv.org/abs/2412.04074v1","updated":"2024-12-05T11:12:46Z","published":"2024-12-05T11:12:46Z","title":"Integrated Sensing and Communications for Low-Altitude Economy: A Deep\n Reinforcement Learning Approach","summary":" This paper studies an integrated sensing and communications (ISAC) system for\nlow-altitude economy (LAE), where a ground base station (GBS) provides\ncommunication and navigation services for authorized unmanned aerial vehicles\n(UAVs), while sensing the low-altitude airspace to monitor the unauthorized\nmobile target. The expected communication sum-rate over a given flight period\nis maximized by jointly optimizing the beamforming at the GBS and UAVs'\ntrajectories, subject to the constraints on the average signal-to-noise ratio\nrequirement for sensing, the flight mission and collision avoidance of UAVs, as\nwell as the maximum transmit power at the GBS. Typically, this is a sequential\ndecision-making problem with the given flight mission. Thus, we transform it to\na specific Markov decision process (MDP) model called episode task. Based on\nthis modeling, we propose a novel LAE-oriented ISAC scheme, referred to as Deep\nLAE-ISAC (DeepLSC), by leveraging the deep reinforcement learning (DRL)\ntechnique. In DeepLSC, a reward function and a new action selection policy\ntermed constrained noise-exploration policy are judiciously designed to fulfill\nvarious constraints. 
To enable efficient learning in episode tasks, we develop\na hierarchical experience replay mechanism, where the gist is to employ all\nexperiences generated within each episode to jointly train the neural network.\nBesides, to enhance the convergence speed of DeepLSC, a symmetric experience\naugmentation mechanism, which simultaneously permutes the indexes of all\nvariables to enrich available experience sets, is proposed. Simulation results\ndemonstrate that compared with benchmarks, DeepLSC yields a higher sum-rate\nwhile meeting the preset constraints, achieves faster convergence, and is more\nrobust against different settings.\n","authors":["Xiaowen Ye","Yuyi Mao","Xianghao Yu","Shu Sun","Liqun Fu","Jie Xu"],"pdf_url":"https://arxiv.org/pdf/2412.04074v1.pdf","comment":"submitted for an IEEE publication"},{"id":"http://arxiv.org/abs/2412.04072v1","updated":"2024-12-05T11:09:11Z","published":"2024-12-05T11:09:11Z","title":"Boundary-Guided Learning for Gene Expression Prediction in Spatial\n Transcriptomics","summary":" Spatial transcriptomics (ST) has emerged as an advanced technology that\nprovides spatial context to gene expression. Recently, deep learning-based\nmethods have shown the capability to predict gene expression from WSI data\nusing ST data. Existing approaches typically extract features from images and\nthe neighboring regions using pretrained models, and then develop methods to\nfuse this information to generate the final output. However, these methods\noften fail to account for the cellular structure similarity, cellular density\nand the interactions within the microenvironment. In this paper, we propose a\nframework named BG-TRIPLEX, which leverages boundary information extracted from\npathological images as guiding features to enhance gene expression prediction\nfrom WSIs. Specifically, our model consists of three branches: the spot,\nin-context and global branches. 
In the spot and in-context branches, boundary\ninformation, including edge and nuclei characteristics, is extracted using\npretrained models. These boundary features guide the learning of cellular\nmorphology and the characteristics of microenvironment through Multi-Head\nCross-Attention. Finally, these features are integrated with global features to\npredict the final output. Extensive experiments were conducted on three public\nST datasets. The results demonstrate that our BG-TRIPLEX consistently\noutperforms existing methods in terms of Pearson Correlation Coefficient (PCC).\nThis method highlights the crucial role of boundary features in understanding\nthe complex interactions between WSI and gene expression, offering a promising\ndirection for future research.\n","authors":["Mingcheng Qu","Yuncong Wu","Donglin Di","Anyang Su","Tonghua Su","Yang Song","Lei Fan"],"pdf_url":"https://arxiv.org/pdf/2412.04072v1.pdf","comment":"8 pages, 5 figures"},{"id":"http://arxiv.org/abs/2408.08968v3","updated":"2024-12-05T11:01:30Z","published":"2024-08-16T18:34:11Z","title":"Online SLA Decomposition: Enabling Real-Time Adaptation to Evolving\n Systems","summary":" When a network slice spans multiple technology domains, it is crucial for\neach domain to uphold the End-to-End (E2E) Service Level Agreement (SLA)\nassociated with the slice. Consequently, the E2E SLA must be properly\ndecomposed into partial SLAs that are assigned to each domain involved. In a\nnetwork slice management system with a two-level architecture, comprising an\nE2E service orchestrator and local domain controllers, we consider that the\norchestrator has access solely to historical data regarding the responses of\nlocal controllers to previous requests, and this information is used to\nconstruct a risk model for each domain. 
In this study, we extend our previous\nwork by investigating the dynamic nature of real-world systems and introducing\nan online learning-decomposition framework to tackle the dynamicity. We propose\na framework that periodically updates the risk models based on the most recent\nfeedback. This approach leverages key components such as online gradient\ndescent and FIFO memory buffers, which enhance the stability and robustness of\nthe overall process. Our empirical study on an analytic model-based simulator\ndemonstrates that the proposed framework outperforms the state-of-the-art\nstatic approach, providing more accurate and resilient SLA decomposition even\nunder varying conditions and limited data scenarios.\n","authors":["Cyril Shih-Huan Hsu","Danny De Vleeschauwer","Chrysa Papagianni"],"pdf_url":"https://arxiv.org/pdf/2408.08968v3.pdf","comment":"The paper has been submitted to IEEE ICMLCN 2025"},{"id":"http://arxiv.org/abs/2412.04065v1","updated":"2024-12-05T10:59:54Z","published":"2024-12-05T10:59:54Z","title":"Space to Policy: Scalable Brick Kiln Detection and Automatic Compliance\n Monitoring with Geospatial Data","summary":" Air pollution kills 7 million people annually. The brick kiln sector\nsignificantly contributes to economic development but also accounts for 8-14\\%\nof air pollution in India. Policymakers have implemented compliance measures to\nregulate brick kilns. Emission inventories are critical for air quality\nmodeling and source apportionment studies. However, the largely unorganized\nnature of the brick kiln sector necessitates labor-intensive survey efforts for\nmonitoring. Recent efforts by air quality researchers have relied on manual\nannotation of brick kilns using satellite imagery to build emission\ninventories, but this approach lacks scalability. 
Machine-learning-based object\ndetection methods have shown promise for detecting brick kilns; however,\nprevious studies often rely on costly high-resolution imagery and fail to\nintegrate with governmental policies. In this work, we developed a scalable\nmachine-learning pipeline that detected and classified 30638 brick kilns across\nfive states in the Indo-Gangetic Plain using free, moderate-resolution\nsatellite imagery from Planet Labs. Our detections have a high correlation with\non-ground surveys. We performed automated compliance analysis based on\ngovernment policies. In the Delhi airshed, stricter policy enforcement has led\nto the adoption of efficient brick kiln technologies. This study highlights the\nneed for inclusive policies that balance environmental sustainability with the\nlivelihoods of workers.\n","authors":["Zeel B Patel","Rishabh Mondal","Shataxi Dubey","Suraj Jaiswal","Sarath Guttikunda","Nipun Batra"],"pdf_url":"https://arxiv.org/pdf/2412.04065v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04064v1","updated":"2024-12-05T10:59:20Z","published":"2024-12-05T10:59:20Z","title":"Graph Neural Networks Need Cluster-Normalize-Activate Modules","summary":" Graph Neural Networks (GNNs) are non-Euclidean deep learning models for\ngraph-structured data. Despite their successful and diverse applications,\noversmoothing prohibits deep architectures due to node features converging to a\nsingle fixed point. This severely limits their potential to solve complex\ntasks. To counteract this tendency, we propose a plug-and-play module\nconsisting of three steps: Cluster-Normalize-Activate (CNA). By applying CNA\nmodules, GNNs search and form super nodes in each layer, which are normalized\nand activated individually. We demonstrate in node classification and property\nprediction tasks that CNA significantly improves the accuracy over the\nstate-of-the-art. Particularly, CNA reaches 94.18% and 95.75% accuracy on Cora\nand CiteSeer, respectively. 
It further benefits GNNs in regression tasks as\nwell, reducing the mean squared error compared to all baselines. At the same\ntime, GNNs with CNA require substantially fewer learnable parameters than\ncompeting architectures.\n","authors":["Arseny Skryagin","Felix Divo","Mohammad Amin Ali","Devendra Singh Dhami","Kristian Kersting"],"pdf_url":"https://arxiv.org/pdf/2412.04064v1.pdf","comment":"17 pages, 6 figures, 6 tables, accepted at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2406.09014v6","updated":"2024-12-05T10:57:12Z","published":"2024-06-13T11:38:58Z","title":"Deep learning empowered sensor fusion boosts infant movement\n classification","summary":" To assess the integrity of the developing nervous system, the Prechtl general\nmovement assessment (GMA) is recognized for its clinical value in diagnosing\nneurological impairments in early infancy. GMA has been increasingly augmented\nthrough machine learning approaches intending to scale-up its application,\ncircumvent costs in the training of human assessors and further standardize\nclassification of spontaneous motor patterns. Available deep learning tools,\nall of which are based on single sensor modalities, are however still\nconsiderably inferior to that of well-trained human assessors. These approaches\nare hardly comparable as all models are designed, trained and evaluated on\nproprietary/silo-data sets. With this study we propose a sensor fusion approach\nfor assessing fidgety movements (FMs). FMs were recorded from 51 typically\ndeveloping participants. We compared three different sensor modalities\n(pressure, inertial, and visual sensors). Various combinations and two sensor\nfusion approaches (late and early fusion) for infant movement classification\nwere tested to evaluate whether a multi-sensor system outperforms single\nmodality assessments. Convolutional neural network (CNN) architectures were\nused to classify movement patterns. 
The performance of the three-sensor fusion\n(classification accuracy of 94.5%) was significantly higher than that of any\nsingle modality evaluated. We show that the sensor fusion approach is a\npromising avenue for automated classification of infant motor patterns. The\ndevelopment of a robust sensor fusion system may significantly enhance AI-based\nearly recognition of neurofunctions, ultimately facilitating automated early\ndetection of neurodevelopmental conditions.\n","authors":["Tomas Kulvicius","Dajie Zhang","Luise Poustka","Sven Bölte","Lennart Jahn","Sarah Flügge","Marc Kraft","Markus Zweckstetter","Karin Nielsen-Saines","Florentin Wörgötter","Peter B Marschik"],"pdf_url":"https://arxiv.org/pdf/2406.09014v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14027v3","updated":"2024-12-05T10:49:37Z","published":"2023-12-21T16:58:49Z","title":"AdamMCMC: Combining Metropolis Adjusted Langevin with Momentum-based\n Optimization","summary":" Uncertainty estimation is a key issue when considering the application of\ndeep neural network methods in science and engineering. In this work, we\nintroduce a novel algorithm that quantifies epistemic uncertainty via Monte\nCarlo sampling from a tempered posterior distribution. It combines the well\nestablished Metropolis Adjusted Langevin Algorithm (MALA) with momentum-based\noptimization using Adam and leverages a prolate proposal distribution, to\nefficiently draw from the posterior. We prove that the constructed chain admits\nthe Gibbs posterior as invariant distribution and approximates this posterior\nin total variation distance. Furthermore, we demonstrate the efficiency of the\nresulting algorithm and the merit of the proposed changes on a state-of-the-art\nclassifier from high-energy particle physics.\n","authors":["Sebastian Bieringer","Gregor Kasieczka","Maximilian F. 
Steffen","Mathias Trabs"],"pdf_url":"https://arxiv.org/pdf/2312.14027v3.pdf","comment":"16 pages, 5 figures; adapted Theorem 2"},{"id":"http://arxiv.org/abs/2411.14875v2","updated":"2024-12-05T10:40:41Z","published":"2024-11-22T11:55:37Z","title":"Iterative Reweighted Framework Based Algorithms for Sparse Linear\n Regression with Generalized Elastic Net Penalty","summary":" The elastic net penalty is frequently employed in high-dimensional statistics\nfor parameter regression and variable selection. It is particularly beneficial\ncompared to lasso when the number of predictors greatly surpasses the number of\nobservations. However, empirical evidence has shown that the $\\ell_q$-norm\npenalty (where $0 < q < 1$) often provides better regression compared to the\n$\\ell_1$-norm penalty, demonstrating enhanced robustness in various scenarios.\nIn this paper, we explore a generalized elastic net model that employs a\n$\\ell_r$-norm (where $r \\geq 1$) in loss function to accommodate various types\nof noise, and employs a $\\ell_q$-norm (where $0 < q < 1$) to replace the\n$\\ell_1$-norm in elastic net penalty. Theoretically, we establish the\ncomputable lower bounds for the nonzero entries of the generalized first-order\nstationary points of the proposed generalized elastic net model. For\nimplementation, we develop two efficient algorithms based on the locally\nLipschitz continuous $\\epsilon$-approximation to $\\ell_q$-norm. The first\nalgorithm employs an alternating direction method of multipliers (ADMM), while\nthe second utilizes a proximal majorization-minimization method (PMM), where\nthe subproblems are addressed using the semismooth Newton method (SNN). We also\nperform extensive numerical experiments with both simulated and real data,\nshowing that both algorithms demonstrate superior performance. 
Notably, the\nPMM-SSN is efficient than ADMM, even though the latter provides a simpler\nimplementation.\n","authors":["Yanyun Ding","Zhenghua Yao","Peili Li","Yunhai Xiao"],"pdf_url":"https://arxiv.org/pdf/2411.14875v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04047v1","updated":"2024-12-05T10:38:29Z","published":"2024-12-05T10:38:29Z","title":"Pathwise optimization for bridge-type estimators and its applications","summary":" Sparse parametric models are of great interest in statistical learning and\nare often analyzed by means of regularized estimators. Pathwise methods allow\nto efficiently compute the full solution path for penalized estimators, for any\npossible value of the penalization parameter $\\lambda$. In this paper we deal\nwith the pathwise optimization for bridge-type problems; i.e. we are interested\nin the minimization of a loss function, such as negative log-likelihood or\nresidual sum of squares, plus the sum of $\\ell^q$ norms with $q\\in(0,1]$\ninvolving adpative coefficients. For some loss functions this regularization\nachieves asymptotically the oracle properties (such as the selection\nconsistency). Nevertheless, since the objective function involves nonconvex and\nnondifferentiable terms, the minimization problem is computationally\nchallenging.\n The aim of this paper is to apply some general algorithms, arising from\nnonconvex optimization theory, to compute efficiently the path solutions for\nthe adaptive bridge estimator with multiple penalties. In particular, we take\ninto account two different approaches: accelerated proximal gradient descent\nand blockwise alternating optimization. The convergence and the path\nconsistency of these algorithms are discussed. In order to assess our methods,\nwe apply these algorithms to the penalized estimation of diffusion processes\nobserved at discrete times. 
This latter represents a recent research topic in\nthe field of statistics for time-dependent data.\n","authors":["Alessandro De Gregorio","Francesco Iafrate"],"pdf_url":"https://arxiv.org/pdf/2412.04047v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04045v1","updated":"2024-12-05T10:36:39Z","published":"2024-12-05T10:36:39Z","title":"AI4EF: Artificial Intelligence for Energy Efficiency in the Building\n Sector","summary":" AI4EF, Artificial Intelligence for Energy Efficiency, is an advanced,\nuser-centric tool designed to support decision-making in building energy\nretrofitting and efficiency optimization. Leveraging machine learning (ML) and\ndata-driven insights, AI4EF enables stakeholders such as public sector\nrepresentatives, energy consultants, and building owners to model, analyze, and\npredict energy consumption, retrofit costs, and environmental impacts of\nbuilding upgrades. Featuring a modular framework, AI4EF includes customizable\nbuilding retrofitting, photovoltaic installation assessment, and predictive\nmodeling tools that allow users to input building parameters and receive\ntailored recommendations for achieving energy savings and carbon reduction\ngoals. Additionally, the platform incorporates a Training Playground for data\nscientists to refine ML models used by said framework. Finally, AI4EF provides\naccess to the Enershare Data Space to facilitate seamless data sharing and\naccess within the ecosystem. Its compatibility with open-source identity\nmanagement, Keycloak, enhances security and accessibility, making it adaptable\nfor various regulatory and organizational contexts. 
This paper presents an\narchitectural overview of AI4EF, its application in energy efficiency\nscenarios, and its potential for advancing sustainable energy practices through\nartificial intelligence (AI).\n","authors":["Alexandros Menelaos Tzortzis","Georgios Kormpakis","Sotiris Pelekis","Ariadni Michalitsi-Psarrou","Evangelos Karakolis","Christos Ntanos","Dimitris Askounis"],"pdf_url":"https://arxiv.org/pdf/2412.04045v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.01591v2","updated":"2024-12-05T10:22:00Z","published":"2024-12-02T15:08:55Z","title":"Kernel-Based Optimal Control: An Infinitesimal Generator Approach","summary":" This paper presents a novel approach for optimal control of nonlinear\nstochastic systems using infinitesimal generator learning within\ninfinite-dimensional reproducing kernel Hilbert spaces. Our learning framework\nleverages data samples of system dynamics and stage cost functions, with only\ncontrol penalties and constraints provided. The proposed method directly learns\nthe diffusion operator of a controlled Fokker-Planck-Kolmogorov equation in an\ninfinite-dimensional hypothesis space. This operator models the continuous-time\nevolution of the probability measure of the control system's state. We\ndemonstrate that this approach seamlessly integrates with modern convex\noperator-theoretic Hamilton-Jacobi-Bellman recursions, enabling a data-driven\nsolution to the optimal control problem. Furthermore, our statistical learning\nframework includes nonparametric estimators for uncontrolled forward\ninfinitesimal generators as a special case. 
Numerical experiments, ranging from\nsynthetic differential equations to simulated robotic systems, showcase the\nadvantages of our approach compared to both modern data-driven and classical\nnonlinear programming methods for optimal control.\n","authors":["Petar Bevanda","Nicolas Hoischen","Tobias Wittmann","Jan Brüdigam","Sandra Hirche","Boris Houska"],"pdf_url":"https://arxiv.org/pdf/2412.01591v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04034v1","updated":"2024-12-05T10:15:56Z","published":"2024-12-05T10:15:56Z","title":"Dynamic Graph Representation with Contrastive Learning for Financial\n Market Prediction: Integrating Temporal Evolution and Static Relations","summary":" Temporal Graph Learning (TGL) is crucial for capturing the evolving nature of\nstock markets. Traditional methods often ignore the interplay between dynamic\ntemporal changes and static relational structures between stocks. To address\nthis issue, we propose the Dynamic Graph Representation with Contrastive\nLearning (DGRCL) framework, which integrates dynamic and static graph relations\nto improve the accuracy of stock trend prediction. Our framework introduces two\nkey components: the Embedding Enhancement (EE) module and the Contrastive\nConstrained Training (CCT) module. The EE module focuses on dynamically\ncapturing the temporal evolution of stock data, while the CCT module enforces\nstatic constraints based on stock relations, refined within contrastive\nlearning. This dual-relation approach allows for a more comprehensive\nunderstanding of stock market dynamics. Our experiments on two major U.S. stock\nmarket datasets, NASDAQ and NYSE, demonstrate that DGRCL significantly\noutperforms state-of-the-art TGL baselines. Ablation studies indicate the\nimportance of both modules. Overall, DGRCL not only enhances prediction ability\nbut also provides a robust framework for integrating temporal and relational\ndata in dynamic graphs. 
Code and data are available for public access.\n","authors":["Yunhua Pei","Jin Zheng","John Cartlidge"],"pdf_url":"https://arxiv.org/pdf/2412.04034v1.pdf","comment":"12 pages, 2 figures, author manuscript accepted for ICAART 2025\n (International Conference on Agents and Artificial Intelligence)"},{"id":"http://arxiv.org/abs/2411.01115v2","updated":"2024-12-05T09:45:55Z","published":"2024-11-02T02:50:12Z","title":"Relax and Merge: A Simple Yet Effective Framework for Solving Fair\n $k$-Means and $k$-sparse Wasserstein Barycenter Problems","summary":" The fairness of clustering algorithms has gained widespread attention across\nvarious areas, including machine learning, In this paper, we study fair\n$k$-means clustering in Euclidean space. Given a dataset comprising several\ngroups, the fairness constraint requires that each cluster should contain a\nproportion of points from each group within specified lower and upper bounds.\nDue to these fairness constraints, determining the optimal locations of $k$\ncenters is a quite challenging task. We propose a novel ``Relax and Merge''\nframework that returns a $(1+4\\rho + O(\\epsilon))$-approximate solution, where\n$\\rho$ is the approximate ratio of an off-the-shelf vanilla $k$-means algorithm\nand $O(\\epsilon)$ can be an arbitrarily small positive number. If equipped with\na PTAS of $k$-means, our solution can achieve an approximation ratio of\n$(5+O(\\epsilon))$ with only a slight violation of the fairness constraints,\nwhich improves the current state-of-the-art approximation guarantee.\nFurthermore, using our framework, we can also obtain a $(1+4\\rho\n+O(\\epsilon))$-approximate solution for the $k$-sparse Wasserstein Barycenter\nproblem, which is a fundamental optimization problem in the field of optimal\ntransport, and a $(2+6\\rho)$-approximate solution for the strictly fair\n$k$-means clustering with no violation, both of which are better than the\ncurrent state-of-the-art methods. 
In addition, the empirical results\ndemonstrate that our proposed algorithm can significantly outperform baseline\napproaches in terms of clustering cost.\n","authors":["Shihong Song","Guanlin Mo","Qingyuan Yang","Hu Ding"],"pdf_url":"https://arxiv.org/pdf/2411.01115v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.04011v1","updated":"2024-12-05T09:45:21Z","published":"2024-12-05T09:45:21Z","title":"A Note on Spectral Map","summary":" In molecular dynamics (MD) simulations, transitions between states are often\nrare events due to energy barriers that exceed the thermal temperature. Because\nof their infrequent occurrence and the huge number of degrees of freedom in\nmolecular systems, understanding the physical properties that drive rare events\nis immensely difficult. A common approach to this problem is to propose a\ncollective variable (CV) that describes this process by a simplified\nrepresentation. However, choosing CVs is not easy, as it often relies on\nphysical intuition. Machine learning (ML) techniques provide a promising\napproach for effectively extracting optimal CVs from MD data. Here, we provide\na note on a recent unsupervised ML method called spectral map, which constructs\nCVs by maximizing the timescale separation between slow and fast variables in\nthe system.\n","authors":["Tuğçe Gökdemir","Jakub Rydzewski"],"pdf_url":"https://arxiv.org/pdf/2412.04011v1.pdf","comment":"A letter prepared for the Ensemble journal of the Molecular\n Simulation Society of Japan (MSSJ)"},{"id":"http://arxiv.org/abs/2411.05712v2","updated":"2024-12-05T09:39:07Z","published":"2024-11-08T17:13:53Z","title":"Scaling Laws for Task-Optimized Models of the Primate Visual Ventral\n Stream","summary":" When trained on large-scale object classification datasets, certain\nartificial neural network models begin to approximate core object recognition\n(COR) behaviors and neural response patterns in the primate visual ventral\nstream (VVS). 
While recent machine learning advances suggest that scaling model\nsize, dataset size, and compute resources improve task performance, the impact\nof scaling on brain alignment remains unclear. In this study, we explore\nscaling laws for modeling the primate VVS by systematically evaluating over 600\nmodels trained under controlled conditions on benchmarks spanning V1, V2, V4,\nIT and COR behaviors. We observe that while behavioral alignment continues to\nscale with larger models, neural alignment saturates. This observation remains\ntrue across model architectures and training datasets, even though models with\nstronger inductive bias and datasets with higher-quality images are more\ncompute-efficient. Increased scaling is especially beneficial for higher-level\nvisual areas, where small models trained on few samples exhibit only poor\nalignment. Finally, we develop a scaling recipe, indicating that a greater\nproportion of compute should be allocated to data samples over model size. Our\nresults suggest that while scaling alone might suffice for alignment with human\ncore object recognition behavior, it will not yield improved models of the\nbrain's visual ventral stream with current architectures and datasets,\nhighlighting the need for novel strategies in building brain-like models.\n","authors":["Abdulkadir Gokce","Martin Schrimpf"],"pdf_url":"https://arxiv.org/pdf/2411.05712v2.pdf","comment":"10 pages for the main paper, 23 pages in total. 7 main figures and 7\n supplementary figures. 
Code, model weights, and benchmark results can be\n accessed at https://github.com/epflneuroailab/scaling-primate-vvs - In\n version 2, Figure 7 and the related discussion are added, and the appendix is\n updated"},{"id":"http://arxiv.org/abs/2412.03486v2","updated":"2024-12-05T09:26:26Z","published":"2024-12-04T17:23:35Z","title":"Tight PAC-Bayesian Risk Certificates for Contrastive Learning","summary":" Contrastive representation learning is a modern paradigm for learning\nrepresentations of unlabeled data via augmentations -- precisely, contrastive\nmodels learn to embed semantically similar pairs of samples (positive pairs)\ncloser than independently drawn samples (negative samples). In spite of its\nempirical success and widespread use in foundation models, statistical theory\nfor contrastive learning remains less explored. Recent works have developed\ngeneralization error bounds for contrastive losses, but the resulting risk\ncertificates are either vacuous (certificates based on Rademacher complexity or\n$f$-divergence) or require strong assumptions about samples that are\nunreasonable in practice. The present paper develops non-vacuous PAC-Bayesian\nrisk certificates for contrastive representation learning, considering the\npractical considerations of the popular SimCLR framework. Notably, we take into\naccount that SimCLR reuses positive pairs of augmented data as negative samples\nfor other data, thereby inducing strong dependence and making classical PAC or\nPAC-Bayesian bounds inapplicable. We further refine existing bounds on the\ndownstream classification loss by incorporating SimCLR-specific factors,\nincluding data augmentation and temperature scaling, and derive risk\ncertificates for the contrastive zero-one risk. 
The resulting bounds for\ncontrastive loss and downstream prediction are much tighter than those of\nprevious risk certificates, as demonstrated by experiments on CIFAR-10.\n","authors":["Anna Van Elst","Debarghya Ghoshdastidar"],"pdf_url":"https://arxiv.org/pdf/2412.03486v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14438v3","updated":"2024-12-05T09:23:13Z","published":"2024-05-23T11:10:32Z","title":"LoRA-Ensemble: Efficient Uncertainty Modelling for Self-attention\n Networks","summary":" Numerous crucial tasks in real-world decision-making rely on machine learning\nalgorithms with calibrated uncertainty estimates. However, modern methods often\nyield overconfident and uncalibrated predictions. Various approaches involve\ntraining an ensemble of separate models to quantify the uncertainty related to\nthe model itself, known as epistemic uncertainty. In an explicit\nimplementation, the ensemble approach has high computational cost and high\nmemory requirements. This particular challenge is evident in state-of-the-art\nneural networks such as transformers, where even a single network is already\ndemanding in terms of compute and memory. Consequently, efforts are made to\nemulate the ensemble model without actually instantiating separate ensemble\nmembers, referred to as implicit ensembling. We introduce LoRA-Ensemble, a\nparameter-efficient deep ensemble method for self-attention networks, which is\nbased on Low-Rank Adaptation (LoRA). Initially developed for efficient LLM\nfine-tuning, we extend LoRA to an implicit ensembling approach. By employing a\nsingle pre-trained self-attention network with weights shared across all\nmembers, we train member-specific low-rank matrices for the attention\nprojections. Our method exhibits superior calibration compared to explicit\nensembles and achieves similar or better accuracy across various prediction\ntasks and datasets.\n","authors":["Michelle Halbheer","Dominik J. 
Mühlematter","Alexander Becker","Dominik Narnhofer","Helge Aasen","Konrad Schindler","Mehmet Ozgur Turkoglu"],"pdf_url":"https://arxiv.org/pdf/2405.14438v3.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2412.03995v1","updated":"2024-12-05T09:15:21Z","published":"2024-12-05T09:15:21Z","title":"Blind Underwater Image Restoration using Co-Operational Regressor\n Networks","summary":" The exploration of underwater environments is essential for applications such\nas biological research, archaeology, and infrastructure maintenanceHowever,\nunderwater imaging is challenging due to the waters unique properties,\nincluding scattering, absorption, color distortion, and reduced visibility. To\naddress such visual degradations, a variety of approaches have been proposed\ncovering from basic signal processing methods to deep learning models; however,\nnone of them has proven to be consistently successful. In this paper, we\npropose a novel machine learning model, Co-Operational Regressor Networks\n(CoRe-Nets), designed to achieve the best possible underwater image\nrestoration. A CoRe-Net consists of two co-operating networks: the Apprentice\nRegressor (AR), responsible for image transformation, and the Master Regressor\n(MR), which evaluates the Peak Signal-to-Noise Ratio (PSNR) of the images\ngenerated by the AR and feeds it back to AR. CoRe-Nets are built on\nSelf-Organized Operational Neural Networks (Self-ONNs), which offer a superior\nlearning capability by modulating nonlinearity in kernel transformations. The\neffectiveness of the proposed model is demonstrated on the benchmark Large\nScale Underwater Image (LSUI) dataset. Leveraging the joint learning\ncapabilities of the two cooperating networks, the proposed model achieves the\nstate-of-art restoration performance with significantly reduced computational\ncomplexity and often presents such results that can even surpass the visual\nquality of the ground truth with a 2-pass application. 
Our results and the\noptimized PyTorch implementation of the proposed approach are now publicly\nshared on GitHub.\n","authors":["Ozer Can Devecioglu","Serkan Kiranyaz","Turker Ince","Moncef Gabbouj"],"pdf_url":"https://arxiv.org/pdf/2412.03995v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2412.03993v1","updated":"2024-12-05T09:14:50Z","published":"2024-12-05T09:14:50Z","title":"LaserGuider: A Laser Based Physical Backdoor Attack against Deep Neural\n Networks","summary":" Backdoor attacks embed hidden associations between triggers and targets in\ndeep neural networks (DNNs), causing them to predict the target when a trigger\nis present while maintaining normal behavior otherwise. Physical backdoor\nattacks, which use physical objects as triggers, are feasible but lack remote\ncontrol, temporal stealthiness, flexibility, and mobility. To overcome these\nlimitations, in this work, we propose a new type of backdoor triggers utilizing\nlasers that feature long-distance transmission and instant-imaging properties.\nBased on the laser-based backdoor triggers, we present a physical backdoor\nattack, called LaserGuider, which possesses remote control ability and achieves\nhigh temporal stealthiness, flexibility, and mobility. We also introduce a\nsystematic approach to optimize laser parameters for improving attack\neffectiveness. Our evaluation on traffic sign recognition DNNs, critical in\nautonomous vehicles, demonstrates that LaserGuider with three different\nlaser-based triggers achieves over 90% attack success rate with negligible\nimpact on normal inputs. 
Additionally, we release LaserMark, the first dataset\nof real world traffic signs stamped with physical laser spots, to support\nfurther research in backdoor attacks and defenses.\n","authors":["Yongjie Xu","Guangke Chen","Fu Song","Yuqi Chen"],"pdf_url":"https://arxiv.org/pdf/2412.03993v1.pdf","comment":"In Proceedings of the 23rd International Conference on Applied\n Cryptography and Network Security (ACNS), Munich, Germany, 23-26 June, 2025"},{"id":"http://arxiv.org/abs/2412.03992v1","updated":"2024-12-05T09:12:25Z","published":"2024-12-05T09:12:25Z","title":"How well behaved is finite dimensional Diffusion Maps?","summary":" Under a set of assumptions on a family of submanifolds $\\subset {\\mathbb\nR}^D$, we derive a series of geometric properties that remain valid after\nfinite-dimensional and almost isometric Diffusion Maps (DM), including almost\nuniform density, finite polynomial approximation and local reach. Leveraging\nthese properties, we establish rigorous bounds on the embedding errors\nintroduced by the DM algorithm is $O\\left((\\frac{\\log\nn}{n})^{\\frac{1}{8d+16}}\\right)$. These results offer a solid theoretical\nfoundation for understanding the performance and reliability of DM in practical\napplications.\n","authors":["Wenyu Bo","Marina Meilă"],"pdf_url":"https://arxiv.org/pdf/2412.03992v1.pdf","comment":"20 pages, 3 figures"},{"id":"http://arxiv.org/abs/2403.18569v2","updated":"2024-12-05T09:02:11Z","published":"2024-03-27T13:50:13Z","title":"PDNNet: PDN-Aware GNN-CNN Heterogeneous Network for Dynamic IR Drop\n Prediction","summary":" IR drop on the power delivery network (PDN) is closely related to PDN's\nconfiguration and cell current consumption. As the integrated circuit (IC)\ndesign is growing larger, dynamic IR drop simulation becomes computationally\nunaffordable and machine learning based IR drop prediction has been explored as\na promising solution. 
Although CNN-based methods have been adapted to IR drop\nprediction task in several works, the shortcomings of overlooking PDN\nconfiguration is non-negligible. In this paper, we consider not only how to\nproperly represent cell-PDN relation, but also how to model IR drop following\nits physical nature in the feature aggregation procedure. Thus, we propose a\nnovel graph structure, PDNGraph, to unify the representations of the PDN\nstructure and the fine-grained cell-PDN relation. We further propose a\ndual-branch heterogeneous network, PDNNet, incorporating two parallel GNN-CNN\nbranches to favorably capture the above features during the learning process.\nSeveral key designs are presented to make the dynamic IR drop prediction highly\neffective and interpretable. We are the first work to apply graph structure to\ndeep-learning based dynamic IR drop prediction method. Experiments show that\nPDNNet outperforms the state-of-the-art CNN-based methods and achieves 545x\nspeedup compared to the commercial tool, which demonstrates the superiority of\nour method.\n","authors":["Yuxiang Zhao","Zhuomin Chai","Xun Jiang","Yibo Lin","Runsheng Wang","Ru Huang"],"pdf_url":"https://arxiv.org/pdf/2403.18569v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03983v1","updated":"2024-12-05T08:58:41Z","published":"2024-12-05T08:58:41Z","title":"Safe and Efficient Online Convex Optimization with Linear Budget\n Constraints and Partial Feedback","summary":" This paper studies online convex optimization with unknown linear budget\nconstraints, where only the gradient information of the objective and the\nbandit feedback of constraint functions are observed. We propose a safe and\nefficient Lyapunov-optimization algorithm (SELO) that can achieve an\n$O(\\sqrt{T})$ regret and zero cumulative constraint violation. The result also\nimplies SELO achieves $O(\\sqrt{T})$ regret when the budget is hard and not\nallowed to be violated. 
The proposed algorithm is computationally efficient as\nit resembles a primal-dual algorithm where the primal problem is an\nunconstrained, strongly convex and smooth problem, and the dual problem has a\nsimple gradient-type update. The algorithm and theory are further justified in\na simulated application of energy-efficient task processing in distributed data\ncenters.\n","authors":["Shanqi Liu","Xin Liu"],"pdf_url":"https://arxiv.org/pdf/2412.03983v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03982v1","updated":"2024-12-05T08:58:25Z","published":"2024-12-05T08:58:25Z","title":"Exploring Fully Convolutional Networks for the Segmentation of\n Hyperspectral Imaging Applied to Advanced Driver Assistance Systems","summary":" Advanced Driver Assistance Systems (ADAS) are designed with the main purpose\nof increasing the safety and comfort of vehicle occupants. Most of current\ncomputer vision-based ADAS perform detection and tracking tasks quite\nsuccessfully under regular conditions, but are not completely reliable,\nparticularly under adverse weather and changing lighting conditions, neither in\ncomplex situations with many overlapping objects. In this work we explore the\nuse of hyperspectral imaging (HSI) in ADAS on the assumption that the distinct\nnear infrared (NIR) spectral reflectances of different materials can help to\nbetter separate the objects in a driving scene. In particular, this paper\ndescribes some experimental results of the application of fully convolutional\nnetworks (FCN) to the image segmentation of HSI for ADAS applications. More\nspecifically, our aim is to investigate to what extent the spatial features\ncodified by convolutional filters can be helpful to improve the performance of\nHSI segmentation systems. With that aim, we use the HSI-Drive v1.1 dataset,\nwhich provides a set of labelled images recorded in real driving conditions\nwith a small-size snapshot NIR-HSI camera. 
Finally, we analyze the\nimplementability of such a HSI segmentation system by prototyping the developed\nFCN model together with the necessary hyperspectral cube preprocessing stage\nand characterizing its performance on an MPSoC.\n","authors":["Jon Gutiérrez-Zaballa","Koldo Basterretxea","Javier Echanobe","M. Victoria Martínez","Inés del Campo"],"pdf_url":"https://arxiv.org/pdf/2412.03982v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2411.19274"},{"id":"http://arxiv.org/abs/2411.18220v2","updated":"2024-12-05T08:57:30Z","published":"2024-11-27T10:57:06Z","title":"R-MTLLMF: Resilient Multi-Task Large Language Model Fusion at the\n Wireless Edge","summary":" Multi-task large language models (MTLLMs) are important for many applications\nat the wireless edge, where users demand specialized models to handle multiple\ntasks efficiently. However, training MTLLMs is complex and exhaustive,\nparticularly when tasks are subject to change. Recently, the concept of model\nfusion via task vectors has emerged as an efficient approach for combining\nfine-tuning parameters to produce an MTLLM. In this paper, the problem of\nenabling edge users to collaboratively craft such MTLMs via tasks vectors is\nstudied, under the assumption of worst-case adversarial attacks. To this end,\nfirst the influence of adversarial noise to multi-task model fusion is\ninvestigated and a relationship between the so-called weight disentanglement\nerror and the mean squared error (MSE) is derived. Using hypothesis testing, it\nis directly shown that the MSE increases interference between task vectors,\nthereby rendering model fusion ineffective. Then, a novel resilient MTLLM\nfusion (R-MTLLMF) is proposed, which leverages insights about the LLM\narchitecture and fine-tuning process to safeguard task vector aggregation under\nadversarial noise by realigning the MTLLM. 
The proposed R-MTLLMF is then\ncompared for both worst-case and ideal transmission scenarios to study the\nimpact of the wireless channel. Extensive model fusion experiments with vision\nLLMs demonstrate R-MTLLMF's effectiveness, achieving close-to-baseline\nperformance across eight different tasks in ideal noise scenarios and\nsignificantly outperforming unprotected model fusion in worst-case scenarios.\nThe results further advocate for additional physical layer protection for a\nholistic approach to resilience, from both a wireless and LLM perspective.\n","authors":["Aladin Djuhera","Vlad C. Andrei","Mohsen Pourghasemian","Haris Gacanin","Holger Boche","Walid Saad"],"pdf_url":"https://arxiv.org/pdf/2411.18220v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03214v2","updated":"2024-12-05T08:49:02Z","published":"2024-12-04T11:05:01Z","title":"Continual Low-Rank Scaled Dot-product Attention","summary":" Transformers are widely used for their ability to capture data relations in\nsequence processing, with great success for a wide range of static tasks.\nHowever, the computational and memory footprint of their main component, i.e.,\nthe Scaled Dot-product Attention, is commonly overlooked. This makes their\nadoption in applications involving stream data processing with constraints in\nresponse latency, computational and memory resources infeasible. Some works\nhave proposed methods to lower the computational cost of transformers, i.e.\nlow-rank approximations, sparsity in attention, and efficient formulations for\nContinual Inference. In this paper, we introduce a new formulation of the\nScaled Dot-product Attention based on the Nystr\\\"om approximation that is\nsuitable for Continual Inference. 
In experiments on Online Audio Classification\nand Online Action Detection tasks, the proposed Continual Scaled Dot-product\nAttention can lower the number of operations by up to three orders of magnitude\ncompared to the original Transformers while retaining the predictive\nperformance of competing models.\n","authors":["Ginés Carreto Picón","Illia Oleksiienko","Lukas Hedegaard","Arian Bakhtiarnia","Alexandros Iosifidis"],"pdf_url":"https://arxiv.org/pdf/2412.03214v2.pdf","comment":"11 pages, 7 figures"},{"id":"http://arxiv.org/abs/2410.01262v2","updated":"2024-12-05T08:44:53Z","published":"2024-10-02T06:16:06Z","title":"Improving Fine-Grained Control via Aggregation of Multiple Diffusion\n Models","summary":" While many diffusion models perform well when controlling for particular\naspect among style, character, and interaction, they struggle with fine-grained\ncontrol due to dataset limitations and intricate model architecture design.\nThis paper introduces a novel algorithm, Aggregation of Multiple Diffusion\nModels (AMDM), which synthesizes features from multiple diffusion models into a\nspecified model, activating specific features for fine-grained control.\nExperimental results demonstrate that AMDM significantly improves fine-grained\ncontrol without training, proving its effectiveness. Additionally, it reveals\nthat diffusion models initially focus on features such as position, attributes,\nand style, with later stages improving generation quality and consistency. AMDM\noffers a new perspective for tackling the challenges of fine-grained\nconditional control generation in diffusion models: We can fully utilize\nexisting or develop new conditional diffusion models that control specific\naspects, and then aggregate them using AMDM algorithm. This eliminates the need\nfor constructing complex datasets, designing intricate model architectures, and\nincurring high training costs. 
Code is available at:\nhttps://github.com/Hammour-steak/AMDM.\n","authors":["Conghan Yue","Zhengwei Peng","Shiyan Du","Zhi Ji","Chuangjian Cai","Le Wan","Dongyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.01262v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03962v1","updated":"2024-12-05T08:26:13Z","published":"2024-12-05T08:26:13Z","title":"Local Curvature Smoothing with Stein's Identity for Efficient Score\n Matching","summary":" The training of score-based diffusion models (SDMs) is based on score\nmatching. The challenge of score matching is that it includes a computationally\nexpensive Jacobian trace. While several methods have been proposed to avoid\nthis computation, each has drawbacks, such as instability during training and\napproximating the learning as learning a denoising vector field rather than a\ntrue score. We propose a novel score matching variant, local curvature\nsmoothing with Stein's identity (LCSS). The LCSS bypasses the Jacobian trace by\napplying Stein's identity, enabling regularization effectiveness and efficient\ncomputation. We show that LCSS surpasses existing methods in sample generation\nperformance and matches the performance of denoising score matching, widely\nadopted by most SDMs, in evaluations such as FID, Inception score, and bits per\ndimension. Furthermore, we show that LCSS enables realistic image generation\neven at a high resolution of $1024 \\times 1024$.\n","authors":["Genki Osada","Makoto Shing","Takashi Nishide"],"pdf_url":"https://arxiv.org/pdf/2412.03962v1.pdf","comment":"Accepted at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2412.03961v1","updated":"2024-12-05T08:26:07Z","published":"2024-12-05T08:26:07Z","title":"Electronic Health Records-Based Data-Driven Diabetes Knowledge Unveiling\n and Risk Prognosis","summary":" In the healthcare sector, the application of deep learning technologies has\nrevolutionized data analysis and disease forecasting. 
This is particularly\nevident in the field of diabetes, where the deep analysis of Electronic Health\nRecords (EHR) has unlocked new opportunities for early detection and effective\nintervention strategies. Our research presents an innovative model that\nsynergizes the capabilities of Bidirectional Long Short-Term Memory\nNetworks-Conditional Random Field (BiLSTM-CRF) with a fusion of XGBoost and\nLogistic Regression. This model is designed to enhance the accuracy of diabetes\nrisk prediction by conducting an in-depth analysis of electronic medical\nrecords data. The first phase of our approach involves employing BiLSTM-CRF to\ndelve into the temporal characteristics and latent patterns present in EHR\ndata. This method effectively uncovers the progression trends of diabetes,\nwhich are often hidden in the complex data structures of medical records. The\nsecond phase leverages the combined strength of XGBoost and Logistic Regression\nto classify these extracted features and evaluate associated risks. This dual\napproach facilitates a more nuanced and precise prediction of diabetes,\noutperforming traditional models, particularly in handling multifaceted and\nnonlinear medical datasets. Our research demonstrates a notable advancement in\ndiabetes prediction over traditional methods, showcasing the effectiveness of\nour combined BiLSTM-CRF, XGBoost, and Logistic Regression model. This study\nhighlights the value of data-driven strategies in clinical decision-making,\nequipping healthcare professionals with precise tools for early detection and\nintervention. 
By enabling personalized treatment and timely care, our approach\nsignifies progress in incorporating advanced analytics in healthcare,\npotentially improving outcomes for diabetes and other chronic conditions.\n","authors":["Huadong Pang","Li Zhou","Yiping Dong","Peiyuan Chen","Dian Gu","Tianyi Lyu","Hansong Zhang"],"pdf_url":"https://arxiv.org/pdf/2412.03961v1.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2107.14432v6","updated":"2024-12-05T08:11:50Z","published":"2021-07-30T05:33:43Z","title":"Adaptive Optimizers with Sparse Group Lasso for Neural Networks in CTR\n Prediction","summary":" We develop a novel framework that adds the regularizers of the sparse group\nlasso to a family of adaptive optimizers in deep learning, such as Momentum,\nAdagrad, Adam, AMSGrad, AdaHessian, and create a new class of optimizers, which\nare named Group Momentum, Group Adagrad, Group Adam, Group AMSGrad and Group\nAdaHessian, etc., accordingly. We establish theoretically proven convergence\nguarantees in the stochastic convex settings, based on primal-dual methods. We\nevaluate the regularized effect of our new optimizers on three large-scale\nreal-world ad click datasets with state-of-the-art deep learning models. The\nexperimental results reveal that compared with the original optimizers with the\npost-processing procedure which uses the magnitude pruning method, the\nperformance of the models can be significantly improved on the same sparsity\nlevel. Furthermore, in comparison to the cases without magnitude pruning, our\nmethods can achieve extremely high sparsity with significantly better or highly\ncompetitive performance. The code is available at\nhttps://github.com/intelligent-machine-learning/tfplus/tree/main/tfplus.\n","authors":["Yun Yue","Yongchao Liu","Suo Tong","Minghao Li","Zhen Zhang","Chunyang Wen","Huanjun Bao","Lihong Gu","Jinjie Gu","Yixiang Mu"],"pdf_url":"https://arxiv.org/pdf/2107.14432v6.pdf","comment":"24 pages. 
Published as a conference paper at ECML PKDD 2021. This\n version includes Appendix which was not included in the published version\n because of page limit"},{"id":"http://arxiv.org/abs/2410.13874v3","updated":"2024-12-05T08:10:55Z","published":"2024-10-02T13:02:17Z","title":"COOL: Efficient and Reliable Chain-Oriented Objective Logic with Neural\n Networks Feedback Control for Program Synthesis","summary":" Program synthesis methods, whether formal or neural-based, lack fine-grained\ncontrol and flexible modularity, which limits their adaptation to complex\nsoftware development. These limitations stem from rigid Domain-Specific\nLanguage (DSL) frameworks and neural network incorrect predictions. To this\nend, we propose the Chain of Logic (CoL), which organizes the synthesis process\ninto an activity flow and provides heuristic control to guide the process.\nFurthermore, by integrating neural networks with libraries and introducing a\nNeural Network Feedback Control (NNFC) mechanism, our approach modularizes\nsynthesis and mitigates the impact of neural network mispredictions.\nExperiments on relational and symbolic synthesis tasks show that CoL\nsignificantly enhances the efficiency and reliability of DSL program synthesis\nacross multiple metrics. Specifically, CoL improves accuracy by 70% while\nreducing tree operations by 91% and time by 95%. Additionally, NNFC further\nboosts accuracy by 6%, with a 64% reduction in tree operations under\nchallenging conditions such as insufficient training data, increased\ndifficulty, and multidomain synthesis. 
These improvements confirm COOL as a\nhighly efficient and reliable program synthesis framework.\n","authors":["Jipeng Han"],"pdf_url":"https://arxiv.org/pdf/2410.13874v3.pdf","comment":"31 pages, 10 figures"},{"id":"http://arxiv.org/abs/2412.03950v1","updated":"2024-12-05T07:58:32Z","published":"2024-12-05T07:58:32Z","title":"BEFL: Balancing Energy Consumption in Federated Learning for Mobile Edge\n IoT","summary":" Federated Learning (FL) is a privacy-preserving distributed learning paradigm\ndesigned to build a highly accurate global model. In Mobile Edge IoT (MEIoT),\nthe training and communication processes can significantly deplete the limited\nbattery resources of devices. Existing research primarily focuses on reducing\noverall energy consumption, but this may inadvertently create energy\nconsumption imbalances, leading to the premature dropout of energy-sensitive\ndevices.To address these challenges, we propose BEFL, a joint optimization\nframework aimed at balancing three objectives: enhancing global model accuracy,\nminimizing total energy consumption, and reducing energy usage disparities\namong devices. First, taking into account the communication constraints of\nMEIoT and the heterogeneity of devices, we employed the Sequential Least\nSquares Programming (SLSQP) algorithm for the rational allocation of\ncommunication resources. Based on this, we introduce a heuristic client\nselection algorithm that combines cluster partitioning with utility-driven\napproaches to alleviate both the total energy consumption of all devices and\nthe discrepancies in energy usage.Furthermore, we utilize the proposed\nheuristic client selection algorithm as a template for offline imitation\nlearning during pre-training, while adopting a ranking-based reinforcement\nlearning approach online to further boost training efficiency. 
Our experiments\nreveal that BEFL improves global model accuracy by 1.6\\%, reduces energy\nconsumption variance by 72.7\\%, and lowers total energy consumption by 28.2\\%\ncompared to existing methods. The relevant code can be found at\n\\href{URL}{https://github.com/juzehao/BEFL}.\n","authors":["Zehao Ju","Tongquan Wei","Fuke Shen"],"pdf_url":"https://arxiv.org/pdf/2412.03950v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03949v1","updated":"2024-12-05T07:55:58Z","published":"2024-12-05T07:55:58Z","title":"Learning Speed-Adaptive Walking Agent Using Imitation Learning with\n Physics-Informed Simulation","summary":" Virtual models of human gait, or digital twins, offer a promising solution\nfor studying mobility without the need for labor-intensive data collection.\nHowever, challenges such as the sim-to-real gap and limited adaptability to\ndiverse walking conditions persist. To address these, we developed and\nvalidated a framework to create a skeletal humanoid agent capable of adapting\nto varying walking speeds while maintaining biomechanically realistic motions.\nThe framework combines a synthetic data generator, which produces\nbiomechanically plausible gait kinematics from open-source biomechanics data,\nand a training system that uses adversarial imitation learning to train the\nagent's walking policy. We conducted comprehensive analyses comparing the\nagent's kinematics, synthetic data, and the original biomechanics dataset. The\nagent achieved a root mean square error of 5.24 +- 0.09 degrees at varying\nspeeds compared to ground-truth kinematics data, demonstrating its\nadaptability. 
This work represents a significant step toward developing a\ndigital twin of human locomotion, with potential applications in biomechanics\nresearch, exoskeleton design, and rehabilitation.\n","authors":["Yi-Hung Chiu","Ung Hee Lee","Changseob Song","Manaen Hu","Inseung Kang"],"pdf_url":"https://arxiv.org/pdf/2412.03949v1.pdf","comment":"Currently under review"},{"id":"http://arxiv.org/abs/2305.19770v2","updated":"2024-12-05T07:46:11Z","published":"2023-05-31T12:03:12Z","title":"Quality In / Quality Out: Data quality more relevant than model choice\n in anomaly detection with the UGR'16","summary":" Autonomous or self-driving networks are expected to provide a solution to the\nmyriad of extremely demanding new applications with minimal human supervision.\nFor this purpose, the community relies on the development of new Machine\nLearning (ML) models and techniques. %, like the celebrated Deep Learning (DL).\nHowever, ML can only be as good as the data it is fitted with, and data quality\nis an elusive concept difficult to assess. In this paper, we show that\nrelatively minor modifications on a benchmark dataset (UGR'16, a flow-based\nreal-traffic dataset for anomaly detection) cause significantly more impact on\nmodel performance than the specific ML technique considered. We also show that\nthe measured model performance is uncertain, as a result of labelling\ninaccuracies. Our findings illustrate that the widely adopted approach of\ncomparing a set of models in terms of performance results (e.g., in terms of\naccuracy or ROC curves) may lead to incorrect conclusions when done without a\nproper understanding of dataset biases and sensitivity. 
We contribute a\nmethodology to interpret a model response that can be useful for this\nunderstanding.\n","authors":["José Camacho","Katarzyna Wasielewska","Pablo Espinosa","Marta Fuentes-García"],"pdf_url":"https://arxiv.org/pdf/2305.19770v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.15560v3","updated":"2024-12-05T07:43:25Z","published":"2023-05-24T23:47:26Z","title":"Differentially Private Synthetic Data via Foundation Model APIs 1:\n Images","summary":" Generating differentially private (DP) synthetic data that closely resembles\nthe original private data is a scalable way to mitigate privacy concerns in the\ncurrent data-driven world. In contrast to current practices that train\ncustomized models for this task, we aim to generate DP Synthetic Data via APIs\n(DPSDA), where we treat foundation models as blackboxes and only utilize their\ninference APIs. Such API-based, training-free approaches are easier to deploy\nas exemplified by the recent surge in the number of API-based apps. These\napproaches can also leverage the power of large foundation models which are\nonly accessible via their inference APIs. However, this comes with greater\nchallenges due to strictly more restrictive model access and the need to\nprotect privacy from the API provider.\n In this paper, we present a new framework called Private Evolution (PE) to\nsolve this problem and show its initial promise on synthetic images.\nSurprisingly, PE can match or even outperform state-of-the-art (SOTA) methods\nwithout any model training. For example, on CIFAR10 (with ImageNet as the\npublic data), we achieve FID <= 7.9 with privacy cost {\\epsilon} = 0.67,\nsignificantly improving the previous SOTA from {\\epsilon} = 32. We further\ndemonstrate the promise of applying PE on large foundation models such as\nStable Diffusion to tackle challenging private datasets with a small number of\nhigh-resolution images. 
The code and data are released at\nhttps://github.com/microsoft/DPSDA.\n","authors":["Zinan Lin","Sivakanth Gopi","Janardhan Kulkarni","Harsha Nori","Sergey Yekhanin"],"pdf_url":"https://arxiv.org/pdf/2305.15560v3.pdf","comment":"Published in ICLR 2024"},{"id":"http://arxiv.org/abs/2410.06940v2","updated":"2024-12-05T07:39:22Z","published":"2024-10-09T14:34:53Z","title":"Representation Alignment for Generation: Training Diffusion Transformers\n Is Easier Than You Think","summary":" Recent studies have shown that the denoising process in (generative)\ndiffusion models can induce meaningful (discriminative) representations inside\nthe model, though the quality of these representations still lags behind those\nlearned through recent self-supervised learning methods. We argue that one main\nbottleneck in training large-scale diffusion models for generation lies in\neffectively learning these representations. Moreover, training can be made\neasier by incorporating high-quality external visual representations, rather\nthan relying solely on the diffusion models to learn them independently. We\nstudy this by introducing a straightforward regularization called\nREPresentation Alignment (REPA), which aligns the projections of noisy input\nhidden states in denoising networks with clean image representations obtained\nfrom external, pretrained visual encoders. The results are striking: our simple\nstrategy yields significant improvements in both training efficiency and\ngeneration quality when applied to popular diffusion and flow-based\ntransformers, such as DiTs and SiTs. For instance, our method can speed up SiT\ntraining by over 17.5$\\times$, matching the performance (without\nclassifier-free guidance) of a SiT-XL model trained for 7M steps in less than\n400K steps. 
In terms of final generation quality, our approach achieves\nstate-of-the-art results of FID=1.42 using classifier-free guidance with the\nguidance interval.\n","authors":["Sihyun Yu","Sangkyung Kwak","Huiwon Jang","Jongheon Jeong","Jonathan Huang","Jinwoo Shin","Saining Xie"],"pdf_url":"https://arxiv.org/pdf/2410.06940v2.pdf","comment":"Preprint. Project page: https://sihyun.me/REPA"},{"id":"http://arxiv.org/abs/2412.03938v1","updated":"2024-12-05T07:35:56Z","published":"2024-12-05T07:35:56Z","title":"JANUS: A Difference-Oriented Analyzer For Financial Centralization Risks\n in Smart Contracts","summary":" Some smart contracts violate decentralization principles by defining\nprivileged accounts that manage other users' assets without permission,\nintroducing centralization risks that have caused financial losses. Existing\nmethods, however, face challenges in accurately detecting diverse\ncentralization risks due to their dependence on predefined behavior patterns.\nIn this paper, we propose JANUS, an automated analyzer for Solidity smart\ncontracts that detects financial centralization risks independently of their\nspecific behaviors. JANUS identifies differences between states reached by\nprivileged and ordinary accounts, and analyzes whether these differences are\nfinance-related. Focusing on the impact of risks rather than behaviors, JANUS\nachieves improved accuracy compared to existing tools and can uncover\ncentralization risks with unknown patterns.\n To evaluate JANUS's performance, we compare it with other tools using a\ndataset of 540 contracts. Our evaluation demonstrates that JANUS outperforms\nrepresentative tools in terms of detection accuracy for financial\ncentralization risks . Additionally, we evaluate JANUS on a real-world dataset\nof 33,151 contracts, successfully identifying two types of risks that other\ntools fail to detect. 
We also prove that the state traversal method and\nvariable summaries, which are used in JANUS to reduce the number of states to\nbe compared, do not introduce false alarms or omissions in detection.\n","authors":["Wansen Wang","Pu Zhang","Renjie Ji","Wenchao Huang","Zhaoyi Meng","Yan Xiong"],"pdf_url":"https://arxiv.org/pdf/2412.03938v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03936v1","updated":"2024-12-05T07:34:04Z","published":"2024-12-05T07:34:04Z","title":"Deep Learning Modeling Method for RF Devices Based on Uniform Noise\n Training Set","summary":" As the scale and complexity of integrated circuits continue to increase,\ntraditional modeling methods are struggling to address the nonlinear challenges\nin radio frequency (RF) chips. Deep learning has been increasingly applied to\nRF device modeling. This paper proposes a deep learning-based modeling method\nfor RF devices using a uniform noise training set, aimed at modeling and\nfitting the nonlinear characteristics of RF devices. We hypothesize that a\nuniform noise signal can encompass the full range of characteristics across\nboth frequency and amplitude, and that a deep learning model can effectively\ncapture and learn these features. Based on this hypothesis, the paper designs a\ncomplete integrated circuit modeling process based on measured data, including\ndata collection, processing, and neural network training. The proposed method\nis experimentally validated using the RF amplifier PW210 as a case study.\nExperimental results show that the uniform noise training set allows the model\nto capture the nonlinear characteristics of RF devices, and the trained model\ncan predict waveform patterns it has never encountered before. 
The proposed\ndeep learning-based RF device modeling method, using a uniform noise training\nset, demonstrates strong generalization capability and excellent training\nperformance, offering high practical application value.\n","authors":["Zhaokun Hu","Yindong Xiao","Houjun Wang","Jiayong Yu","Zihang Gao"],"pdf_url":"https://arxiv.org/pdf/2412.03936v1.pdf","comment":"9 pages,11 figures"},{"id":"http://arxiv.org/abs/2305.15817v3","updated":"2024-12-05T07:31:10Z","published":"2023-05-25T08:00:34Z","title":"Sharpness-Aware Minimization Revisited: Weighted Sharpness as a\n Regularization Term","summary":" Deep Neural Networks (DNNs) generalization is known to be closely related to\nthe flatness of minima, leading to the development of Sharpness-Aware\nMinimization (SAM) for seeking flatter minima and better generalization. In\nthis paper, we revisit the loss of SAM and propose a more general method,\ncalled WSAM, by incorporating sharpness as a regularization term. We prove its\ngeneralization bound through the combination of PAC and Bayes-PAC techniques,\nand evaluate its performance on various public datasets. The results\ndemonstrate that WSAM achieves improved generalization, or is at least highly\ncompetitive, compared to the vanilla optimizer, SAM and its variants. The code\nis available at\nhttps://github.com/intelligent-machine-learning/atorch/tree/main/atorch/optimizers.\n","authors":["Yun Yue","Jiadi Jiang","Zhiling Ye","Ning Gao","Yongchao Liu","Ke Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.15817v3.pdf","comment":"10 pages. Accepted as a conference paper at KDD '23"},{"id":"http://arxiv.org/abs/2410.12672v3","updated":"2024-12-05T07:27:31Z","published":"2024-10-16T15:36:13Z","title":"Context Matters: Leveraging Contextual Features for Time Series\n Forecasting","summary":" Time series forecasts are often influenced by exogenous contextual features\nin addition to their corresponding history. 
For example, in financial settings,\nit is hard to accurately predict a stock price without considering public\nsentiments and policy decisions in the form of news articles, tweets, etc.\nThough this is common knowledge, the current state-of-the-art (SOTA)\nforecasting models fail to incorporate such contextual information, owing to\nits heterogeneity and multimodal nature. To address this, we introduce\nContextFormer, a novel plug-and-play method to surgically integrate multimodal\ncontextual information into existing pre-trained forecasting models.\nContextFormer effectively distills forecast-specific information from rich\nmultimodal contexts, including categorical, continuous, time-varying, and even\ntextual information, to significantly enhance the performance of existing base\nforecasters. ContextFormer outperforms SOTA forecasting models by up to 30% on\na range of real-world datasets spanning energy, traffic, environmental, and\nfinancial domains.\n","authors":["Sameep Chattopadhyay","Pulkit Paliwal","Sai Shankar Narasimhan","Shubhankar Agarwal","Sandeep P. Chinchali"],"pdf_url":"https://arxiv.org/pdf/2410.12672v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03933v1","updated":"2024-12-05T07:23:14Z","published":"2024-12-05T07:23:14Z","title":"Exploring AI Text Generation, Retrieval-Augmented Generation, and\n Detection Technologies: a Comprehensive Overview","summary":" The rapid development of Artificial Intelligence (AI) has led to the creation\nof powerful text generation models, such as large language models (LLMs), which\nare widely used for diverse applications. However, concerns surrounding\nAI-generated content, including issues of originality, bias, misinformation,\nand accountability, have become increasingly prominent. This paper offers a\ncomprehensive overview of AI text generators (AITGs), focusing on their\nevolution, capabilities, and ethical implications. 
This paper also introduces\nRetrieval-Augmented Generation (RAG), a recent approach that improves the\ncontextual relevance and accuracy of text generation by integrating dynamic\ninformation retrieval. RAG addresses key limitations of traditional models,\nincluding their reliance on static knowledge and potential inaccuracies in\nhandling real-world data. Additionally, the paper reviews detection tools that\nhelp differentiate AI-generated text from human-written content and discusses\nthe ethical challenges these technologies pose. The paper explores future\ndirections for improving detection accuracy, supporting ethical AI development,\nand increasing accessibility. The paper contributes to a more responsible and\nreliable use of AI in content creation through these discussions.\n","authors":["Fnu Neha","Deepshikha Bhati","Deepak Kumar Shukla","Angela Guercio","Ben Ward"],"pdf_url":"https://arxiv.org/pdf/2412.03933v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.16320v3","updated":"2024-12-05T07:14:52Z","published":"2024-09-21T03:45:05Z","title":"Developing a Thailand solar irradiance map using Himawari-8 satellite\n imageries and deep learning models","summary":" This paper presents an online platform showing Thailand solar irradiance map\nevery 30 minutes, available at https://www.cusolarforecast.com. The methodology\nfor estimating global horizontal irradiance (GHI) across Thailand relies on\ncloud index extracted from Himawari-8 satellite imagery, Ineichen clear-sky\nmodel with locally-tuned Linke turbidity, and machine learning models. The\nmethods take clear-sky irradiance, cloud index, re-analyzed GHI and temperature\ndata from the MERRA-2 database, and date-time as inputs for GHI estimation\nmodels, including LightGBM, LSTM, Informer, and Transformer. These are\nbenchmarked with the estimate from a commercial service X by evaluation of\n15-minute ground GHI data from 53 ground stations over 1.5 years during\n2022-2023. 
The results show that the four models exhibit comparable overall MAE\nperformance to the service X. The best model is LightGBM with an overall MAE of\n78.58 W/sqm and RMSE of 118.97 W/sqm, while the service X achieves the lowest\nMAE, RMSE, and MBE in cloudy condition. Obtaining re-analyzed MERRA-2 data for\nthe whole Thailand region is not economically feasible for deployment. When\nremoving these features, the Informer model has a winning performance in MAE of\n78.67 W/sqm. The obtained performance aligns with existing literature by taking\nthe climate zone and time granularity of data into consideration. As the map\nshows an estimate of GHI over 93,000 grids with a frequent update, the paper\nalso describes a computational framework for displaying the entire map. It\ntests the runtime performance of deep learning models in the GHI estimation\nprocess.\n","authors":["Suwichaya Suwanwimolkul","Natanon Tongamrak","Nuttamon Thungka","Naebboon Hoonchareon","Jitkomut Songsiri"],"pdf_url":"https://arxiv.org/pdf/2409.16320v3.pdf","comment":"23 pages, 14 figures"},{"id":"http://arxiv.org/abs/2410.21216v2","updated":"2024-12-05T07:09:27Z","published":"2024-10-28T17:01:52Z","title":"HoPE: A Novel Positional Encoding Without Long-Term Decay for Enhanced\n Context Awareness and Extrapolation","summary":" Many positional encodings (PEs) are designed to exhibit long-term decay,\nbased on an entrenched and long-standing inductive opinion: tokens farther away\nfrom the current position carry less relevant information. We argue that\nlong-term decay is outdated in the era of LLMs, as LLMs are now applied to\ntasks demanding precise retrieval of in-context information from arbitrary\npositions. Firstly, we present empirical analyses on various PEs, demonstrating\nthat models inherently learn attention with only a local-decay pattern while\nforming a U-shape pattern globally, contradicting the principle of long-term\ndecay. 
Furthermore, we conduct a detailed analysis of rotary position encoding\n(RoPE, a prevalent relative positional encoding in LLMs), and found that the\nU-shape attention is caused by some learned components, which are also the key\nfactor limiting RoPE's expressiveness and extrapolation.Inspired by these\ninsights, we propose High-frequency rotary Position Encoding (HoPE). HoPE\nreplaces the specific components in RoPE with position-independent ones,\nretaining only high-frequency signals, which also breaks the principle of\nlong-term decay in theory. HoPE achieves two major advantages: (1) Without\nconstraints imposed by long-term decay, contradictory factors that limit\nspontaneous attention optimization and model extrapolation performance are\nremoved. (2) Components representing positions and semantics are are optimized.\nThese enhances model's context awareness and extrapolation, as validated by\nextensive experiments.\n","authors":["Yuhan Chen","Ang Lv","Jian Luan","Bin Wang","Wei Liu"],"pdf_url":"https://arxiv.org/pdf/2410.21216v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03928v1","updated":"2024-12-05T07:07:35Z","published":"2024-12-05T07:07:35Z","title":"MT3DNet: Multi-Task learning Network for 3D Surgical Scene\n Reconstruction","summary":" In image-assisted minimally invasive surgeries (MIS), understanding surgical\nscenes is vital for real-time feedback to surgeons, skill evaluation, and\nimproving outcomes through collaborative human-robot procedures. Within this\ncontext, the challenge lies in accurately detecting, segmenting, and estimating\nthe depth of surgical scenes depicted in high-resolution images, while\nsimultaneously reconstructing the scene in 3D and providing segmentation of\nsurgical instruments along with detection labels for each instrument. To\naddress this challenge, a novel Multi-Task Learning (MTL) network is proposed\nfor performing these tasks concurrently. 
A key aspect of this approach involves\novercoming the optimization hurdles associated with handling multiple tasks\nconcurrently by integrating a Adversarial Weight Update into the MTL framework,\nthe proposed MTL model achieves 3D reconstruction through the integration of\nsegmentation, depth estimation, and object detection, thereby enhancing the\nunderstanding of surgical scenes, which marks a significant advancement\ncompared to existing studies that lack 3D capabilities. Comprehensive\nexperiments on the EndoVis2018 benchmark dataset underscore the adeptness of\nthe model in efficiently addressing all three tasks, demonstrating the efficacy\nof the proposed techniques.\n","authors":["Mithun Parab","Pranay Lendave","Jiyoung Kim","Thi Quynh Dan Nguyen","Palash Ingle"],"pdf_url":"https://arxiv.org/pdf/2412.03928v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2412.03927v1","updated":"2024-12-05T07:06:17Z","published":"2024-12-05T07:06:17Z","title":"MegaCOIN: Enhancing Medium-Grained Color Perception for Vision-Language\n Models","summary":" In vision-language models (VLMs), the ability to perceive and interpret color\nand physical environment is crucial for achieving contextually accurate\nunderstanding and interaction. However, despite advances in multimodal\nmodeling, there remains a significant lack of specialized datasets that\nrigorously evaluate a model's capacity to discern subtle color variations and\nspatial context -- critical elements for situational comprehension and reliable\ndeployment across real-world applications. Toward that goal, we curate\nMegaCOIN, a high-quality, human-labeled dataset based on \\emph{real} images\nwith various contextual attributes. MegaCOIN consists of two parts:\nMegaCOIN-Instruct, which serves as a supervised fine-tuning (SFT) dataset for\nVLMs; and MegaCOIN-Bench, an annotated test set that can be used as a\nstand-alone QA dataset. 
MegaCOIN~provides three annotated features for 220,000\nreal images: foreground color, background color, and description of an object's\nphysical environment, constituting 660k human annotations. In addition,\nMegaCOIN can be applied to benchmark domain generalization (DG) algorithms. We\nexplore benchmarking DG methods in the linear probing setup for VLM and show\nsome new insights. Last but not least, we show that VLMs, including GPT-4o,\nhave subpar color recognition capabilities, and fine-tuning with MegaCOIN can\nresult in improved performance on visual evaluation tasks. In certain cases,\nMegaCOIN fine-tuned small-scale opensource models such as LLaVA and Bunny can\noutperform closed-source GPT-4o. We hope the utilities of MegaCOIN can shed\nlight on the directions VLMs can improve and provide a more complex platform\nfor domain generalization algorithms.\n","authors":["Ming-Chang Chiu","Shicheng Wen","Pin-Yu Chen","Xuezhe Ma"],"pdf_url":"https://arxiv.org/pdf/2412.03927v1.pdf","comment":"8 pages, 13 tables, 2 figures"},{"id":"http://arxiv.org/abs/1903.04209v6","updated":"2024-12-05T16:32:37Z","published":"2019-03-11T10:37:05Z","title":"From interpretability to inference: an estimation framework for\n universal approximators","summary":" We present a novel framework for estimation and inference with the broad\nclass of universal approximators. Estimation is based on the decomposition of\nmodel predictions into Shapley values. Inference relies on analyzing the bias\nand variance properties of individual Shapley components. We show that Shapley\nvalue estimation is asymptotically unbiased, and we introduce Shapley\nregressions as a tool to uncover the true data generating process from noisy\ndata alone. The well-known case of the linear regression is the special case in\nour framework if the model is linear in parameters. 
We present theoretical,\nnumerical, and empirical results for the estimation of heterogeneous treatment\neffects as our guiding example.\n","authors":["Andreas Joseph"],"pdf_url":"https://arxiv.org/pdf/1903.04209v6.pdf","comment":"37 pages, 5 figures, 3 tables, 1 algorithm"},{"id":"http://arxiv.org/abs/2208.14784v2","updated":"2024-12-05T15:07:54Z","published":"2022-08-31T11:45:21Z","title":"Practical Operator Sketching Framework for Accelerating Iterative\n Data-Driven Solutions in Inverse Problems","summary":" We propose a new operator-sketching paradigm for designing efficient\niterative data-driven reconstruction (IDR) schemes, e.g. Plug-and-Play\nalgorithms and deep unrolling networks. These IDR schemes are currently the\nstate-of-the-art solutions for imaging inverse problems. However, for\nhigh-dimensional imaging tasks, especially X-ray CT and MRI imaging, these IDR\nschemes typically become inefficient both in terms of computation, due to the\nneed of computing multiple times the high-dimensional forward and adjoint\noperators. In this work, we explore and propose a universal dimensionality\nreduction framework for accelerating IDR schemes in solving imaging inverse\nproblems, based on leveraging the sketching techniques from stochastic\noptimization. Using this framework, we derive a number of accelerated IDR\nschemes, such as the plug-and-play multi-stage sketched gradient (PnP-MS2G) and\nsketching-based primal-dual (LSPD and Sk-LSPD) deep unrolling networks.\nMeanwhile, for fully accelerating PnP schemes when the denoisers are\ncomputationally expensive, we provide novel stochastic lazy denoising schemes\n(Lazy-PnP and Lazy-PnP-EQ), leveraging the ProxSkip scheme in optimization and\nequivariant image denoisers, which can massively accelerate the PnP algorithms\nwith improved practicality. We provide theoretical analysis for recovery\nguarantees of instances of the proposed framework. 
Our numerical experiments on\nnatural image processing and tomographic image reconstruction demonstrate the\nremarkable effectiveness of our sketched IDR schemes.\n","authors":["Junqi Tang","Guixian Xu","Subhadip Mukherjee","Carola-Bibiane Schönlieb"],"pdf_url":"https://arxiv.org/pdf/2208.14784v2.pdf","comment":null},{"id":"http://arxiv.org/abs/1903.04797v3","updated":"2024-12-05T09:41:31Z","published":"2019-03-12T09:28:05Z","title":"Elements of Sequential Monte Carlo","summary":" A core problem in statistics and probabilistic machine learning is to compute\nprobability distributions and expectations. This is the fundamental problem of\nBayesian statistics and machine learning, which frames all inference as\nexpectations with respect to the posterior distribution. The key challenge is\nto approximate these intractable expectations. In this tutorial, we review\nsequential Monte Carlo (SMC), a random-sampling-based class of methods for\napproximate inference. First, we explain the basics of SMC, discuss practical\nissues, and review theoretical results. We then examine two of the main user\ndesign choices: the proposal distributions and the so called intermediate\ntarget distributions. We review recent results on how variational inference and\namortization can be used to learn efficient proposals and target distributions.\nNext, we discuss the SMC estimate of the normalizing constant, how this can be\nused for pseudo-marginal inference and inference evaluation. Throughout the\ntutorial we illustrate the use of SMC on various models commonly used in\nmachine learning, such as stochastic recurrent neural networks, probabilistic\ngraphical models, and probabilistic programs.\n","authors":["Christian A. Naesseth","Fredrik Lindsten","Thomas B. 
Schön"],"pdf_url":"https://arxiv.org/pdf/1903.04797v3.pdf","comment":"Foundations and Trends in Machine Learning"},{"id":"http://arxiv.org/abs/2208.04767v2","updated":"2024-12-05T09:07:23Z","published":"2022-08-09T13:23:29Z","title":"Combining Stochastic Defenses to Resist Gradient Inversion: An Ablation\n Study","summary":" Gradient Inversion (GI) attacks are a ubiquitous threat in Federated Learning\n(FL) as they exploit gradient leakage to reconstruct supposedly private\ntraining data. Common defense mechanisms such as Differential Privacy (DP) or\nstochastic Privacy Modules (PMs) introduce randomness during gradient\ncomputation to prevent such attacks. However, we pose that if an attacker\neffectively mimics a client's stochastic gradient computation, the attacker can\ncircumvent the defense and reconstruct clients' private training data. This\npaper introduces several targeted GI attacks that leverage this principle to\nbypass common defense mechanisms. As a result, we demonstrate that no\nindividual defense provides sufficient privacy protection. To address this\nissue, we propose to combine multiple defenses. We conduct an extensive\nablation study to evaluate the influence of various combinations of defenses on\nprivacy protection and model utility. We observe that only the combination of\nDP and a stochastic PM was sufficient to decrease the Attack Success Rate (ASR)\nfrom 100% to 0%, thus preserving privacy. 
Moreover, we found that this\ncombination of defenses consistently achieves the best trade-off between\nprivacy and model utility.\n","authors":["Daniel Scheliga","Patrick Mäder","Marco Seeland"],"pdf_url":"https://arxiv.org/pdf/2208.04767v2.pdf","comment":"This version represents a comprehensive rework of the initial study,\n including substantial updates to the methodology, analysis, and conclusions.\n 26 pages, 2 figures, 5 tables"}],"Multimedia":[{"id":"http://arxiv.org/abs/2412.04307v1","updated":"2024-12-05T16:26:37Z","published":"2024-12-05T16:26:37Z","title":"Feature Coding in the Era of Large Models: Dataset, Test Conditions, and\n Benchmark","summary":" Large models have achieved remarkable performance across various tasks, yet\nthey incur significant computational costs and privacy concerns during both\ntraining and inference. Distributed deployment has emerged as a potential\nsolution, but it necessitates the exchange of intermediate information between\nmodel segments, with feature representations serving as crucial information\ncarriers. To optimize information exchange, feature coding methods are applied\nto reduce transmission and storage overhead. Despite its importance, feature\ncoding for large models remains an under-explored area. In this paper, we draw\nattention to large model feature coding and make three contributions to this\nfield. First, we introduce a comprehensive dataset encompassing diverse\nfeatures generated by three representative types of large models. Second, we\nestablish unified test conditions, enabling standardized evaluation pipelines\nand fair comparisons across future feature coding studies. Third, we introduce\ntwo baseline methods derived from widely used image coding techniques and\nbenchmark their performance on the proposed dataset. These contributions aim to\nadvance the field of feature coding, facilitating more efficient large model\ndeployment. 
All source code and the dataset will be made available on GitHub.\n","authors":["Changsheng Gao","Yifan Ma","Qiaoxi Chen","Yenan Xu","Dong Liu","Weisi Lin"],"pdf_url":"https://arxiv.org/pdf/2412.04307v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.17440v2","updated":"2024-12-05T15:54:00Z","published":"2024-11-26T13:58:24Z","title":"Identity-Preserving Text-to-Video Generation by Frequency Decomposition","summary":" Identity-preserving text-to-video (IPT2V) generation aims to create\nhigh-fidelity videos with consistent human identity. It is an important task in\nvideo generation but remains an open problem for generative models. This paper\npushes the technical frontier of IPT2V in two directions that have not been\nresolved in literature: (1) A tuning-free pipeline without tedious case-by-case\nfinetuning, and (2) A frequency-aware heuristic identity-preserving DiT-based\ncontrol scheme. We propose ConsisID, a tuning-free DiT-based controllable IPT2V\nmodel to keep human identity consistent in the generated video. Inspired by\nprior findings in frequency analysis of diffusion transformers, it employs\nidentity-control signals in the frequency domain, where facial features can be\ndecomposed into low-frequency global features and high-frequency intrinsic\nfeatures. First, from a low-frequency perspective, we introduce a global facial\nextractor, which encodes reference images and facial key points into a latent\nspace, generating features enriched with low-frequency information. These\nfeatures are then integrated into shallow layers of the network to alleviate\ntraining challenges associated with DiT. Second, from a high-frequency\nperspective, we design a local facial extractor to capture high-frequency\ndetails and inject them into transformer blocks, enhancing the model's ability\nto preserve fine-grained features. 
We propose a hierarchical training strategy\nto leverage frequency information for identity preservation, transforming a\nvanilla pre-trained video generation model into an IPT2V model. Extensive\nexperiments demonstrate that our frequency-aware heuristic scheme provides an\noptimal control solution for DiT-based models. Thanks to this scheme, our\nConsisID generates high-quality, identity-preserving videos, making strides\ntowards more effective IPT2V.\n","authors":["Shenghai Yuan","Jinfa Huang","Xianyi He","Yunyuan Ge","Yujun Shi","Liuhan Chen","Jiebo Luo","Li Yuan"],"pdf_url":"https://arxiv.org/pdf/2411.17440v2.pdf","comment":"12 pages, 8 figures, Code: https://github.com/PKU-YuanGroup/ConsisID"},{"id":"http://arxiv.org/abs/2212.05005v4","updated":"2024-12-05T10:52:25Z","published":"2022-12-09T17:45:36Z","title":"Memories are One-to-Many Mapping Alleviators in Talking Face Generation","summary":" Talking face generation aims at generating photo-realistic video portraits of\na target person driven by input audio. Due to its nature of one-to-many mapping\nfrom the input audio to the output video (e.g., one speech content may have\nmultiple feasible visual appearances), learning a deterministic mapping like\nprevious works brings ambiguity during training, and thus causes inferior\nvisual results. Although this one-to-many mapping could be alleviated in part\nby a two-stage framework (i.e., an audio-to-expression model followed by a\nneural-rendering model), it is still insufficient since the prediction is\nproduced without enough information (e.g., emotions, wrinkles, etc.). In this\npaper, we propose MemFace to complement the missing information with an\nimplicit memory and an explicit memory that follow the sense of the two stages\nrespectively. 
More specifically, the implicit memory is employed in the\naudio-to-expression model to capture high-level semantics in the\naudio-expression shared space, while the explicit memory is employed in the\nneural-rendering model to help synthesize pixel-level details. Our experimental\nresults show that our proposed MemFace surpasses all the state-of-the-art\nresults across multiple scenarios consistently and significantly.\n","authors":["Anni Tang","Tianyu He","Xu Tan","Jun Ling","Li Song"],"pdf_url":"https://arxiv.org/pdf/2212.05005v4.pdf","comment":"IEEE Transactions on Pattern Analysis and Machine Intelligence\n (2024). Project page: see https://memoryface.github.io"}]}} \ No newline at end of file diff --git a/favicon.ico b/favicon.ico new file mode 100644 index 0000000000000000000000000000000000000000..7f5166c7afa0cda370aafaf91ba8d66cdeff74e5 GIT binary patch literal 15086 zcmeHO33yaRwyq9g34stc4G>5efuJa|8H50X3WB1d4n)8Z5JVvktB7$yKtv!Kb`lau zdP!$Nc9B&OSrq|+r=TLJ;K~kR2@vF;|J<9Kba%RAH(}4!8Q=syhFvI(6#Q zsX{4}Dx;b;Q+$T2oQ6t8Dy7213w{SH^#k7p^C{m4`z!S>3p8dKR#E*)@?QIEpg)}c zg{r6iyXi3T|KFt>>TsDWWe%JEGeI;m)t_ z#K0v35xKK9{I2==#4YqlUA%3Xh?MkH#4d|P#d8&Xs_&s!ylR8}jrLpHV^;bsWZSYa zH!TUx_B8jZuJHA{>W61Pj6tR~6PcNrNKa44y2nHqn=Z;*^b+JZFPz5nh)ES%qX=#z&q(>zlfC!?&@YSrplEEcre-mb8` zuXubZFMZU1X@6u`GYT;qc;sp5g4h(J-Ri$pM?zXcp{_ZWm%PAnhXUA$>^4n&yX>&2hmV*Ry0tB zDc2*Jr;1N^DfmP%o?7)3+HY@<`rp+@r%jzP%QCA_hD^#Z(KZo(JM=fG&)C8vq&l}j zJwF&~<0nw3PebMBv;9AHx_+HHM>2k2y$bcquTUQ>g6h^CsupeGV1<^;S|Zt!?1a7Z z#?J81^LLBW9d_fL={n^r&=JYsxAQ)IUfZ*@bwK9T6E8jvRhMmd?FO}(eVmu4AjvwHnm*c+SZaI_{G2dio}E$k=@I4b)RlA>{TDjCf@P9^3fXP4&AYX4kyP&|*&u zL_Z&m!0N<4WeU`#OS-PO!zb88j|}~hyr;2|GQa;071I}ibplcL)3QG6j4NJuzfD_B zC=*%^D*iPcyDJ`}Kd)TT$K}u=sD1mO_U@%EJplFd&rlaHy4N$2?^n)?N2!-$3x0El zpcL=UvTj#WH?}W2Bm5luU4G~0LM>eiHE+x9X!aoTcDDYxjsAu_3X1y$Bwy|VK(IpqYlX$aVwJaeLK?Nm$>FoH>BT1SA+z+ zU=vI)(6%2wtjj0%MZP(b^zV%uI__S*bQz3sFxr#nAAdmI&v6?@p6=F4Uu9a$c0#M_ zYms0O{AbSSdL;Y>%a9?ohv$p;r=yM;d1*uX{*gzhC!9-CPjph+lrr*t+6=DYwBtv~ 
zyR_+Lw$R}Ly?yB);gOO8)vups_cUD>9g)5^F#gq3Fz(vLe!d?nI$Cc_+LU_I&i?)M zIch^KE+zVltlyC|I^G%I@#DH)3(vSXsLPkV$8N|bbwuZ+4Uu2klyA~U=gyHYb#inm z@gHOTMt<~hZ2FpM@D?7T<1$AF4AA)*-@JVaMzK}WhO}jjts%pUoNtelKmDVdPWxH2 zUPby@A3Nh09x~3tJ0qiLUVDpO%84zIy3&TL?uk4TCquO+|J<8Kuls0W!BE?_7q^=R zR>LM4G8ykZJsq(+)^#i|_{CpsPVA>jf&X*X4WnPYb(?4W24BGk+NDH$2J`E zf`8mZuHWQ;fpoJ;{E)k7hv&^N8NXnQkB3QdfAN5etrc7{H)-GHn^uNpi|M>0e#!TL zUf<(*vuE`rpX~9(?-?@G**>`PqAzOd-d)F5zrMZ>JLy9R#yRq)UYk01*0I&Dt@{+N_~~bu_)WvlvL5G&ri-7^ z->MF^<`&@J!8Sr^LszWytV3LjOg($**cvs`*CSW_T%%0(R| z7fJph+p5D@i1_zn8yvAoUifk!XnKY+uH-m5_PtS7-tn7OM)r*E%1GHa#zNgmoALcE z#4q!>Kk2^KMLx2D%Xms3%l^vv?dd6HJdB}Qx1PEh0yX;DK9ZoqOLJHETi*8l?M+!q*#o zC6zI-cj$nK1`W|ZyFL7`-F)mM@LV85j)p*DxN;%mZkENXqy&csb zsD_E}O+J!}{T|s1i|rJAB99{pc8M?U{DQvC+q9AQaBsmro~7V_${$@ebz$tS zD0VAxxY*^fnl6;TP##r}S4F+U^$_|)D9TLDq~u-ur`m{J5sTMs zuVdouiO8@OoM~_@pI-BHF}c0L>wncBZML_?6w4IMOrMFU?NJb4z)d@OeCL^Ns63wI}D zBv}ESDELfpe#mbj`F_{^oZh>Z^W}G7ZeV`Y?soZMN5fs)bg~^4FI2?vWy3K&V>+50 zU`*>4B=PI|ujbte-Xj>la6GD=q@VASBzQbE(Mou3Mu)rQ&jISBtLywuzSE(YL* zRWHzQDO&Hv*NR_EzfbO-(Ql3ZI2xH2{XUVbt8zbPa`t3YJ<0PO*Cc+F#GZd3NgVut zNOB${@er3J{oZq9PuOfW&DRq@LnzyljUgWmDclQ1?vKPg+dRz&)b3@`ACx*V>tj&< zQh{G_jjc<}=nbyZ(YJA*)|_Wd4{~4Ik$L+3y{kb@W`7DE$|U3Y$hJq3Zlm8!pKa`- zv1q-oHJX3j0-ZkZ?7)E-Ue)=q) z1McS8?eE9wOY)5BEYBN$`Hivk&!HwUHvc$vb{y}ht!;-idz!|3=!&7Jc7pgyNTLgZ zL&}654a0gZHLP z#fT3_pz0|%<5&U~4Z|;Ccy3ZZ)RDUXWm zuW1r%$T_63N0z;(oDoczz`f8=o@319zR0eVoPCet-frwzJsvA%1%sSn_Upy_AU<-J z`Se6X?t8y0>UX)zFl-nU@1AMxy7s@^n~*a*Gj(PbecRHl3 z)RAxUQWpboZ1o5#BsAxUFp)gfC+3&x=I=7_IiV()^N6pLIfV24e(_kM$kW2W1{+TyObG z%18SuI2`rMz##ABo2)s!CwhC^NWA&#Ye>jRK*XU4w{bZGSA|Oz&~HsYEykEkKg?pY zXufJvMiN@@Z_T@=EZKu=$l!rI`}>%4?++b|p?{Y+CP#nP?M%$mP|pP*d{r3U&v{>q zi@lfm9`4_JKPsK8gz6`1fO`u_9Kqm!&w+bjW;{vaQU+P+dv)2-r6{&=nx(CzzLj}D zrv-UL^G+<+sG)xM6 z?TJFFEmf0C{C}YwOAfki>*iPzQOx^KY$zohMj$PFW zCBJEte(!S2disF0r>^Ov+jX9@#tC1!$1G3B{B^EFJGawD(vNmcn=A5eJ=^~ChPFSD z`yJZd2HtPb^0MEMZ)+A3XVDr_*beuF%KQ@h`>O7b$(~E@NRv#Gm$SfJ`dkb8=(^#^ 
zpZemTUj}!jMxoMTuBTTxc8|>VvOmu-_V5+=E%E4@&Z01YYUJsU+fLnv2}>u?uI6Cm+L8KM zAc2-+N1Mj9EDb0eKE% zBWGqVy3+V)V + + + + MyArxiv + + + + + + + + + + + + + + + +
+
+
+
+ MyArxiv +
+
+ +
+ +
+
+
+ +
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 151 + +
+
+
+ + ☆ Stereo Anywhere: Robust Zero-Shot Deep Stereo Matching Even Where Either + Stereo or Mono Fail + + +
+ We introduce Stereo Anywhere, a novel stereo-matching framework that combines +geometric constraints with robust priors from monocular depth Vision Foundation +Models (VFMs). By elegantly coupling these complementary worlds through a +dual-branch architecture, we seamlessly integrate stereo matching with learned +contextual cues. Following this design, our framework introduces novel cost +volume fusion mechanisms that effectively handle critical challenges such as +textureless regions, occlusions, and non-Lambertian surfaces. Through our novel +optical illusion dataset, MonoTrap, and extensive evaluation across multiple +benchmarks, we demonstrate that our synthetic-only trained model achieves +state-of-the-art results in zero-shot generalization, significantly +outperforming existing solutions while showing remarkable robustness to +challenging cases such as mirrors and transparencies. + +
+
+ comment: Code: https://github.com/bartn8/stereoanywhere - Project page: + https://stereoanywhere.github.io/ +
+
+
+
+
+ + ☆ PaintScene4D: Consistent 4D Scene Generation from Text Prompts + + +
+ Recent advances in diffusion models have revolutionized 2D and 3D content +creation, yet generating photorealistic dynamic 4D scenes remains a significant +challenge. Existing dynamic 4D generation methods typically rely on distilling +knowledge from pre-trained 3D generative models, often fine-tuned on synthetic +object datasets. Consequently, the resulting scenes tend to be object-centric +and lack photorealism. While text-to-video models can generate more realistic +scenes with motion, they often struggle with spatial understanding and provide +limited control over camera viewpoints during rendering. To address these +limitations, we present PaintScene4D, a novel text-to-4D scene generation +framework that departs from conventional multi-view generative models in favor +of a streamlined architecture that harnesses video generative models trained on +diverse real-world datasets. Our method first generates a reference video using +a video generation model, and then employs a strategic camera array selection +for rendering. We apply a progressive warping and inpainting technique to +ensure both spatial and temporal consistency across multiple viewpoints. +Finally, we optimize multi-view images using a dynamic renderer, enabling +flexible camera control based on user preferences. Adopting a training-free +architecture, our PaintScene4D efficiently produces realistic 4D scenes that +can be viewed from arbitrary trajectories. The code will be made publicly +available. Our project page is at https://paintscene4d.github.io/ + +
+
+ comment: Project page: https://paintscene4d.github.io/ +
+
+
+
+
+ + ☆ Turbo3D: Ultra-fast Text-to-3D Generation + + +
+ We present Turbo3D, an ultra-fast text-to-3D system capable of generating +high-quality Gaussian splatting assets in under one second. Turbo3D employs a +rapid 4-step, 4-view diffusion generator and an efficient feed-forward Gaussian +reconstructor, both operating in latent space. The 4-step, 4-view generator is +a student model distilled through a novel Dual-Teacher approach, which +encourages the student to learn view consistency from a multi-view teacher and +photo-realism from a single-view teacher. By shifting the Gaussian +reconstructor's inputs from pixel space to latent space, we eliminate the extra +image decoding time and halve the transformer sequence length for maximum +efficiency. Our method demonstrates superior 3D generation results compared to +previous baselines, while operating in a fraction of their runtime. + +
+
+ comment: project page: https://turbo-3d.github.io/ +
+
+
+
+
+ + ☆ NVILA: Efficient Frontier Visual Language Models + + +
+ Visual language models (VLMs) have made significant advances in accuracy in +recent years. However, their efficiency has received much less attention. This +paper introduces NVILA, a family of open VLMs designed to optimize both +efficiency and accuracy. Building on top of VILA, we improve its model +architecture by first scaling up the spatial and temporal resolutions, and then +compressing visual tokens. This "scale-then-compress" approach enables NVILA to +efficiently process high-resolution images and long videos. We also conduct a +systematic investigation to enhance the efficiency of NVILA throughout its +entire lifecycle, from training and fine-tuning to deployment. NVILA matches or +surpasses the accuracy of many leading open and proprietary VLMs across a wide +range of image and video benchmarks. At the same time, it reduces training +costs by 4.5X, fine-tuning memory usage by 3.4X, pre-filling latency by +1.6-2.2X, and decoding latency by 1.2-2.8X. We will soon make our code and +models available to facilitate reproducibility. + +
+
+
+
+
+ + ☆ QUEEN: QUantized Efficient ENcoding of Dynamic Gaussians for Streaming + Free-viewpoint Videos NeurIPS 2024 + + +
+ Online free-viewpoint video (FVV) streaming is a challenging problem, which +is relatively under-explored. It requires incremental on-the-fly updates to a +volumetric representation, fast training and rendering to satisfy real-time +constraints and a small memory footprint for efficient transmission. If +achieved, it can enhance user experience by enabling novel applications, e.g., +3D video conferencing and live volumetric video broadcast, among others. In +this work, we propose a novel framework for QUantized and Efficient ENcoding +(QUEEN) for streaming FVV using 3D Gaussian Splatting (3D-GS). QUEEN directly +learns Gaussian attribute residuals between consecutive frames at each +time-step without imposing any structural constraints on them, allowing for +high quality reconstruction and generalizability. To efficiently store the +residuals, we further propose a quantization-sparsity framework, which contains +a learned latent-decoder for effectively quantizing attribute residuals other +than Gaussian positions and a learned gating module to sparsify position +residuals. We propose to use the Gaussian viewspace gradient difference vector +as a signal to separate the static and dynamic content of the scene. It acts as +a guide for effective sparsity learning and speeds up training. On diverse FVV +benchmarks, QUEEN outperforms the state-of-the-art online FVV methods on all +metrics. Notably, for several highly dynamic scenes, it reduces the model size +to just 0.7 MB per frame while training in under 5 sec and rendering at 350 +FPS. Project website is at https://research.nvidia.com/labs/amri/projects/queen + +
+
+ comment: Accepted at NeurIPS 2024, Project website: + https://research.nvidia.com/labs/amri/projects/queen +
+
+
+
+
+ + ☆ VisionZip: Longer is Better but Not Necessary in Vision Language Models + + +
+ Recent advancements in vision-language models have enhanced performance by +increasing the length of visual tokens, making them much longer than text +tokens and significantly raising computational costs. However, we observe that +the visual tokens generated by popular vision encoders, such as CLIP and +SigLIP, contain significant redundancy. To address this, we introduce +VisionZip, a simple yet effective method that selects a set of informative +tokens for input to the language model, reducing visual token redundancy and +improving efficiency while maintaining model performance. The proposed +VisionZip can be widely applied to image and video understanding tasks and is +well-suited for multi-turn dialogues in real-world scenarios, where previous +methods tend to underperform. Experimental results show that VisionZip +outperforms the previous state-of-the-art method by at least 5% performance +gains across nearly all settings. Moreover, our method significantly enhances +model inference speed, improving the prefilling time by 8x and enabling the +LLaVA-Next 13B model to infer faster than the LLaVA-Next 7B model while +achieving better results. Furthermore, we analyze the causes of this redundancy +and encourage the community to focus on extracting better visual features +rather than merely increasing token length. Our code is available at +https://github.com/dvlab-research/VisionZip . + +
+
+ comment: 2 columns, 28 pages, 15 figures, 18 tables +
+
+
+
+
+ + ☆ UnZipLoRA: Separating Content and Style from a Single Image + + +
+ This paper introduces UnZipLoRA, a method for decomposing an image into its +constituent subject and style, represented as two distinct LoRAs (Low-Rank +Adaptations). Unlike existing personalization techniques that focus on either +subject or style in isolation, or require separate training sets for each, +UnZipLoRA disentangles these elements from a single image by training both the +LoRAs simultaneously. UnZipLoRA ensures that the resulting LoRAs are +compatible, i.e., they can be seamlessly combined using direct addition. +UnZipLoRA enables independent manipulation and recontextualization of subject +and style, including generating variations of each, applying the extracted +style to new subjects, and recombining them to reconstruct the original image +or create novel variations. To address the challenge of subject and style +entanglement, UnZipLoRA employs a novel prompt separation technique, as well as +column and block separation strategies to accurately preserve the +characteristics of subject and style, and ensure compatibility between the +learned LoRAs. Evaluation with human studies and quantitative metrics +demonstrates UnZipLoRA's effectiveness compared to other state-of-the-art +methods, including DreamBooth-LoRA, Inspiration Tree, and B-LoRA. + +
+
+ comment: Project page: https://unziplora.github.io +
+
+
+
+
+ + ☆ DualPM: Dual Posed-Canonical Point Maps for 3D Shape and Pose + Reconstruction + + +
+ The choice of data representation is a key factor in the success of deep +learning in geometric tasks. For instance, DUSt3R has recently introduced the +concept of viewpoint-invariant point maps, generalizing depth prediction, and +showing that one can reduce all the key problems in the 3D reconstruction of +static scenes to predicting such point maps. In this paper, we develop an +analogous concept for a very different problem, namely, the reconstruction of +the 3D shape and pose of deformable objects. To this end, we introduce the Dual +Point Maps (DualPM), where a pair of point maps is extracted from the {same} +image, one associating pixels to their 3D locations on the object, and the +other to a canonical version of the object at rest pose. We also extend point +maps to amodal reconstruction, seeing through self-occlusions to obtain the +complete shape of the object. We show that 3D reconstruction and 3D pose +estimation reduce to the prediction of the DualPMs. We demonstrate empirically +that this representation is a good target for a deep network to predict; +specifically, we consider modeling horses, showing that DualPMs can be trained +purely on 3D synthetic data, consisting of a single model of a horse, while +generalizing very well to real images. With this, we improve by a large margin +previous methods for the 3D analysis and reconstruction of this type of +objects. + +
+
+ comment: First two authors contributed equally. Project page: + https://dualpm.github.io +
+
+
+
+
+ + ☆ MegaSaM: Accurate, Fast, and Robust Structure and Motion from Casual + Dynamic Videos + + +
+ We present a system that allows for accurate, fast, and robust estimation of +camera parameters and depth maps from casual monocular videos of dynamic +scenes. Most conventional structure from motion and monocular SLAM techniques +assume input videos that feature predominantly static scenes with large amounts +of parallax. Such methods tend to produce erroneous estimates in the absence of +these conditions. Recent neural network-based approaches attempt to overcome +these challenges; however, such methods are either computationally expensive or +brittle when run on dynamic videos with uncontrolled camera motion or unknown +field of view. We demonstrate the surprising effectiveness of a deep visual +SLAM framework: with careful modifications to its training and inference +schemes, this system can scale to real-world videos of complex dynamic scenes +with unconstrained camera paths, including videos with little camera parallax. +Extensive experiments on both synthetic and real videos demonstrate that our +system is significantly more accurate and robust at camera pose and depth +estimation when compared with prior and concurrent work, with faster or +comparable running times. See interactive results on our project page: +https://mega-sam.github.io/ + +
+
+
+
+
+ + ☆ 4Real-Video: Learning Generalizable Photo-Realistic 4D Video Diffusion + + +
+ We propose 4Real-Video, a novel framework for generating 4D videos, organized +as a grid of video frames with both time and viewpoint axes. In this grid, each +row contains frames sharing the same timestep, while each column contains +frames from the same viewpoint. We propose a novel two-stream architecture. One +stream performs viewpoint updates on columns, and the other stream performs +temporal updates on rows. After each diffusion transformer layer, a +synchronization layer exchanges information between the two token streams. We +propose two implementations of the synchronization layer, using either hard or +soft synchronization. This feedforward architecture improves upon previous work +in three ways: higher inference speed, enhanced visual quality (measured by +FVD, CLIP, and VideoScore), and improved temporal and viewpoint consistency +(measured by VideoScore and Dust3R-Confidence). + +
+
+ comment: Project page: https://snap-research.github.io/4Real-Video/ +
+
+
+
+
+ + ☆ LayerFusion: Harmonized Multi-Layer Text-to-Image Generation with + Generative Priors + + +
+ Large-scale diffusion models have achieved remarkable success in generating +high-quality images from textual descriptions, gaining popularity across +various applications. However, the generation of layered content, such as +transparent images with foreground and background layers, remains an +under-explored area. Layered content generation is crucial for creative +workflows in fields like graphic design, animation, and digital art, where +layer-based approaches are fundamental for flexible editing and composition. In +this paper, we propose a novel image generation pipeline based on Latent +Diffusion Models (LDMs) that generates images with two layers: a foreground +layer (RGBA) with transparency information and a background layer (RGB). Unlike +existing methods that generate these layers sequentially, our approach +introduces a harmonized generation mechanism that enables dynamic interactions +between the layers for more coherent outputs. We demonstrate the effectiveness +of our method through extensive qualitative and quantitative experiments, +showing significant improvements in visual coherence, image quality, and layer +consistency compared to baseline methods. + +
+
+ comment: Project page: https://layerfusion.github.io +
+
+
+
+
+ + ☆ Sparse Voxels Rasterization: Real-time High-fidelity Radiance Field + Rendering + + +
+ We propose an efficient radiance field rendering algorithm that incorporates +a rasterization process on sparse voxels without neural networks or 3D +Gaussians. There are two key contributions coupled with the proposed system. +The first is to render sparse voxels in the correct depth order along pixel +rays by using dynamic Morton ordering. This avoids the well-known popping +artifact found in Gaussian splatting. Second, we adaptively fit sparse voxels +to different levels of detail within scenes, faithfully reproducing scene +details while achieving high rendering frame rates. Our method improves the +previous neural-free voxel grid representation by over 4db PSNR and more than +10x rendering FPS speedup, achieving state-of-the-art comparable novel-view +synthesis results. Additionally, our neural-free sparse voxels are seamlessly +compatible with grid-based 3D processing algorithms. We achieve promising mesh +reconstruction accuracy by integrating TSDF-Fusion and Marching Cubes into our +sparse grid system. + +
+
+ comment: Code release in progress +
+
+
+
+
+ + ☆ Cubify Anything: Scaling Indoor 3D Object Detection + + +
+ We consider indoor 3D object detection with respect to a single RGB(-D) frame +acquired from a commodity handheld device. We seek to significantly advance the +status quo with respect to both data and modeling. First, we establish that +existing datasets have significant limitations to scale, accuracy, and +diversity of objects. As a result, we introduce the Cubify-Anything 1M (CA-1M) +dataset, which exhaustively labels over 400K 3D objects on over 1K highly +accurate laser-scanned scenes with near-perfect registration to over 3.5K +handheld, egocentric captures. Next, we establish Cubify Transformer (CuTR), a +fully Transformer 3D object detection baseline which rather than operating in +3D on point or voxel-based representations, predicts 3D boxes directly from 2D +features derived from RGB(-D) inputs. While this approach lacks any 3D +inductive biases, we show that paired with CA-1M, CuTR outperforms point-based +methods - accurately recalling over 62% of objects in 3D, and is significantly +more capable at handling noise and uncertainty present in commodity +LiDAR-derived depth maps while also providing promising RGB only performance +without architecture changes. Furthermore, by pre-training on CA-1M, CuTR can +outperform point-based methods on a more diverse variant of SUN RGB-D - +supporting the notion that while inductive biases in 3D are useful at the +smaller sizes of existing datasets, they fail to scale to the data-rich regime +of CA-1M. Overall, this dataset and baseline model provide strong evidence that +we are moving towards models which can effectively Cubify Anything. + +
+
+
+
+
+ + ☆ Monocular Dynamic Gaussian Splatting is Fast and Brittle but Smooth + Motion Helps + + +
+ Gaussian splatting methods are emerging as a popular approach for converting +multi-view image data into scene representations that allow view synthesis. In +particular, there is interest in enabling view synthesis for dynamic scenes +using only monocular input data -- an ill-posed and challenging problem. The +fast pace of work in this area has produced multiple simultaneous papers that +claim to work best, which cannot all be true. In this work, we organize, +benchmark, and analyze many Gaussian-splatting-based methods, providing +apples-to-apples comparisons that prior works have lacked. We use multiple +existing datasets and a new instructive synthetic dataset designed to isolate +factors that affect reconstruction quality. We systematically categorize +Gaussian splatting methods into specific motion representation types and +quantify how their differences impact performance. Empirically, we find that +their rank order is well-defined in synthetic data, but the complexity of +real-world data currently overwhelms the differences. Furthermore, the fast +rendering speed of all Gaussian-based methods comes at the cost of brittleness +in optimization. We summarize our experiments into a list of findings that can +help to further progress in this lively problem setting. Project Webpage: +https://lynl7130.github.io/MonoDyGauBench.github.io/ + +
+
+ comment: 37 pages, 39 figures, 9 tables +
+
+
+
+
+ + ☆ HeatFormer: A Neural Optimizer for Multiview Human Mesh Recovery + + +
+ We introduce a novel method for human shape and pose recovery that can fully +leverage multiple static views. We target fixed-multiview people monitoring, +including elderly care and safety monitoring, in which calibrated cameras can +be installed at the corners of a room or an open space but whose configuration +may vary depending on the environment. Our key idea is to formulate it as +neural optimization. We achieve this with HeatFormer, a neural optimizer that +iteratively refines the SMPL parameters given multiview images, which is +fundamentally agnostic to the configuration of views. HeatFormer realizes this +SMPL parameter estimation as heat map generation and alignment with a novel +transformer encoder and decoder. We demonstrate the effectiveness of HeatFormer +including its accuracy, robustness to occlusion, and generalizability through +an extensive set of experiments. We believe HeatFormer can serve a key role in +passive human behavior modeling. + +
+
+
+
+
+ + ☆ Code-as-Monitor: Constraint-aware Visual Programming for Reactive and + Proactive Robotic Failure Detection + + +
+ Automatic detection and prevention of open-set failures are crucial in +closed-loop robotic systems. Recent studies often struggle to simultaneously +identify unexpected failures reactively after they occur and prevent +foreseeable ones proactively. To this end, we propose Code-as-Monitor (CaM), a +novel paradigm leveraging the vision-language model (VLM) for both open-set +reactive and proactive failure detection. The core of our method is to +formulate both tasks as a unified set of spatio-temporal constraint +satisfaction problems and use VLM-generated code to evaluate them for real-time +monitoring. To enhance the accuracy and efficiency of monitoring, we further +introduce constraint elements that abstract constraint-related entities or +their parts into compact geometric elements. This approach offers greater +generality, simplifies tracking, and facilitates constraint-aware visual +programming by leveraging these elements as visual prompts. Experiments show +that CaM achieves a 28.7% higher success rate and reduces execution time by +31.8% under severe disturbances compared to baselines across three simulators +and a real-world setting. Moreover, CaM can be integrated with open-loop +control policies to form closed-loop systems, enabling long-horizon tasks in +cluttered scenes with dynamic environments. + +
+
+ comment: Project page: https://zhoues.github.io/Code-as-Monitor/ +
+
+
+
+
+ + ☆ Four-Plane Factorized Video Autoencoders + + +
+ Latent variable generative models have emerged as powerful tools for +generative tasks including image and video synthesis. These models are enabled +by pretrained autoencoders that map high resolution data into a compressed +lower dimensional latent space, where the generative models can subsequently be +developed while requiring fewer computational resources. Despite their +effectiveness, the direct application of latent variable models to higher +dimensional domains such as videos continues to pose challenges for efficient +training and inference. In this paper, we propose an autoencoder that projects +volumetric data onto a four-plane factorized latent space that grows +sublinearly with the input size, making it ideal for higher dimensional data +like videos. The design of our factorized model supports straightforward +adoption in a number of conditional generation tasks with latent diffusion +models (LDMs), such as class-conditional generation, frame prediction, and +video interpolation. Our results show that the proposed four-plane latent space +retains a rich representation needed for high-fidelity reconstructions despite +the heavy compression, while simultaneously enabling LDMs to operate with +significant improvements in speed and memory. + +
+
+
+
+
+ + ☆ NaVILA: Legged Robot Vision-Language-Action Model for Navigation + + +
+ This paper proposes to solve the problem of Vision-and-Language Navigation +with legged robots, which not only provides a flexible way for humans to +command but also allows the robot to navigate through more challenging and +cluttered scenes. However, it is non-trivial to translate human language +instructions all the way to low-level leg joint actions. We propose NaVILA, a +2-level framework that unifies a Vision-Language-Action model (VLA) with +locomotion skills. Instead of directly predicting low-level actions from VLA, +NaVILA first generates mid-level actions with spatial information in the form +of language, (e.g., "moving forward 75cm"), which serves as an input for a +visual locomotion RL policy for execution. NaVILA substantially improves +previous approaches on existing benchmarks. The same advantages are +demonstrated in our newly developed benchmarks with IsaacLab, featuring more +realistic scenes, low-level controls, and real-world robot experiments. We show +more results at https://navila-bot.github.io/ + +
+
+ comment: Website: https://navila-bot.github.io/ +
+
+
+
+
+ + ☆ p-MoD: Building Mixture-of-Depths MLLMs via Progressive Ratio Decay + + +
+ Despite the remarkable performance of multimodal large language models +(MLLMs) across diverse tasks, the substantial training and inference costs +impede their advancement. The majority of computation stems from the +overwhelming volume of vision tokens processed by the transformer decoder. In +this paper, we propose to build efficient MLLMs by leveraging the +Mixture-of-Depths (MoD) mechanism, where each transformer decoder layer selects +essential vision tokens to process while skipping redundant ones. However, +integrating MoD into MLLMs is non-trivial. To address the challenges of +training and inference stability as well as limited training data, we adapt the +MoD module with two novel designs: tanh-gated weight normalization (TanhNorm) +and symmetric token reweighting (STRing). Moreover, we observe that vision +tokens exhibit higher redundancy in deeper layers and thus design a progressive +ratio decay (PRD) strategy, which gradually reduces the token retention ratio +layer by layer, employing a shifted cosine schedule. This crucial design fully +unleashes the potential of MoD, significantly boosting the efficiency and +performance of our models. To validate the effectiveness of our approach, we +conduct extensive experiments with two baseline models across 14 benchmarks. +Our model, p-MoD, matches or even surpasses the performance of the baseline +models, with only 55.6% TFLOPs and 53.8% KV cache storage during inference, and +77.7% GPU hours during training. + +
+
+ comment: Technical Report; Code released at https://github.com/MCG-NJU/p-MoD +
+
+
+
+
+ + ☆ MEMO: Memory-Guided Diffusion for Expressive Talking Video Generation + + +
+ Recent advances in video diffusion models have unlocked new potential for +realistic audio-driven talking video generation. However, achieving seamless +audio-lip synchronization, maintaining long-term identity consistency, and +producing natural, audio-aligned expressions in generated talking videos remain +significant challenges. To address these challenges, we propose Memory-guided +EMOtion-aware diffusion (MEMO), an end-to-end audio-driven portrait animation +approach to generate identity-consistent and expressive talking videos. Our +approach is built around two key modules: (1) a memory-guided temporal module, +which enhances long-term identity consistency and motion smoothness by +developing memory states to store information from a longer past context to +guide temporal modeling via linear attention; and (2) an emotion-aware audio +module, which replaces traditional cross attention with multi-modal attention +to enhance audio-video interaction, while detecting emotions from audio to +refine facial expressions via emotion adaptive layer norm. Extensive +quantitative and qualitative results demonstrate that MEMO generates more +realistic talking videos across diverse image and audio types, outperforming +state-of-the-art methods in overall quality, audio-lip synchronization, +identity consistency, and expression-emotion alignment. + +
+
+ comment: Project Page: https://memoavatar.github.io +
+
+
+
+
+ + ☆ EgoPlan-Bench2: A Benchmark for Multimodal Large Language Model Planning + in Real-World Scenarios + + +
+ The advent of Multimodal Large Language Models, leveraging the power of Large +Language Models, has recently demonstrated superior multimodal understanding +and reasoning abilities, heralding a new era for artificial general +intelligence. However, achieving AGI necessitates more than just comprehension +and reasoning. A crucial capability required is effective planning in diverse +scenarios, which involves making reasonable decisions based on complex +environments to solve real-world problems. Despite its importance, the planning +abilities of current MLLMs in varied scenarios remain underexplored. In this +paper, we introduce EgoPlan-Bench2, a rigorous and comprehensive benchmark +designed to assess the planning capabilities of MLLMs across a wide range of +real-world scenarios. EgoPlan-Bench2 encompasses everyday tasks spanning 4 +major domains and 24 detailed scenarios, closely aligned with human daily life. +EgoPlan-Bench2 is constructed through a semi-automatic process utilizing +egocentric videos, complemented by manual verification. Grounded in a +first-person perspective, it mirrors the way humans approach problem-solving in +everyday life. We evaluate 21 competitive MLLMs and provide an in-depth +analysis of their limitations, revealing that they face significant challenges +in real-world planning. To further improve the planning proficiency of current +MLLMs, we propose a training-free approach using multimodal Chain-of-Thought +(CoT) prompting through investigating the effectiveness of various multimodal +prompts in complex planning. Our approach enhances the performance of GPT-4V by +10.24 on EgoPlan-Bench2 without additional training. Our work not only sheds +light on the current limitations of MLLMs in planning, but also provides +insights for future enhancements in this critical area. We have made data and +code available at https://qiulu66.github.io/egoplanbench2/. + +
+
+ comment: Code & data are available at: + https://qiulu66.github.io/egoplanbench2/ +
+
+
+
+
+ + ☆ DiCoDe: Diffusion-Compressed Deep Tokens for Autoregressive Video + Generation with Language Models + + +
+ Videos are inherently temporal sequences by their very nature. In this work, +we explore the potential of modeling videos in a chronological and scalable +manner with autoregressive (AR) language models, inspired by their success in +natural language processing. We introduce DiCoDe, a novel approach that +leverages Diffusion-Compressed Deep Tokens to generate videos with a language +model in an autoregressive manner. Unlike existing methods that employ +low-level representations with limited compression rates, DiCoDe utilizes deep +tokens with a considerable compression rate (a 1000x reduction in token count). +This significant compression is made possible by a tokenizer trained through +leveraging the prior knowledge of video diffusion models. Deep tokens enable +DiCoDe to employ vanilla AR language models for video generation, akin to +translating one visual "language" into another. By treating videos as temporal +sequences, DiCoDe fully harnesses the capabilities of language models for +autoregressive generation. DiCoDe is scalable using readily available AR +architectures, and is capable of generating videos ranging from a few seconds +to one minute using only 4 A100 GPUs for training. We evaluate DiCoDe both +quantitatively and qualitatively, demonstrating that it performs comparably to +existing methods in terms of quality while ensuring efficient training. To +showcase its scalability, we release a series of DiCoDe configurations with +varying parameter sizes and observe a consistent improvement in performance as +the model size increases from 100M to 3B. We believe that DiCoDe's exploration +in academia represents a promising initial step toward scalable video modeling +with AR language models, paving the way for the development of larger and more +powerful video generation models. + +
+
+ comment: Project Page: https://liyz15.github.io/DiCoDe +
+
+
+
+
+ + ☆ Moto: Latent Motion Token as the Bridging Language for Robot + Manipulation + + +
+ Recent developments in Large Language Models pre-trained on extensive corpora +have shown significant success in various natural language processing tasks +with minimal fine-tuning. This success offers new promise for robotics, which +has long been constrained by the high cost of action-labeled data. We ask: +given the abundant video data containing interaction-related knowledge +available as a rich "corpus", can a similar generative pre-training approach be +effectively applied to enhance robot learning? The key challenge is to identify +an effective representation for autoregressive pre-training that benefits robot +manipulation tasks. Inspired by the way humans learn new skills through +observing dynamic environments, we propose that effective robotic learning +should emphasize motion-related knowledge, which is closely tied to low-level +actions and is hardware-agnostic, facilitating the transfer of learned motions +to actual robot actions. To this end, we introduce Moto, which converts video +content into latent Motion Token sequences by a Latent Motion Tokenizer, +learning a bridging "language" of motion from videos in an unsupervised manner. +We pre-train Moto-GPT through motion token autoregression, enabling it to +capture diverse visual motion knowledge. After pre-training, Moto-GPT +demonstrates the promising ability to produce semantically interpretable motion +tokens, predict plausible motion trajectories, and assess trajectory +rationality through output likelihood. To transfer learned motion priors to +real robot actions, we implement a co-fine-tuning strategy that seamlessly +bridges latent motion token prediction and real robot control. Extensive +experiments show that the fine-tuned Moto-GPT exhibits superior robustness and +efficiency on robot manipulation benchmarks, underscoring its effectiveness in +transferring knowledge from video data to downstream visual manipulation tasks. + +
+
+ comment: Project released at: https://chenyi99.github.io/moto/ +
+
+
+
+
+ + ☆ Learning Artistic Signatures: Symmetry Discovery and Style Transfer + + +
+ Despite nearly a decade of literature on style transfer, there is no +undisputed definition of artistic style. State-of-the-art models produce +impressive results but are difficult to interpret since, without a coherent +definition of style, the problem of style transfer is inherently ill-posed. +Early work framed style-transfer as an optimization problem but treated style +as a measure only of texture. This led to artifacts in the outputs of early +models where content features from the style image sometimes bled into the +output image. Conversely, more recent work with diffusion models offers +compelling empirical results but provides little theoretical grounding. To +address these issues, we propose an alternative definition of artistic style. +We suggest that style should be thought of as a set of global symmetries that +dictate the arrangement of local textures. We validate this perspective +empirically by learning the symmetries of a large dataset of paintings and +showing that symmetries are predictive of the artistic movement to which each +painting belongs. Finally, we show that by considering both local and global +features, using both Lie generators and traditional measures of texture, we can +quantitatively capture the stylistic similarity between artists better than +with either set of features alone. This approach not only aligns well with art +historians' consensus but also offers a robust framework for distinguishing +nuanced stylistic differences, allowing for a more interpretable, theoretically +grounded approach to style transfer. + +
+
+
+
+
+ + ☆ GenMAC: Compositional Text-to-Video Generation with Multi-Agent + Collaboration + + +
+ Text-to-video generation models have shown significant progress in the recent +years. However, they still struggle with generating complex dynamic scenes +based on compositional text prompts, such as attribute binding for multiple +objects, temporal dynamics associated with different objects, and interactions +between objects. Our key motivation is that complex tasks can be decomposed +into simpler ones, each handled by a role-specialized MLLM agent. Multiple +agents can collaborate together to achieve collective intelligence for complex +goals. We propose GenMAC, an iterative, multi-agent framework that enables +compositional text-to-video generation. The collaborative workflow includes +three stages: Design, Generation, and Redesign, with an iterative loop between +the Generation and Redesign stages to progressively verify and refine the +generated videos. The Redesign stage is the most challenging stage that aims to +verify the generated videos, suggest corrections, and redesign the text +prompts, frame-wise layouts, and guidance scales for the next iteration of +generation. To avoid hallucination of a single MLLM agent, we decompose this +stage to four sequentially-executed MLLM-based agents: verification agent, +suggestion agent, correction agent, and output structuring agent. Furthermore, +to tackle diverse scenarios of compositional text-to-video generation, we +design a self-routing mechanism to adaptively select the proper correction +agent from a collection of correction agents each specialized for one scenario. +Extensive experiments demonstrate the effectiveness of GenMAC, achieving +state-of-the art performance in compositional text-to-video generation. + +
+
+ comment: Project website: https://karine-h.github.io/GenMAC/ +
+
+
+
+
+ + ☆ Towards Real-Time Open-Vocabulary Video Instance Segmentation + + +
+ In this paper, we address the challenge of performing open-vocabulary video
+instance segmentation (OV-VIS) in real-time. We analyze the computational
+bottlenecks of state-of-the-art foundation models that perform OV-VIS, and
+propose a new method, TROY-VIS, that significantly improves processing speed
+while maintaining high accuracy. We introduce three key techniques: (1)
+Decoupled Attention Feature Enhancer to speed up information interaction
+between different modalities and scales; (2) Flash Embedding Memory for
+obtaining fast text embeddings of object categories; and, (3) Kernel
+Interpolation for exploiting the temporal continuity in videos. Our experiments
+demonstrate that TROY-VIS achieves the best trade-off between accuracy and
+speed on two large-scale OV-VIS benchmarks, BURST and LV-VIS, running 20x
+faster than GLEE-Lite (25 FPS vs. 1.25 FPS) with comparable or even better
+accuracy. These results demonstrate TROY-VIS's potential for real-time
+applications in dynamic environments such as mobile robotics and augmented
+reality. Code and model will be released at
+https://github.com/google-research/troyvis.
+
+
+
+
+
+ + ☆ PBDyG: Position Based Dynamic Gaussians for Motion-Aware Clothed Human + Avatars + + +
+ This paper introduces a novel clothed human model that can be learned from +multiview RGB videos, with a particular emphasis on recovering physically +accurate body and cloth movements. Our method, Position Based Dynamic Gaussians +(PBDyG), realizes ``movement-dependent'' cloth deformation via physical +simulation, rather than merely relying on ``pose-dependent'' rigid +transformations. We model the clothed human holistically but with two distinct +physical entities in contact: clothing modeled as 3D Gaussians, which are +attached to a skinned SMPL body that follows the movement of the person in the +input videos. The articulation of the SMPL body also drives physically-based +simulation of the clothes' Gaussians to transform the avatar to novel poses. In +order to run position based dynamics simulation, physical properties including +mass and material stiffness are estimated from the RGB videos through Dynamic +3D Gaussian Splatting. Experiments demonstrate that our method not only +accurately reproduces appearance but also enables the reconstruction of avatars +wearing highly deformable garments, such as skirts or coats, which have been +challenging to reconstruct using existing methods. + +
+
+
+
+
+ + ☆ Divot: Diffusion Powers Video Tokenizer for Comprehension and Generation + + +
+ In recent years, there has been a significant surge of interest in unifying +image comprehension and generation within Large Language Models (LLMs). This +growing interest has prompted us to explore extending this unification to +videos. The core challenge lies in developing a versatile video tokenizer that +captures both the spatial characteristics and temporal dynamics of videos to +obtain representations for LLMs, and the representations can be further decoded +into realistic video clips to enable video generation. In this work, we +introduce Divot, a Diffusion-Powered Video Tokenizer, which leverages the +diffusion process for self-supervised video representation learning. We posit +that if a video diffusion model can effectively de-noise video clips by taking +the features of a video tokenizer as the condition, then the tokenizer has +successfully captured robust spatial and temporal information. Additionally, +the video diffusion model inherently functions as a de-tokenizer, decoding +videos from their representations. Building upon the Divot tokenizer, we +present Divot-Vicuna through video-to-text autoregression and text-to-video +generation by modeling the distributions of continuous-valued Divot features +with a Gaussian Mixture Model. Experimental results demonstrate that our +diffusion-based video tokenizer, when integrated with a pre-trained LLM, +achieves competitive performance across various video comprehension and +generation benchmarks. The instruction tuned Divot-Vicuna also excels in video +storytelling, generating interleaved narratives and corresponding videos. + +
+
+ comment: Project released at: https://github.com/TencentARC/Divot +
+
+
+
+
+ + ☆ Infinity: Scaling Bitwise AutoRegressive Modeling for High-Resolution + Image Synthesis + + +
+ We present Infinity, a Bitwise Visual AutoRegressive Modeling capable of +generating high-resolution, photorealistic images following language +instruction. Infinity redefines visual autoregressive model under a bitwise +token prediction framework with an infinite-vocabulary tokenizer & classifier +and bitwise self-correction mechanism, remarkably improving the generation +capacity and details. By theoretically scaling the tokenizer vocabulary size to +infinity and concurrently scaling the transformer size, our method +significantly unleashes powerful scaling capabilities compared to vanilla VAR. +Infinity sets a new record for autoregressive text-to-image models, +outperforming top-tier diffusion models like SD3-Medium and SDXL. Notably, +Infinity surpasses SD3-Medium by improving the GenEval benchmark score from +0.62 to 0.73 and the ImageReward benchmark score from 0.87 to 0.96, achieving a +win rate of 66%. Without extra optimization, Infinity generates a high-quality +1024x1024 image in 0.8 seconds, making it 2.6x faster than SD3-Medium and +establishing it as the fastest text-to-image model. Models and codes will be +released to promote further exploration of Infinity for visual generation and +unified tokenizer modeling. + +
+
+ comment: 17 pages, 14 figures +
+
+
+
+
+ + ☆ Grounding Descriptions in Images informs Zero-Shot Visual Recognition + + +
+ Vision-language models (VLMs) like CLIP have been cherished for their ability +to perform zero-shot visual recognition on open-vocabulary concepts. This is +achieved by selecting the object category whose textual representation bears +the highest similarity with the query image. While successful in some domains, +this method struggles with identifying fine-grained entities as well as +generalizing to unseen concepts that are not captured by the training +distribution. Recent works attempt to mitigate these challenges by integrating +category descriptions at test time, albeit yielding modest improvements. We +attribute these limited gains to a fundamental misalignment between image and +description representations, which is rooted in the pretraining structure of +CLIP. In this paper, we propose GRAIN, a new pretraining strategy aimed at +aligning representations at both fine and coarse levels simultaneously. Our +approach learns to jointly ground textual descriptions in image regions along +with aligning overarching captions with global image representations. To drive +this pre-training, we leverage frozen Multimodal Large Language Models (MLLMs) +to derive large-scale synthetic annotations. We demonstrate the enhanced +zero-shot performance of our model compared to current state-of-the art methods +across 11 diverse image classification datasets. Additionally, we introduce +Products-2023, a newly curated, manually labeled dataset featuring novel +concepts, and showcase our model's ability to recognize these concepts by +benchmarking on it. Significant improvements achieved by our model on other +downstream tasks like retrieval further highlight the superior quality of +representations learned by our approach. Code available at +https://github.com/shaunak27/grain-clip . + +
+
+
+
+
+ + ☆ Florence-VL: Enhancing Vision-Language Models with Generative Vision + Encoder and Depth-Breadth Fusion + + +
+ We present Florence-VL, a new family of multimodal large language models
+(MLLMs) with enriched visual representations produced by Florence-2, a
+generative vision foundation model. Unlike the widely used CLIP-style vision
+transformer trained by contrastive learning, Florence-2 can capture different
+levels and aspects of visual features, which are more versatile to be adapted
+to diverse downstream tasks. We propose a novel feature-fusion architecture and
+an innovative training recipe that effectively integrates Florence-2's visual
+features into pretrained LLMs, such as Phi 3.5 and LLama 3. In particular, we
+propose "depth-breadth fusion (DBFusion)" to fuse the visual features extracted
+from different depths and under multiple prompts. Our model training is
+composed of end-to-end pretraining of the whole model followed by finetuning of
+the projection layer and the LLM, on a carefully designed recipe of diverse
+open-source datasets that include high-quality image captions and
+instruction-tuning pairs. Our quantitative analysis and visualization of
+Florence-VL's visual features show its advantages over popular vision encoders
+on vision-language alignment, where the enriched depth and breadth play
+important roles. Florence-VL achieves significant improvements over existing
+state-of-the-art MLLMs across various multi-modal and vision-centric benchmarks
+covering general VQA, perception, hallucination, OCR, Chart,
+knowledge-intensive understanding, etc. To facilitate future research, our
+models and the complete training recipe are open-sourced.
+https://github.com/JiuhaiChen/Florence-VL
+
+
+
+
+
+ + ☆ FedDUAL: A Dual-Strategy with Adaptive Loss and Dynamic Aggregation for + Mitigating Data Heterogeneity in Federated Learning + + +
+ Federated Learning (FL) marks a transformative approach to distributed model +training by combining locally optimized models from various clients into a +unified global model. While FL preserves data privacy by eliminating +centralized storage, it encounters significant challenges such as performance +degradation, slower convergence, and reduced robustness of the global model due +to the heterogeneity in client data distributions. Among the various forms of +data heterogeneity, label skew emerges as a particularly formidable and +prevalent issue, especially in domains such as image classification. To address +these challenges, we begin with comprehensive experiments to pinpoint the +underlying issues in the FL training process. Based on our findings, we then +introduce an innovative dual-strategy approach designed to effectively resolve +these issues. First, we introduce an adaptive loss function for client-side +training, meticulously crafted to preserve previously acquired knowledge while +maintaining an optimal equilibrium between local optimization and global model +coherence. Secondly, we develop a dynamic aggregation strategy for aggregating +client models at the server. This approach adapts to each client's unique +learning patterns, effectively addressing the challenges of diverse data across +the network. Our comprehensive evaluation, conducted across three diverse +real-world datasets, coupled with theoretical convergence guarantees, +demonstrates the superior efficacy of our method compared to several +established state-of-the-art approaches. + +
+
+
+
+
+ + ☆ Probabilistic Gaussian Superposition for Efficient 3D Occupancy + Prediction + + +
+ 3D semantic occupancy prediction is an important task for robust +vision-centric autonomous driving, which predicts fine-grained geometry and +semantics of the surrounding scene. Most existing methods leverage dense +grid-based scene representations, overlooking the spatial sparsity of the +driving scenes. Although 3D semantic Gaussian serves as an object-centric +sparse alternative, most of the Gaussians still describe the empty region with +low efficiency. To address this, we propose a probabilistic Gaussian +superposition model which interprets each Gaussian as a probability +distribution of its neighborhood being occupied and conforms to probabilistic +multiplication to derive the overall geometry. Furthermore, we adopt the exact +Gaussian mixture model for semantics calculation to avoid unnecessary +overlapping of Gaussians. To effectively initialize Gaussians in non-empty +region, we design a distribution-based initialization module which learns the +pixel-aligned occupancy distribution instead of the depth of surfaces. We +conduct extensive experiments on nuScenes and KITTI-360 datasets and our +GaussianFormer-2 achieves state-of-the-art performance with high efficiency. +Code: https://github.com/huang-yh/GaussianFormer. + +
+
+ comment: Code is available at: https://github.com/huang-yh/GaussianFormer +
+
+
+
+
+ + ☆ SeeGround: See and Ground for Zero-Shot Open-Vocabulary 3D Visual + Grounding + + +
+ 3D Visual Grounding (3DVG) aims to locate objects in 3D scenes based on +textual descriptions, which is essential for applications like augmented +reality and robotics. Traditional 3DVG approaches rely on annotated 3D datasets +and predefined object categories, limiting scalability and adaptability. To +overcome these limitations, we introduce SeeGround, a zero-shot 3DVG framework +leveraging 2D Vision-Language Models (VLMs) trained on large-scale 2D data. We +propose to represent 3D scenes as a hybrid of query-aligned rendered images and +spatially enriched text descriptions, bridging the gap between 3D data and +2D-VLMs input formats. We propose two modules: the Perspective Adaptation +Module, which dynamically selects viewpoints for query-relevant image +rendering, and the Fusion Alignment Module, which integrates 2D images with 3D +spatial descriptions to enhance object localization. Extensive experiments on +ScanRefer and Nr3D demonstrate that our approach outperforms existing zero-shot +methods by large margins. Notably, we exceed weakly supervised methods and +rival some fully supervised ones, outperforming previous SOTA by 7.7% on +ScanRefer and 7.1% on Nr3D, showcasing its effectiveness. + +
+
+ comment: Preprint; 19 pages, 10 figures, 9 tables; Project Page at + https://seeground.github.io/ +
+
+
+
+
+ + ☆ EmbodiedOcc: Embodied 3D Occupancy Prediction for Vision-based Online + Scene Understanding + + +
+ 3D occupancy prediction provides a comprehensive description of the +surrounding scenes and has become an essential task for 3D perception. Most +existing methods focus on offline perception from one or a few views and cannot +be applied to embodied agents which demands to gradually perceive the scene +through progressive embodied exploration. In this paper, we formulate an +embodied 3D occupancy prediction task to target this practical scenario and +propose a Gaussian-based EmbodiedOcc framework to accomplish it. We initialize +the global scene with uniform 3D semantic Gaussians and progressively update +local regions observed by the embodied agent. For each update, we extract +semantic and structural features from the observed image and efficiently +incorporate them via deformable cross-attention to refine the regional +Gaussians. Finally, we employ Gaussian-to-voxel splatting to obtain the global +3D occupancy from the updated 3D Gaussians. Our EmbodiedOcc assumes an unknown +(i.e., uniformly distributed) environment and maintains an explicit global +memory of it with 3D Gaussians. It gradually gains knowledge through local +refinement of regional Gaussians, which is consistent with how humans +understand new scenes through embodied exploration. We reorganize an +EmbodiedOcc-ScanNet benchmark based on local annotations to facilitate the +evaluation of the embodied 3D occupancy prediction task. Experiments +demonstrate that our EmbodiedOcc outperforms existing local prediction methods +and accomplishes the embodied occupancy prediction with high accuracy and +strong expandability. Our code is available at: +https://github.com/YkiWu/EmbodiedOcc. + +
+
+ comment: Code: https://github.com/YkiWu/EmbodiedOcc +
+
+
+
+
+ + ☆ Discriminative Fine-tuning of LVLMs + + +
+ Contrastively-trained Vision-Language Models (VLMs) like CLIP have become the +de facto approach for discriminative vision-language representation learning. +However, these models have limited language understanding, often exhibiting a +"bag of words" behavior. At the same time, Large Vision-Language Models +(LVLMs), which combine vision encoders with LLMs, have been shown capable of +detailed vision-language reasoning, yet their autoregressive nature renders +them less suitable for discriminative tasks. + In this work, we propose to combine "the best of both worlds": a new training +approach for discriminative fine-tuning of LVLMs that results in strong +discriminative and compositional capabilities. Essentially, our approach +converts a generative LVLM into a discriminative one, unlocking its capability +for powerful image-text discrimination combined with enhanced language +understanding. + Our contributions include: (1) A carefully designed training/optimization +framework that utilizes image-text pairs of variable length and granularity for +training the model with both contrastive and next-token prediction losses. This +is accompanied by ablation studies that justify the necessity of our +framework's components. (2) A parameter-efficient adaptation method using a +combination of soft prompting and LoRA adapters. (3) Significant improvements +over state-of-the-art CLIP-like models of similar size, including standard +image-text retrieval benchmarks and notable gains in compositionality. + +
+
+ comment: Preprint. The first two authors contributed equally +
+
+
+
+
+ + ☆ A Hitchhiker's Guide to Understanding Performances of Two-Class + Classifiers + + +
+ Properly understanding the performances of classifiers is essential in +various scenarios. However, the literature often relies only on one or two +standard scores to compare classifiers, which fails to capture the nuances of +application-specific requirements, potentially leading to suboptimal classifier +selection. Recently, a paper on the foundations of the theory of +performance-based ranking introduced a tool, called the Tile, that organizes an +infinity of ranking scores into a 2D map. Thanks to the Tile, it is now +possible to evaluate and compare classifiers efficiently, displaying all +possible application-specific preferences instead of having to rely on a pair +of scores. In this paper, we provide a first hitchhiker's guide for +understanding the performances of two-class classifiers by presenting four +scenarios, each showcasing a different user profile: a theoretical analyst, a +method designer, a benchmarker, and an application developer. Particularly, we +show that we can provide different interpretative flavors that are adapted to +the user's needs by mapping different values on the Tile. As an illustration, +we leverage the newly introduced Tile tool and the different flavors to rank +and analyze the performances of 74 state-of-the-art semantic segmentation +models in two-class classification through the eyes of the four user profiles. +Through these user profiles, we demonstrate that the Tile effectively captures +the behavior of classifiers in a single visualization, while accommodating an +infinite number of ranking scores. + +
+
+
+
+
+ + ☆ ActFusion: a Unified Diffusion Model for Action Segmentation and + Anticipation NeurIPS 2024 + + +
+ Temporal action segmentation and long-term action anticipation are two +popular vision tasks for the temporal analysis of actions in videos. Despite +apparent relevance and potential complementarity, these two problems have been +investigated as separate and distinct tasks. In this work, we tackle these two +problems, action segmentation and action anticipation, jointly using a unified +diffusion model dubbed ActFusion. The key idea to unification is to train the +model to effectively handle both visible and invisible parts of the sequence in +an integrated manner; the visible part is for temporal segmentation, and the +invisible part is for future anticipation. To this end, we introduce a new +anticipative masking strategy during training in which a late part of the video +frames is masked as invisible, and learnable tokens replace these frames to +learn to predict the invisible future. Experimental results demonstrate the +bi-directional benefits between action segmentation and anticipation. ActFusion +achieves the state-of-the-art performance across the standard benchmarks of 50 +Salads, Breakfast, and GTEA, outperforming task-specific models in both of the +two tasks with a single unified model through joint learning. + +
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ☆ RMD: A Simple Baseline for More General Human Motion Generation via + Training-free Retrieval-Augmented Motion Diffuse + + +
+ While motion generation has made substantial progress, its practical +application remains constrained by dataset diversity and scale, limiting its +ability to handle out-of-distribution scenarios. To address this, we propose a +simple and effective baseline, RMD, which enhances the generalization of motion +generation through retrieval-augmented techniques. Unlike previous +retrieval-based methods, RMD requires no additional training and offers three +key advantages: (1) the external retrieval database can be flexibly replaced; +(2) body parts from the motion database can be reused, with an LLM facilitating +splitting and recombination; and (3) a pre-trained motion diffusion model +serves as a prior to improve the quality of motions obtained through retrieval +and direct combination. Without any training, RMD achieves state-of-the-art +performance, with notable advantages on out-of-distribution data. + +
+
+
+
+
+ + ☆ Likelihood-Scheduled Score-Based Generative Modeling for Fully 3D PET + Image Reconstruction + + +
+ Medical image reconstruction with pre-trained score-based generative models +(SGMs) has advantages over other existing state-of-the-art deep-learned +reconstruction methods, including improved resilience to different scanner +setups and advanced image distribution modeling. SGM-based reconstruction has +recently been applied to simulated positron emission tomography (PET) datasets, +showing improved contrast recovery for out-of-distribution lesions relative to +the state-of-the-art. However, existing methods for SGM-based reconstruction +from PET data suffer from slow reconstruction, burdensome hyperparameter tuning +and slice inconsistency effects (in 3D). In this work, we propose a practical +methodology for fully 3D reconstruction that accelerates reconstruction and +reduces the number of critical hyperparameters by matching the likelihood of an +SGM's reverse diffusion process to a current iterate of the maximum-likelihood +expectation maximization algorithm. Using the example of low-count +reconstruction from simulated $[^{18}$F]DPA-714 datasets, we show our +methodology can match or improve on the NRMSE and SSIM of existing +state-of-the-art SGM-based PET reconstruction while reducing reconstruction +time and the need for hyperparameter tuning. We evaluate our methodology +against state-of-the-art supervised and conventional reconstruction algorithms. +Finally, we demonstrate a first-ever implementation of SGM-based reconstruction +for real 3D PET data, specifically $[^{18}$F]DPA-714 data, where we integrate +perpendicular pre-trained SGMs to eliminate slice inconsistency issues. + +
+
+ comment: 11 pages, 12 figures. Submitted to Transactions on Medical Imaging +
+
+
+
+
+ + ☆ Reflective Teacher: Semi-Supervised Multimodal 3D Object Detection in + Bird's-Eye-View via Uncertainty Measure + + +
+ Applying pseudo labeling techniques has been found to be advantageous in +semi-supervised 3D object detection (SSOD) in Bird's-Eye-View (BEV) for +autonomous driving, particularly where labeled data is limited. In the +literature, Exponential Moving Average (EMA) has been used for adjustments of +the weights of teacher network by the student network. However, the same +induces catastrophic forgetting in the teacher network. In this work, we +address this issue by introducing a novel concept of Reflective Teacher where +the student is trained by both labeled and pseudo labeled data while its +knowledge is progressively passed to the teacher through a regularizer to +ensure retention of previous knowledge. Additionally, we propose Geometry Aware +BEV Fusion (GA-BEVFusion) for efficient alignment of multi-modal BEV features, +thus reducing the disparity between the modalities - camera and LiDAR. This +helps to map the precise geometric information embedded among LiDAR points +reliably with the spatial priors for extraction of semantic information from +camera images. Our experiments on the nuScenes and Waymo datasets demonstrate: +1) improved performance over state-of-the-art methods in both fully supervised +and semi-supervised settings; 2) Reflective Teacher achieves equivalent +performance with only 25% and 22% of labeled data for nuScenes and Waymo +datasets respectively, in contrast to other fully supervised methods that +utilize the full labeled dataset. + +
+
+
+
+
+ + ☆ Liquid: Language Models are Scalable Multi-modal Generators + + +
+ We present Liquid, an auto-regressive generation paradigm that seamlessly +integrates visual comprehension and generation by tokenizing images into +discrete codes and learning these code embeddings alongside text tokens within +a shared feature space for both vision and language. Unlike previous multimodal +large language model (MLLM), Liquid achieves this integration using a single +large language model (LLM), eliminating the need for external pretrained visual +embeddings such as CLIP. For the first time, Liquid uncovers a scaling law that +performance drop unavoidably brought by the unified training of visual and +language tasks diminishes as the model size increases. Furthermore, the unified +token space enables visual generation and comprehension tasks to mutually +enhance each other, effectively removing the typical interference seen in +earlier models. We show that existing LLMs can serve as strong foundations for +Liquid, saving 100x in training costs while outperforming Chameleon in +multimodal capabilities and maintaining language performance comparable to +mainstream LLMs like LLAMA2. Liquid also outperforms models like SD v2.1 and +SD-XL (FID of 5.47 on MJHQ-30K), excelling in both vision-language and +text-only tasks. This work demonstrates that LLMs such as LLAMA3.2 and GEMMA2 +are powerful multimodal generators, offering a scalable solution for enhancing +both vision-language understanding and generation. The code and models will be +released. + +
+
+ comment: Technical report. Will be updated soon +
+
+
+
+
+ + ☆ Multi-Subject Image Synthesis as a Generative Prior for Single-Subject + PET Image Reconstruction + + +
+ Large high-quality medical image datasets are difficult to acquire but +necessary for many deep learning applications. For positron emission tomography +(PET), reconstructed image quality is limited by inherent Poisson noise. We +propose a novel method for synthesising diverse and realistic pseudo-PET images +with improved signal-to-noise ratio. We also show how our pseudo-PET images may +be exploited as a generative prior for single-subject PET image reconstruction. +Firstly, we perform deep-learned deformable registration of multi-subject +magnetic resonance (MR) images paired to multi-subject PET images. We then use +the anatomically-learned deformation fields to transform multiple PET images to +the same reference space, before averaging random subsets of the transformed +multi-subject data to form a large number of varying pseudo-PET images. We +observe that using MR information for registration imbues the resulting +pseudo-PET images with improved anatomical detail compared to the originals. We +consider applications to PET image reconstruction, by generating pseudo-PET +images in the same space as the intended single-subject reconstruction and +using them as training data for a diffusion model-based reconstruction method. +We show visual improvement and reduced background noise in our 2D +reconstructions as compared to OSEM, MAP-EM and an existing state-of-the-art +diffusion model-based approach. Our method shows the potential for utilising +highly subject-specific prior information within a generative reconstruction +framework. Future work may compare the benefits of our approach to explicitly +MR-guided reconstruction methodologies. + +
+
+ comment: 2 pages, 3 figures. Accepted as a poster presentation at IEEE NSS MIC + RTSD 2024 (submitted May 2024; accepted July 2024; presented Nov 2024) +
+
+
+
+
+ + ☆ Generative-Model-Based Fully 3D PET Image Reconstruction by Conditional + Diffusion Sampling + + +
+ Score-based generative models (SGMs) have recently shown promising results +for image reconstruction on simulated positron emission tomography (PET) +datasets. In this work we have developed and implemented practical methodology +for 3D image reconstruction with SGMs, and perform (to our knowledge) the first +SGM-based reconstruction of real fully 3D PET data. We train an SGM on +full-count reference brain images, and extend methodology to allow SGM-based +reconstructions at very low counts (1% of original, to simulate low-dose or +short-duration scanning). We then perform reconstructions for multiple +independent realisations of 1% count data, allowing us to analyse the bias and +variance characteristics of the method. We sample from the learned posterior +distribution of the generative algorithm to calculate uncertainty images for +our reconstructions. We evaluate the method's performance on real full- and +low-count PET data and compare with conventional OSEM and MAP-EM baselines, +showing that our SGM-based low-count reconstructions match full-dose +reconstructions more closely and in a bias-variance trade-off comparison, our +SGM-reconstructed images have lower variance than existing baselines. Future +work will compare to supervised deep-learned methods, with other avenues for +investigation including how data conditioning affects the SGM's posterior +distribution and the algorithm's performance with different tracers. + +
+
+ comment: 2 pages, 2 figures. Accepted for oral presentation at IEEE NSS MIC + RTSD 2024 (submitted May 2024; accepted July 2024; presented Nov 2024) +
+
+
+
+
+ + ☆ FlashSloth: Lightning Multimodal Large Language Models via Embedded + Visual Compression + + +
+ Despite a big leap forward in capability, multimodal large language models
+(MLLMs) tend to behave like a sloth in practical use, i.e., slow response and
+large latency. Recent efforts are devoted to building tiny MLLMs for better
+efficiency, but the plethora of visual tokens still used limits their actual
+speedup. In this paper, we propose a powerful and fast tiny MLLM called
+FlashSloth. Different from previous efforts, FlashSloth focuses on improving
+the descriptive power of visual tokens in the process of compressing their
+redundant semantics. In particular, FlashSloth introduces embedded visual
+compression designs to capture both visually salient and instruction-related
+image information, so as to achieve superior multimodal performance with
+fewer visual tokens. Extensive experiments are conducted to validate the
+proposed FlashSloth, and a bunch of tiny but strong MLLMs are also
+comprehensively compared, e.g., InternVL2, MiniCPM-V2 and Qwen2-VL. The
+experimental results show that compared with these advanced tiny MLLMs, our
+FlashSloth can greatly reduce the number of visual tokens, training memory and
+computation complexity while retaining high performance on various VL tasks.

+
+
+
+
+
+ + ☆ LocalSR: Image Super-Resolution in Local Region + + +
+ Standard single-image super-resolution (SR) upsamples and restores entire
+images. Yet several real-world applications require higher resolutions only in
+specific regions, such as license plates or faces, making the super-resolution
+of the entire image, along with the associated memory and computational cost,
+unnecessary. We propose a novel task, called LocalSR, to restore only local
+regions of the low-resolution image. For this problem setting, we propose a
+context-based local super-resolution (CLSR) to super-resolve only specified
+regions of interest (ROI) while leveraging the entire image as context. Our
+method uses three parallel processing modules: a base module for
+super-resolving the ROI, a global context module for gathering helpful features
+from across the image, and a proximity integration module for concentrating on
+areas surrounding the ROI, progressively propagating features from distant
+pixels to the target region. Experimental results indicate that our approach,
+with its reduced complexity, outperforms variants that focus exclusively on
+the ROI.

+
+
+
+
+
+ + ☆ The Tile: A 2D Map of Ranking Scores for Two-Class Classification + + +
+ In the computer vision and machine learning communities, as well as in many +other research domains, rigorous evaluation of any new method, including +classifiers, is essential. One key component of the evaluation process is the +ability to compare and rank methods. However, ranking classifiers and +accurately comparing their performances, especially when taking +application-specific preferences into account, remains challenging. For +instance, commonly used evaluation tools like Receiver Operating Characteristic +(ROC) and Precision/Recall (PR) spaces display performances based on two +scores. Hence, they are inherently limited in their ability to compare +classifiers across a broader range of scores and lack the capability to +establish a clear ranking among classifiers. In this paper, we present a novel +versatile tool, named the Tile, that organizes an infinity of ranking scores in +a single 2D map for two-class classifiers, including common evaluation scores +such as the accuracy, the true positive rate, the positive predictive value, +Jaccard's coefficient, and all F-beta scores. Furthermore, we study the +properties of the underlying ranking scores, such as the influence of the +priors or the correspondences with the ROC space, and depict how to +characterize any other score by comparing them to the Tile. Overall, we +demonstrate that the Tile is a powerful tool that effectively captures all the +rankings in a single visualization and allows interpreting them. + +
+
+
+
+
+ + ☆ Towards Zero-shot 3D Anomaly Localization WACV 2025 + + +
+ 3D anomaly detection and localization is of great significance for industrial +inspection. Prior 3D anomaly detection and localization methods focus on the +setting that the testing data share the same category as the training data +which is normal. However, in real-world applications, the normal training data +for the target 3D objects can be unavailable due to issues like data privacy or +export control regulation. To tackle these challenges, we identify a new task +-- zero-shot 3D anomaly detection and localization, where the training and +testing classes do not overlap. To this end, we design 3DzAL, a novel +patch-level contrastive learning framework based on pseudo anomalies generated +using the inductive bias from task-irrelevant 3D xyz data to learn more +representative feature representations. Furthermore, we train a normalcy +classifier network to classify the normal patches and pseudo anomalies and +utilize the classification result jointly with feature distance to design +anomaly scores. Instead of directly using the patch point clouds, we introduce +adversarial perturbations to the input patch xyz data before feeding into the +3D normalcy classifier for the classification-based anomaly score. We show that +3DzAL outperforms the state-of-the-art anomaly detection and localization +performance. + +
+
+ comment: This paper is accepted to WACV 2025 +
+
+
+
+
+ + ☆ SwiftEdit: Lightning Fast Text-Guided Image Editing via One-Step + Diffusion + + +
+ Recent advances in text-guided image editing enable users to perform image
+edits through simple text inputs, leveraging the extensive priors of multi-step
+diffusion-based text-to-image models. However, these methods often fall short
+of the speed demands required for real-world and on-device applications due to
+the costly multi-step inversion and sampling process involved. In response to
+this, we introduce SwiftEdit, a simple yet highly efficient editing tool that
+achieves instant text-guided image editing (in 0.23s). The advancement of
+SwiftEdit lies in its two novel contributions: a one-step inversion framework
+that enables one-step image reconstruction via inversion and a mask-guided
+editing technique with our proposed attention rescaling mechanism to perform
+localized image editing. Extensive experiments are provided to demonstrate the
+effectiveness and efficiency of SwiftEdit. In particular, SwiftEdit enables
+instant text-guided image editing, which is extremely faster than previous
+multi-step methods (at least 50 times faster) while maintaining a competitive
+performance in editing results. Our project page is at:
+https://swift-edit.github.io/

+
+
+ comment: 16 pages, 15 figures +
+
+
+
+
+ + ☆ T2I-FactualBench: Benchmarking the Factuality of Text-to-Image Models + with Knowledge-Intensive Concepts + + +
+ Evaluating the quality of synthesized images remains a significant challenge +in the development of text-to-image (T2I) generation. Most existing studies in +this area primarily focus on evaluating text-image alignment, image quality, +and object composition capabilities, with comparatively fewer studies +addressing the evaluation of the factuality of T2I models, particularly when +the concepts involved are knowledge-intensive. To mitigate this gap, we present +T2I-FactualBench in this work - the largest benchmark to date in terms of the +number of concepts and prompts specifically designed to evaluate the factuality +of knowledge-intensive concept generation. T2I-FactualBench consists of a +three-tiered knowledge-intensive text-to-image generation framework, ranging +from the basic memorization of individual knowledge concepts to the more +complex composition of multiple knowledge concepts. We further introduce a +multi-round visual question answering (VQA) based evaluation framework to +assess the factuality of three-tiered knowledge-intensive text-to-image +generation tasks. Experiments on T2I-FactualBench indicate that current +state-of-the-art (SOTA) T2I models still leave significant room for +improvement. + +
+
+
+
+
+ + ☆ Structure-Aware Stylized Image Synthesis for Robust Medical Image + Segmentation + + +
+ Accurate medical image segmentation is essential for effective diagnosis and +treatment planning but is often challenged by domain shifts caused by +variations in imaging devices, acquisition conditions, and patient-specific +attributes. Traditional domain generalization methods typically require +inclusion of parts of the test domain within the training set, which is not +always feasible in clinical settings with limited diverse data. Additionally, +although diffusion models have demonstrated strong capabilities in image +generation and style transfer, they often fail to preserve the critical +structural information necessary for precise medical analysis. To address these +issues, we propose a novel medical image segmentation method that combines +diffusion models and Structure-Preserving Network for structure-aware one-shot +image stylization. Our approach effectively mitigates domain shifts by +transforming images from various sources into a consistent style while +maintaining the location, size, and shape of lesions. This ensures robust and +accurate segmentation even when the target domain is absent from the training +data. Experimental evaluations on colonoscopy polyp segmentation and skin +lesion segmentation datasets show that our method enhances the robustness and +accuracy of segmentation models, achieving superior performance metrics +compared to baseline models without style transfer. This structure-aware +stylization framework offers a practical solution for improving medical image +segmentation across diverse domains, facilitating more reliable clinical +diagnoses. + +
+
+
+
+
+ + ☆ SIDA: Social Media Image Deepfake Detection, Localization and + Explanation with Large Multimodal Model + + +
+ The rapid advancement of generative models in creating highly realistic +images poses substantial risks for misinformation dissemination. For instance, +a synthetic image, when shared on social media, can mislead extensive audiences +and erode trust in digital content, resulting in severe repercussions. Despite +some progress, academia has not yet created a large and diversified deepfake +detection dataset for social media, nor has it devised an effective solution to +address this issue. In this paper, we introduce the Social media Image +Detection dataSet (SID-Set), which offers three key advantages: (1) extensive +volume, featuring 300K AI-generated/tampered and authentic images with +comprehensive annotations, (2) broad diversity, encompassing fully synthetic +and tampered images across various classes, and (3) elevated realism, with +images that are predominantly indistinguishable from genuine ones through mere +visual inspection. Furthermore, leveraging the exceptional capabilities of +large multimodal models, we propose a new image deepfake detection, +localization, and explanation framework, named SIDA (Social media Image +Detection, localization, and explanation Assistant). SIDA not only discerns the +authenticity of images, but also delineates tampered regions through mask +prediction and provides textual explanations of the model's judgment criteria. +Compared with state-of-the-art deepfake detection models on SID-Set and other +benchmarks, extensive experiments demonstrate that SIDA achieves superior +performance among diversified settings. The code, model, and dataset will be +released. + +
+
+
+
+
+ + ☆ Learnable Infinite Taylor Gaussian for Dynamic View Rendering + + +
+ Capturing the temporal evolution of Gaussian properties such as position, +rotation, and scale is a challenging task due to the vast number of +time-varying parameters and the limited photometric data available, which +generally results in convergence issues, making it difficult to find an optimal +solution. While feeding all inputs into an end-to-end neural network can +effectively model complex temporal dynamics, this approach lacks explicit +supervision and struggles to generate high-quality transformation fields. On +the other hand, using time-conditioned polynomial functions to model Gaussian +trajectories and orientations provides a more explicit and interpretable +solution, but requires significant handcrafted effort and lacks +generalizability across diverse scenes. To overcome these limitations, this +paper introduces a novel approach based on a learnable infinite Taylor Formula +to model the temporal evolution of Gaussians. This method offers both the +flexibility of an implicit network-based approach and the interpretability of +explicit polynomial functions, allowing for more robust and generalizable +modeling of Gaussian dynamics across various dynamic scenes. Extensive +experiments on dynamic novel view rendering tasks are conducted on public +datasets, demonstrating that the proposed method achieves state-of-the-art +performance in this domain. More information is available on our project +page(https://ellisonking.github.io/TaylorGaussian). + +
+
+
+
+
+ + ☆ HumanEdit: A High-Quality Human-Rewarded Dataset for Instruction-based + Image Editing + + +
+ We present HumanEdit, a high-quality, human-rewarded dataset specifically +designed for instruction-guided image editing, enabling precise and diverse +image manipulations through open-form language instructions. Previous +large-scale editing datasets often incorporate minimal human feedback, leading +to challenges in aligning datasets with human preferences. HumanEdit bridges +this gap by employing human annotators to construct data pairs and +administrators to provide feedback. With meticulously curation, HumanEdit +comprises 5,751 images and requires more than 2,500 hours of human effort +across four stages, ensuring both accuracy and reliability for a wide range of +image editing tasks. The dataset includes six distinct types of editing +instructions: Action, Add, Counting, Relation, Remove, and Replace, +encompassing a broad spectrum of real-world scenarios. All images in the +dataset are accompanied by masks, and for a subset of the data, we ensure that +the instructions are sufficiently detailed to support mask-free editing. +Furthermore, HumanEdit offers comprehensive diversity and high-resolution $1024 +\times 1024$ content sourced from various domains, setting a new versatile +benchmark for instructional image editing datasets. With the aim of advancing +future research and establishing evaluation benchmarks in the field of image +editing, we release HumanEdit at +\url{https://huggingface.co/datasets/BryanW/HumanEdit}. + +
+
+ comment: Codes and Supplementary Material: https://github.com/viiika/HumanEdit +
+
+
+
+
+ + ☆ Targeted Hard Sample Synthesis Based on Estimated Pose and Occlusion + Error for Improved Object Pose Estimation + + +
+ 6D Object pose estimation is a fundamental component in robotics enabling +efficient interaction with the environment. It is particularly challenging in +bin-picking applications, where objects may be textureless and in difficult +poses, and occlusion between objects of the same type may cause confusion even +in well-trained models. We propose a novel method of hard example synthesis +that is model-agnostic, using existing simulators and the modeling of pose +error in both the camera-to-object viewsphere and occlusion space. Through +evaluation of the model performance with respect to the distribution of object +poses and occlusions, we discover regions of high error and generate realistic +training samples to specifically target these regions. With our training +approach, we demonstrate an improvement in correct detection rate of up to 20% +across several ROBI-dataset objects using state-of-the-art pose estimation +models. + +
+
+ comment: To be published in IEEE Robotics and Automation Letters (RA-L) +
+
+
+
+
+ + ☆ Reinforcement Learning from Wild Animal Videos + + +
+ We propose to learn legged robot locomotion skills by watching thousands of +wild animal videos from the internet, such as those featured in nature +documentaries. Indeed, such videos offer a rich and diverse collection of +plausible motion examples, which could inform how robots should move. To +achieve this, we introduce Reinforcement Learning from Wild Animal Videos +(RLWAV), a method to ground these motions into physical robots. We first train +a video classifier on a large-scale animal video dataset to recognize actions +from RGB clips of animals in their natural habitats. We then train a +multi-skill policy to control a robot in a physics simulator, using the +classification score of a third-person camera capturing videos of the robot's +movements as a reward for reinforcement learning. Finally, we directly transfer +the learned policy to a real quadruped Solo. Remarkably, despite the extreme +gap in both domain and embodiment between animals in the wild and robots, our +approach enables the policy to learn diverse skills such as walking, jumping, +and keeping still, without relying on reference trajectories nor skill-specific +rewards. + +
+
+ comment: Project website: https://elliotchanesane31.github.io/RLWAV/ +
+
+
+
+
+ + ☆ Enhancing Whole Slide Image Classification through Supervised + Contrastive Domain Adaptation + + +
+ Domain shift in the field of histopathological imaging is a common phenomenon +due to the intra- and inter-hospital variability of staining and digitization +protocols. The implementation of robust models, capable of creating generalized +domains, represents a need to be solved. In this work, a new domain adaptation +method to deal with the variability between histopathological images from +multiple centers is presented. In particular, our method adds a training +constraint to the supervised contrastive learning approach to achieve domain +adaptation and improve inter-class separability. Experiments performed on +domain adaptation and classification of whole-slide images of six skin cancer +subtypes from two centers demonstrate the method's usefulness. The results +reflect superior performance compared to not using domain adaptation after +feature extraction or staining normalization. + +
+
+ comment: Accepted in CASEIB 2024 +
+
+
+
+
+ + ☆ 3D Part Segmentation via Geometric Aggregation of 2D Visual Features + + +
+ Supervised 3D part segmentation models are tailored for a fixed set of +objects and parts, limiting their transferability to open-set, real-world +scenarios. Recent works have explored vision-language models (VLMs) as a +promising alternative, using multi-view rendering and textual prompting to +identify object parts. However, naively applying VLMs in this context +introduces several drawbacks, such as the need for meticulous prompt +engineering, and fails to leverage the 3D geometric structure of objects. To +address these limitations, we propose COPS, a COmprehensive model for Parts +Segmentation that blends the semantics extracted from visual concepts and 3D +geometry to effectively identify object parts. COPS renders a point cloud from +multiple viewpoints, extracts 2D features, projects them back to 3D, and uses a +novel geometric-aware feature aggregation procedure to ensure spatial and +semantic consistency. Finally, it clusters points into parts and labels them. +We demonstrate that COPS is efficient, scalable, and achieves zero-shot +state-of-the-art performance across five datasets, covering synthetic and +real-world data, texture-less and coloured objects, as well as rigid and +non-rigid shapes. The code is available at https://3d-cops.github.io. + +
+
+
+
+
+ + ☆ Intriguing Properties of Robust Classification + + +
+ Despite extensive research since the community learned about adversarial
+examples 10 years ago, we still do not know how to train high-accuracy
+classifiers that are guaranteed to be robust to small perturbations of their
+inputs. Previous works often argued that this might be because no classifier
+exists that is robust and accurate at the same time. However, in computer
+vision this assumption does not match reality where humans are usually accurate
+and robust on most tasks of interest. We offer an alternative explanation and
+show that in certain settings robust generalization is only possible with
+unrealistically large amounts of data. More precisely we find a setting where a
+robust classifier exists, it is easy to learn an accurate classifier, yet it
+requires an exponential amount of data to learn a robust classifier. Based on
+this theoretical result, we explore how well robust classifiers generalize on
+datasets such as CIFAR-10. We come to the conclusion that on this dataset, the
+limitation of current robust models also lies in the generalization, and that
+they require a lot of data to do well on the test set. We also show that the
+problem is not in the expressiveness or generalization capabilities of current
+architectures, and that there are low magnitude features in the data which are
+useful for non-robust generalization but are not available for robust
+classifiers.

+
+
+
+
+
+ + ☆ GigaHands: A Massive Annotated Dataset of Bimanual Hand Activities + + +
+ Understanding bimanual human hand activities is a critical problem in AI and +robotics. We cannot build large models of bimanual activities because existing +datasets lack the scale, coverage of diverse hand activities, and detailed +annotations. We introduce GigaHands, a massive annotated dataset capturing 34 +hours of bimanual hand activities from 56 subjects and 417 objects, totaling +14k motion clips derived from 183 million frames paired with 84k text +annotations. Our markerless capture setup and data acquisition protocol enable +fully automatic 3D hand and object estimation while minimizing the effort +required for text annotation. The scale and diversity of GigaHands enable broad +applications, including text-driven action synthesis, hand motion captioning, +and dynamic radiance field reconstruction. + +
+
+
+
+
+ + ☆ Quantifying the Limits of Segment Anything Model: Analyzing Challenges + in Segmenting Tree-Like and Low-Contrast Structures + + +
+ Segment Anything Model (SAM) has shown impressive performance in interactive +and zero-shot segmentation across diverse domains, suggesting that they have +learned a general concept of "objects" from their large-scale training. +However, we observed that SAM struggles with certain types of objects, +particularly those featuring dense, tree-like structures and low textural +contrast from their surroundings. These failure modes are critical for +understanding its limitations in real-world use. In order to systematically +examine this issue, we propose metrics to quantify two key object +characteristics: tree-likeness and textural separability. Through extensive +controlled synthetic experiments and testing on real datasets, we demonstrate +that SAM's performance is noticeably correlated with these factors. We link +these behaviors under the concept of "textural confusion", where SAM +misinterprets local structure as global texture, leading to over-segmentation, +or struggles to differentiate objects from similarly textured backgrounds. +These findings offer the first quantitative framework to model SAM's +challenges, providing valuable insights into its limitations and guiding future +improvements for vision foundation models. + +
+
+ comment: Code: https://github.com/mazurowski-lab/SAM-TexturalConfusion-Metrics +
+
+
+
+
+ + ☆ VASCAR: Content-Aware Layout Generation via Visual-Aware Self-Correction + + +
+ Large language models (LLMs) have proven effective for layout generation due
+to their ability to produce structure-description languages, such as HTML or
+JSON, even without access to visual information. Recently, LLM providers have
+evolved these models into large vision-language models (LVLM), which shows
+prominent multi-modal understanding capabilities. Then, how can we leverage
+this multi-modal power for layout generation? To answer this, we propose
+Visual-Aware Self-Correction LAyout GeneRation (VASCAR) for LVLM-based
+content-aware layout generation. In our method, LVLMs iteratively refine their
+outputs with reference to rendered layout images, which are visualized as
+colored bounding boxes on poster backgrounds. In experiments, we demonstrate
+the effectiveness of our method combined with Gemini. Without any additional training,
+VASCAR achieves state-of-the-art (SOTA) layout generation quality outperforming
+both existing layout-specific generative models and other LLM-based methods.

+
+
+
+
+
+ + ☆ DEIM: DETR with Improved Matching for Fast Convergence + + +
+ We introduce DEIM, an innovative and efficient training framework designed to +accelerate convergence in real-time object detection with Transformer-based +architectures (DETR). To mitigate the sparse supervision inherent in one-to-one +(O2O) matching in DETR models, DEIM employs a Dense O2O matching strategy. This +approach increases the number of positive samples per image by incorporating +additional targets, using standard data augmentation techniques. While Dense +O2O matching speeds up convergence, it also introduces numerous low-quality +matches that could affect performance. To address this, we propose the +Matchability-Aware Loss (MAL), a novel loss function that optimizes matches +across various quality levels, enhancing the effectiveness of Dense O2O. +Extensive experiments on the COCO dataset validate the efficacy of DEIM. When +integrated with RT-DETR and D-FINE, it consistently boosts performance while +reducing training time by 50%. Notably, paired with RT-DETRv2, DEIM achieves +53.2% AP in a single day of training on an NVIDIA 4090 GPU. Additionally, +DEIM-trained real-time models outperform leading real-time object detectors, +with DEIM-D-FINE-L and DEIM-D-FINE-X achieving 54.7% and 56.5% AP at 124 and 78 +FPS on an NVIDIA T4 GPU, respectively, without the need for additional data. We +believe DEIM sets a new baseline for advancements in real-time object +detection. Our code and pre-trained models are available at +https://github.com/ShihuaHuang95/DEIM. + +
+
+ comment: Exceeding all existing real-time object detectors, including YOLOv11 + and D-FINE +
+
+
+
+
+ + ☆ Foundations of the Theory of Performance-Based Ranking + + +
+ Ranking entities such as algorithms, devices, methods, or models based on +their performances, while accounting for application-specific preferences, is a +challenge. To address this challenge, we establish the foundations of a +universal theory for performance-based ranking. First, we introduce a rigorous +framework built on top of both the probability and order theories. Our new +framework encompasses the elements necessary to (1) manipulate performances as +mathematical objects, (2) express which performances are worse than or +equivalent to others, (3) model tasks through a variable called satisfaction, +(4) consider properties of the evaluation, (5) define scores, and (6) specify +application-specific preferences through a variable called importance. On top +of this framework, we propose the first axiomatic definition of performance +orderings and performance-based rankings. Then, we introduce a universal +parametric family of scores, called ranking scores, that can be used to +establish rankings satisfying our axioms, while considering +application-specific preferences. Finally, we show, in the case of two-class +classification, that the family of ranking scores encompasses well-known +performance scores, including the accuracy, the true positive rate (recall, +sensitivity), the true negative rate (specificity), the positive predictive +value (precision), and F1. However, we also show that some other scores +commonly used to compare classifiers are unsuitable to derive performance +orderings satisfying the axioms. Therefore, this paper provides the computer +vision and machine learning communities with a rigorous framework for +evaluating and ranking entities. + +
+
+
+
+
+ + ☆ Customize Segment Anything Model for Multi-Modal Semantic Segmentation + with Mixture of LoRA Experts + + +
+ The recent Segment Anything Model (SAM) represents a significant breakthrough +in scaling segmentation models, delivering strong performance across various +downstream applications in the RGB modality. However, directly applying SAM to +emerging visual modalities, such as depth and event data results in suboptimal +performance in multi-modal segmentation tasks. In this paper, we make the first +attempt to adapt SAM for multi-modal semantic segmentation by proposing a +Mixture of Low-Rank Adaptation Experts (MoE-LoRA) tailored for different input +visual modalities. By training only the MoE-LoRA layers while keeping SAM's +weights frozen, SAM's strong generalization and segmentation capabilities can +be preserved for downstream tasks. Specifically, to address cross-modal +inconsistencies, we propose a novel MoE routing strategy that adaptively +generates weighted features across modalities, enhancing multi-modal feature +integration. Additionally, we incorporate multi-scale feature extraction and +fusion by adapting SAM's segmentation head and introducing an auxiliary +segmentation head to combine multi-scale features for improved segmentation +performance effectively. Extensive experiments were conducted on three +multi-modal benchmarks: DELIVER, MUSES, and MCubeS. The results consistently +demonstrate that the proposed method significantly outperforms state-of-the-art +approaches across diverse scenarios. Notably, under the particularly +challenging condition of missing modalities, our approach exhibits a +substantial performance gain, achieving an improvement of 32.15% compared to +existing methods. + +
+
+
+
+
+ + ☆ Aligned Music Notation and Lyrics Transcription + + +
+ The digitization of vocal music scores presents unique challenges that go +beyond traditional Optical Music Recognition (OMR) and Optical Character +Recognition (OCR), as it necessitates preserving the critical alignment between +music notation and lyrics. This alignment is essential for proper +interpretation and processing in practical applications. This paper introduces +and formalizes, for the first time, the Aligned Music Notation and Lyrics +Transcription (AMNLT) challenge, which addresses the complete transcription of +vocal scores by jointly considering music symbols, lyrics, and their +synchronization. We analyze different approaches to address this challenge, +ranging from traditional divide-and-conquer methods that handle music and +lyrics separately, to novel end-to-end solutions including direct +transcription, unfolding mechanisms, and language modeling. To evaluate these +methods, we introduce four datasets of Gregorian chants, comprising both real +and synthetic sources, along with custom metrics specifically designed to +assess both transcription and alignment accuracy. Our experimental results +demonstrate that end-to-end approaches generally outperform heuristic methods +in the alignment challenge, with language models showing particular promise in +scenarios where sufficient training data is available. This work establishes +the first comprehensive framework for AMNLT, providing both theoretical +foundations and practical solutions for preserving and digitizing vocal music +heritage. + +
+
+
+
+
+ + ☆ PANGAEA: A Global and Inclusive Benchmark for Geospatial Foundation + Models + + +
+ Geospatial Foundation Models (GFMs) have emerged as powerful tools for +extracting representations from Earth observation data, but their evaluation +remains inconsistent and narrow. Existing works often evaluate on suboptimal +downstream datasets and tasks, that are often too easy or too narrow, limiting +the usefulness of the evaluations to assess the real-world applicability of +GFMs. Additionally, there is a distinct lack of diversity in current evaluation +protocols, which fail to account for the multiplicity of image resolutions, +sensor types, and temporalities, which further complicates the assessment of +GFM performance. In particular, most existing benchmarks are geographically +biased towards North America and Europe, questioning the global applicability +of GFMs. To overcome these challenges, we introduce PANGAEA, a standardized +evaluation protocol that covers a diverse set of datasets, tasks, resolutions, +sensor modalities, and temporalities. It establishes a robust and widely +applicable benchmark for GFMs. We evaluate the most popular GFMs openly +available on this benchmark and analyze their performance across several +domains. In particular, we compare these models to supervised baselines (e.g. +UNet and vanilla ViT), and assess their effectiveness when faced with limited +labeled data. Our findings highlight the limitations of GFMs, under different +scenarios, showing that they do not consistently outperform supervised models. +PANGAEA is designed to be highly extensible, allowing for the seamless +inclusion of new datasets, models, and tasks in future research. By releasing +the evaluation code and benchmark, we aim to enable other researchers to +replicate our experiments and build upon our work, fostering a more principled +evaluation protocol for large pre-trained geospatial models. The code is +available at https://github.com/VMarsocci/pangaea-bench. + +
+
+
+
+
+ + ☆ Hipandas: Hyperspectral Image Joint Denoising and Super-Resolution by + Image Fusion with the Panchromatic Image + + +
+ Hyperspectral images (HSIs) are frequently noisy and of low resolution due to +the constraints of imaging devices. Recently launched satellites can +concurrently acquire HSIs and panchromatic (PAN) images, enabling the +restoration of HSIs to generate clean and high-resolution imagery through +fusing PAN images for denoising and super-resolution. However, previous studies +treated these two tasks as independent processes, resulting in accumulated +errors. This paper introduces \textbf{H}yperspectral \textbf{I}mage Joint +\textbf{Pand}enoising \textbf{a}nd Pan\textbf{s}harpening (Hipandas), a novel +learning paradigm that reconstructs HRHS images from noisy low-resolution HSIs +(LRHS) and high-resolution PAN images. The proposed zero-shot Hipandas +framework consists of a guided denoising network, a guided super-resolution +network, and a PAN reconstruction network, utilizing an HSI low-rank prior and +a newly introduced detail-oriented low-rank prior. The interconnection of these +networks complicates the training process, necessitating a two-stage training +strategy to ensure effective training. Experimental results on both simulated +and real-world datasets indicate that the proposed method surpasses +state-of-the-art algorithms, yielding more accurate and visually pleasing HRHS +images. + +
+
+
+
+
+ + ☆ Instructional Video Generation + + +
+ Despite the recent strides in video generation, state-of-the-art methods
+still struggle with elements of visual detail. One particularly challenging
+case is the class of egocentric instructional videos in which the intricate
+motion of the hand coupled with a mostly stable and non-distracting environment
+is necessary to convey the appropriate visual action instruction. To address
+these challenges, we introduce a new method for instructional video generation.
+Our diffusion-based method incorporates two distinct innovations. First, we
+propose an automatic method to generate the expected region of motion, guided
+by both the visual context and the action text. Second, we introduce a critical
+hand structure loss to guide the diffusion model to focus on smooth and
+consistent hand poses. We evaluate our method on augmented instructional
+datasets based on EpicKitchens and Ego4D, demonstrating significant
+improvements over state-of-the-art methods in terms of instructional clarity,
+especially of the hand motion in the target region, across diverse environments
+and actions. Video results can be found on the project webpage:
+https://excitedbutter.github.io/Instructional-Video-Generation/
+
+
+
+ comment: 14 pages, 5 figures and 4 tables +
+
+
+
+
+ + ☆ Frequency-Adaptive Low-Latency Object Detection Using Events and Frames + + +
+ Fusing Events and RGB images for object detection leverages the robustness of +Event cameras in adverse environments and the rich semantic information +provided by RGB cameras. However, two critical mismatches: low-latency Events +\textit{vs.}~high-latency RGB frames; temporally sparse labels in training +\textit{vs.}~continuous flow in inference, significantly hinder the +high-frequency fusion-based object detection. To address these challenges, we +propose the \textbf{F}requency-\textbf{A}daptive Low-Latency \textbf{O}bject +\textbf{D}etector (FAOD). FAOD aligns low-frequency RGB frames with +high-frequency Events through an Align Module, which reinforces cross-modal +style and spatial proximity to address the Event-RGB Mismatch. We further +propose a training strategy, Time Shift, which enforces the module to align the +prediction from temporally shifted Event-RGB pairs and their original +representation, that is, consistent with Event-aligned annotations. This +strategy enables the network to use high-frequency Event data as the primary +reference while treating low-frequency RGB images as supplementary information, +retaining the low-latency nature of the Event stream toward high-frequency +detection. Furthermore, we observe that these corrected Event-RGB pairs +demonstrate better generalization from low training frequency to higher +inference frequencies compared to using Event data alone. Extensive experiments +on the PKU-DAVIS-SOD and DSEC-Detection datasets demonstrate that our FAOD +achieves SOTA performance. Specifically, in the PKU-DAVIS-SOD Dataset, FAOD +achieves 9.8 points improvement in terms of the mAP in fully paired Event-RGB +data with only a quarter of the parameters compared to SODFormer, and even +maintains robust performance (only a 3 points drop in mAP) under 80$\times$ +Event-RGB frequency mismatch. + +
+
+
+
+
+ + ☆ AnyDressing: Customizable Multi-Garment Virtual Dressing via Latent + Diffusion Models + + +
+ Recent advances in garment-centric image generation from text and image +prompts based on diffusion models are impressive. However, existing methods +lack support for various combinations of attire, and struggle to preserve the +garment details while maintaining faithfulness to the text prompts, limiting +their performance across diverse scenarios. In this paper, we focus on a new +task, i.e., Multi-Garment Virtual Dressing, and we propose a novel AnyDressing +method for customizing characters conditioned on any combination of garments +and any personalized text prompts. AnyDressing comprises two primary networks +named GarmentsNet and DressingNet, which are respectively dedicated to +extracting detailed clothing features and generating customized images. +Specifically, we propose an efficient and scalable module called +Garment-Specific Feature Extractor in GarmentsNet to individually encode +garment textures in parallel. This design prevents garment confusion while +ensuring network efficiency. Meanwhile, we design an adaptive +Dressing-Attention mechanism and a novel Instance-Level Garment Localization +Learning strategy in DressingNet to accurately inject multi-garment features +into their corresponding regions. This approach efficiently integrates +multi-garment texture cues into generated images and further enhances +text-image consistency. Additionally, we introduce a Garment-Enhanced Texture +Learning strategy to improve the fine-grained texture details of garments. +Thanks to our well-craft design, AnyDressing can serve as a plug-in module to +easily integrate with any community control extensions for diffusion models, +improving the diversity and controllability of synthesized images. Extensive +experiments show that AnyDressing achieves state-of-the-art results. + +
+
+ comment: Project page: https://crayon-shinchan.github.io/AnyDressing/ +
+
+
+
+
+ + ☆ Text Change Detection in Multilingual Documents Using Image Comparison + + +
+ Document comparison typically relies on optical character recognition (OCR) +as its core technology. However, OCR requires the selection of appropriate +language models for each document and the performance of multilingual or hybrid +models remains limited. To overcome these challenges, we propose text change +detection (TCD) using an image comparison model tailored for multilingual +documents. Unlike OCR-based approaches, our method employs word-level text +image-to-image comparison to detect changes. Our model generates bidirectional +change segmentation maps between the source and target documents. To enhance +performance without requiring explicit text alignment or scaling preprocessing, +we employ correlations among multi-scale attention features. We also construct +a benchmark dataset comprising actual printed and scanned word pairs in various +languages to evaluate our model. We validate our approach using our benchmark +dataset and public benchmarks Distorted Document Images and the LRDE Document +Binarization Dataset. We compare our model against state-of-the-art semantic +segmentation and change detection models, as well as to conventional OCR-based +models. + +
+
+ comment: 15 pages, 11 figures, 6 tables, WACV 2025 accepted
+
+
+
+
+
+ + ☆ Deep priors for satellite image restoration with accurate uncertainties + + +
+ Satellite optical images, upon their on-ground receipt, offer a distorted +view of the observed scene. Their restoration, classically including denoising, +deblurring, and sometimes super-resolution, is required before their +exploitation. Moreover, quantifying the uncertainty related to this restoration +could be valuable by lowering the risk of hallucination and avoiding +propagating these biases in downstream applications. Deep learning methods are +now state-of-the-art for satellite image restoration. However, they require to +train a specific network for each sensor and they do not provide the associated +uncertainties. This paper proposes a generic method involving a single network +to restore images from several sensors and a scalable way to derive the +uncertainties. We focus on deep regularization (DR) methods, which learn a deep +prior on target images before plugging it into a model-based optimization +scheme. First, we introduce VBLE-xz, which solves the inverse problem in the +latent space of a variational compressive autoencoder, estimating the +uncertainty jointly in the latent and in the image spaces. It enables scalable +posterior sampling with relevant and calibrated uncertainties. Second, we +propose the denoiser-based method SatDPIR, adapted from DPIR, which efficiently +computes accurate point estimates. We conduct a comprehensive set of +experiments on very high resolution simulated and real Pleiades images, +asserting both the performance and robustness of the proposed methods. VBLE-xz +and SatDPIR achieve state-of-the-art results compared to direct inversion +methods. In particular, VBLE-xz is a scalable method to get realistic posterior +samples and accurate uncertainties, while SatDPIR represents a compelling +alternative to direct inversion methods when uncertainty quantification is not +required. + +
+
+
+
+
+ + ☆ CrossSDF: 3D Reconstruction of Thin Structures From Cross-Sections + + +
+ Reconstructing complex structures from planar cross-sections is a challenging
+problem, with wide-reaching applications in medical imaging, manufacturing, and
+topography. Out-of-the-box point cloud reconstruction methods can often fail
+due to the data sparsity between slicing planes, while current bespoke methods
+struggle to reconstruct thin geometric structures and preserve topological
+continuity. This is important for medical applications where thin vessel
+structures are present in CT and MRI scans. This paper introduces CrossSDF, a
+novel approach for extracting a 3D signed distance field from 2D signed
+distances generated from planar contours. Our approach makes the training of
+neural SDFs contour-aware by using losses designed for the case where geometry
+is known within 2D slices. Our results demonstrate a significant improvement
+over existing methods, effectively reconstructing thin structures and producing
+accurate 3D models without the interpolation artifacts or over-smoothing of
+prior approaches.
+
+
+
+
+
+
+ + ☆ MVUDA: Unsupervised Domain Adaptation for Multi-view Pedestrian + Detection + + +
+ We address multi-view pedestrian detection in a setting where labeled data is +collected using a multi-camera setup different from the one used for testing. +While recent multi-view pedestrian detectors perform well on the camera rig +used for training, their performance declines when applied to a different +setup. To facilitate seamless deployment across varied camera rigs, we propose +an unsupervised domain adaptation (UDA) method that adapts the model to new +rigs without requiring additional labeled data. Specifically, we leverage the +mean teacher self-training framework with a novel pseudo-labeling technique +tailored to multi-view pedestrian detection. This method achieves +state-of-the-art performance on multiple benchmarks, including +MultiviewX$\rightarrow$Wildtrack. Unlike previous methods, our approach +eliminates the need for external labeled monocular datasets, thereby reducing +reliance on labeled data. Extensive evaluations demonstrate the effectiveness +of our method and validate key design choices. By enabling robust adaptation +across camera setups, our work enhances the practicality of multi-view +pedestrian detectors and establishes a strong UDA baseline for future research. + +
+
+
+
+
+ + ☆ Thermal and RGB Images Work Better Together in Wind Turbine Damage + Detection + + +
+ The inspection of wind turbine blades (WTBs) is crucial for ensuring their +structural integrity and operational efficiency. Traditional inspection methods +can be dangerous and inefficient, prompting the use of unmanned aerial vehicles +(UAVs) that access hard-to-reach areas and capture high-resolution imagery. In +this study, we address the challenge of enhancing defect detection on WTBs by +integrating thermal and RGB images obtained from UAVs. We propose a +multispectral image composition method that combines thermal and RGB imagery +through spatial coordinate transformation, key point detection, binary +descriptor creation, and weighted image overlay. Using a benchmark dataset of +WTB images annotated for defects, we evaluated several state-of-the-art object +detection models. Our results show that composite images significantly improve +defect detection efficiency. Specifically, the YOLOv8 model's accuracy +increased from 91% to 95%, precision from 89% to 94%, recall from 85% to 92%, +and F1-score from 87% to 93%. The number of false positives decreased from 6 to +3, and missed defects reduced from 5 to 2. These findings demonstrate that +integrating thermal and RGB imagery enhances defect detection on WTBs, +contributing to improved maintenance and reliability. + +
+
+ comment: Unmanned aerial vehicle, image composition, multispectral images, + green energy, data quality management, weighted overlay +
+
+
+
+
+ + ☆ Adult Glioma Segmentation in Sub-Saharan Africa using Transfer Learning + on Stratified Finetuning Data MICCAI + + +
+ Gliomas, a kind of brain tumor characterized by high mortality, present +substantial diagnostic challenges in low- and middle-income countries, +particularly in Sub-Saharan Africa. This paper introduces a novel approach to +glioma segmentation using transfer learning to address challenges in +resource-limited regions with minimal and low-quality MRI data. We leverage +pre-trained deep learning models, nnU-Net and MedNeXt, and apply a stratified +fine-tuning strategy using the BraTS2023-Adult-Glioma and BraTS-Africa +datasets. Our method exploits radiomic analysis to create stratified training +folds, model training on a large brain tumor dataset, and transfer learning to +the Sub-Saharan context. A weighted model ensembling strategy and adaptive +post-processing are employed to enhance segmentation accuracy. The evaluation +of our proposed method on unseen validation cases on the BraTS-Africa 2024 task +resulted in lesion-wise mean Dice scores of 0.870, 0.865, and 0.926, for +enhancing tumor, tumor core, and whole tumor regions and was ranked first for +the challenge. Our approach highlights the ability of integrated +machine-learning techniques to bridge the gap between the medical imaging +capabilities of resource-limited countries and established developed regions. +By tailoring our methods to a target population's specific needs and +constraints, we aim to enhance diagnostic capabilities in isolated +environments. Our findings underscore the importance of approaches like local +data integration and stratification refinement to address healthcare +disparities, ensure practical applicability, and enhance impact. + +
+
+ comment: 10 pages, 3 figures, 3 tables. This paper was accepted at + MICCAI-BraTS 2024 +
+
+
+
+
+ + ☆ D-LORD for Motion Stylization + + +
+ This paper introduces a novel framework named D-LORD (Double Latent
+Optimization for Representation Disentanglement), which is designed for motion
+stylization (motion style transfer and motion retargeting). The primary
+objective of this framework is to separate the class and content information
+from a given motion sequence using a data-driven latent optimization approach.
+Here, class refers to person-specific style, such as a particular emotion or an
+individual's identity, while content relates to the style-agnostic aspect of an
+action, such as walking or jumping, as universally understood concepts. The key
+advantage of D-LORD is its ability to perform style transfer without needing
+paired motion data. Instead, it utilizes class and content labels during the
+latent optimization process. By disentangling the representation, the framework
+enables the transformation of one motion sequence's style to another's style
+using Adaptive Instance Normalization. The proposed D-LORD framework is
+designed with a focus on generalization, allowing it to handle different class
+and content labels for various applications. Additionally, it can generate
+diverse motion sequences when specific class and content labels are provided.
+The framework's efficacy is demonstrated through experimentation on three
+datasets: the CMU XIA dataset for motion style transfer, the MHAD dataset, and
+the RRIS Ability dataset for motion retargeting. Notably, this paper presents
+the first generalized framework for motion style transfer and motion
+retargeting, showcasing its potential contributions in this area.
+
+
+
+
+
+
+ + ☆ HyperFLINT: Hypernetwork-based Flow Estimation and Temporal + Interpolation for Scientific Ensemble Visualization + + +
+ We present HyperFLINT (Hypernetwork-based FLow estimation and temporal +INTerpolation), a novel deep learning-based approach for estimating flow +fields, temporally interpolating scalar fields, and facilitating parameter +space exploration in spatio-temporal scientific ensemble data. This work +addresses the critical need to explicitly incorporate ensemble parameters into +the learning process, as traditional methods often neglect these, limiting +their ability to adapt to diverse simulation settings and provide meaningful +insights into the data dynamics. HyperFLINT introduces a hypernetwork to +account for simulation parameters, enabling it to generate accurate +interpolations and flow fields for each timestep by dynamically adapting to +varying conditions, thereby outperforming existing parameter-agnostic +approaches. The architecture features modular neural blocks with convolutional +and deconvolutional layers, supported by a hypernetwork that generates weights +for the main network, allowing the model to better capture intricate simulation +dynamics. A series of experiments demonstrates HyperFLINT's significantly +improved performance in flow field estimation and temporal interpolation, as +well as its potential in enabling parameter space exploration, offering +valuable insights into complex scientific ensembles. + +
+
+
+
+
+ + ☆ Magnetic Resonance Imaging Feature-Based Subtyping and Model Ensemble + for Enhanced Brain Tumor Segmentation MICCAI + + +
+ Accurate and automatic segmentation of brain tumors in multi-parametric +magnetic resonance imaging (mpMRI) is essential for quantitative measurements, +which play an increasingly important role in clinical diagnosis and prognosis. +The International Brain Tumor Segmentation (BraTS) Challenge 2024 offers a +unique benchmarking opportunity, including various types of brain tumors in +both adult and pediatric populations, such as pediatric brain tumors (PED), +meningiomas (MEN-RT) and brain metastases (MET), among others. Compared to +previous editions, BraTS 2024 has implemented changes to substantially increase +clinical relevance, such as refined tumor regions for evaluation. We propose a +deep learning-based ensemble approach that integrates state-of-the-art +segmentation models. Additionally, we introduce innovative, adaptive pre- and +post-processing techniques that employ MRI-based radiomic analyses to +differentiate tumor subtypes. Given the heterogeneous nature of the tumors +present in the BraTS datasets, this approach enhances the precision and +generalizability of segmentation models. On the final testing sets, our method +achieved mean lesion-wise Dice similarity coefficients of 0.926, 0.801, and +0.688 for the whole tumor in PED, MEN-RT, and MET, respectively. These results +demonstrate the effectiveness of our approach in improving segmentation +performance and generalizability for various brain tumor types. + +
+
+ comment: 11 pages, 4 figures, 3 tables. This paper was accepted at + MICCAI-BraTS 2024 +
+
+
+
+
+ + ☆ LossAgent: Towards Any Optimization Objectives for Image Processing with + LLM Agents + + +
+ We present the first loss agent, dubbed LossAgent, for low-level image
+processing tasks, e.g., image super-resolution and restoration, intending to
+achieve any customized optimization objectives of low-level image processing in
+different practical applications. Notably, not all optimization objectives,
+such as complex hand-crafted perceptual metrics, text description, and
+intricate human feedback, can be instantiated with existing low-level losses,
+e.g., MSE loss, which presents a crucial challenge in optimizing image
+processing networks in an end-to-end manner. To eliminate this, our LossAgent
+introduces the powerful large language model (LLM) as the loss agent, where the
+rich textual understanding of prior knowledge empowers the loss agent with the
+potential to understand complex optimization objectives, trajectory, and state
+feedback from external environments in the optimization process of the
+low-level image processing networks. In particular, we establish the loss
+repository by incorporating existing loss functions that support the end-to-end
+optimization for low-level image processing. Then, we design the
+optimization-oriented prompt engineering for the loss agent to actively and
+intelligently decide the compositional weights for each loss in the repository
+at each optimization interaction, thereby achieving the required optimization
+trajectory for any customized optimization objectives. Extensive experiments on
+three typical low-level image processing tasks and multiple optimization
+objectives have shown the effectiveness and applicability of our proposed
+LossAgent. Code and pre-trained models will be available at
+https://github.com/lbc12345/LossAgent.
+
+
+
+
+
+
+
+ ☆ BodyMetric: Evaluating the Realism of Human Bodies in Text-to-Image
+ Generation
+
+
+
+ Accurately generating images of human bodies from text remains a challenging +problem for state of the art text-to-image models. Commonly observed +body-related artifacts include extra or missing limbs, unrealistic poses, +blurred body parts, etc. Currently, evaluation of such artifacts relies heavily +on time-consuming human judgments, limiting the ability to benchmark models at +scale. We address this by proposing BodyMetric, a learnable metric that +predicts body realism in images. BodyMetric is trained on realism labels and +multi-modal signals including 3D body representations inferred from the input +image, and textual descriptions. In order to facilitate this approach, we +design an annotation pipeline to collect expert ratings on human body realism +leading to a new dataset for this task, namely, BodyRealism. Ablation studies +support our architectural choices for BodyMetric and the importance of +leveraging a 3D human body prior in capturing body-related artifacts in 2D +images. In comparison to concurrent metrics which evaluate general user +preference in images, BodyMetric specifically reflects body-related artifacts. +We demonstrate the utility of BodyMetric through applications that were +previously infeasible at scale. In particular, we use BodyMetric to benchmark +the generation ability of text-to-image models to produce realistic human +bodies. We also demonstrate the effectiveness of BodyMetric in ranking +generated images based on the predicted realism scores. + +
+
+
+
+
+ + ☆ Unified Framework for Open-World Compositional Zero-shot Learning + + +
+ Open-World Compositional Zero-Shot Learning (OW-CZSL) addresses the challenge +of recognizing novel compositions of known primitives and entities. Even though +prior works utilize language knowledge for recognition, such approaches exhibit +limited interactions between language-image modalities. Our approach primarily +focuses on enhancing the inter-modality interactions through fostering richer +interactions between image and textual data. Additionally, we introduce a novel +module aimed at alleviating the computational burden associated with exhaustive +exploration of all possible compositions during the inference stage. While +previous methods exclusively learn compositions jointly or independently, we +introduce an advanced hybrid procedure that leverages both learning mechanisms +to generate final predictions. Our proposed model, achieves state-of-the-art in +OW-CZSL in three datasets, while surpassing Large Vision Language Models (LLVM) +in two datasets. + +
+
+
+
+
+ + ☆ SoRA: Singular Value Decomposed Low-Rank Adaptation for Domain + Generalizable Representation Learning + + +
+ Domain generalization (DG) aims to adapt a model using one or multiple source +domains to ensure robust performance in unseen target domains. Recently, +Parameter-Efficient Fine-Tuning (PEFT) of foundation models has shown promising +results in the context of DG problem. Nevertheless, existing PEFT methods still +struggle to strike a balance between preserving generalizable components of the +pre-trained model and learning task-specific features. To gain insights into +the distribution of generalizable components, we begin by analyzing the +pre-trained weights through the lens of singular value decomposition. Building +on these insights, we introduce Singular Value Decomposed Low-Rank Adaptation +(SoRA), an approach that selectively tunes minor singular components while +keeping the residual parts frozen. SoRA effectively retains the generalization +ability of the pre-trained model while efficiently acquiring task-specific +skills. Furthermore, we freeze domain-generalizable blocks and employ an +annealing weight decay strategy, thereby achieving an optimal balance in the +delicate trade-off between generalizability and discriminability. SoRA attains +state-of-the-art results on multiple benchmarks that span both domain +generalized semantic segmentation to domain generalized object detection. In +addition, our methods introduce no additional inference overhead or +regularization loss, maintain compatibility with any backbone or head, and are +designed to be versatile, allowing easy integration into a wide range of tasks. + +
+
+ comment: Project page: https://ysj9909.github.io/SoRA.github.io/ +
+
+
+
+
+ + ☆ TransAdapter: Vision Transformer for Feature-Centric Unsupervised Domain + Adaptation + + +
+ Unsupervised Domain Adaptation (UDA) aims to utilize labeled data from a +source domain to solve tasks in an unlabeled target domain, often hindered by +significant domain gaps. Traditional CNN-based methods struggle to fully +capture complex domain relationships, motivating the shift to vision +transformers like the Swin Transformer, which excel in modeling both local and +global dependencies. In this work, we propose a novel UDA approach leveraging +the Swin Transformer with three key modules. A Graph Domain Discriminator +enhances domain alignment by capturing inter-pixel correlations through graph +convolutions and entropy-based attention differentiation. An Adaptive Double +Attention module combines Windows and Shifted Windows attention with dynamic +reweighting to align long-range and local features effectively. Finally, a +Cross-Feature Transform modifies Swin Transformer blocks to improve +generalization across domains. Extensive benchmarks confirm the +state-of-the-art performance of our versatile method, which requires no +task-specific alignment modules, establishing its adaptability to diverse +applications. + +
+
+
+
+
+ + ☆ ZipAR: Accelerating Autoregressive Image Generation through Spatial + Locality + + +
+ In this paper, we propose ZipAR, a training-free, plug-and-play parallel +decoding framework for accelerating auto-regressive (AR) visual generation. The +motivation stems from the observation that images exhibit local structures, and +spatially distant regions tend to have minimal interdependence. Given a +partially decoded set of visual tokens, in addition to the original next-token +prediction scheme in the row dimension, the tokens corresponding to spatially +adjacent regions in the column dimension can be decoded in parallel, enabling +the ``next-set prediction'' paradigm. By decoding multiple tokens +simultaneously in a single forward pass, the number of forward passes required +to generate an image is significantly reduced, resulting in a substantial +improvement in generation efficiency. Experiments demonstrate that ZipAR can +reduce the number of model forward passes by up to 91% on the Emu3-Gen model +without requiring any additional retraining. + +
+
+ comment: 11 pages +
+
+
+
+
+ + ☆ Benchmarking and Enhancing Surgical Phase Recognition Models for + Robotic-Assisted Esophagectomy SP + + +
+ Robotic-assisted minimally invasive esophagectomy (RAMIE) is a recognized +treatment for esophageal cancer, offering better patient outcomes compared to +open surgery and traditional minimally invasive surgery. RAMIE is highly +complex, spanning multiple anatomical areas and involving repetitive phases and +non-sequential phase transitions. Our goal is to leverage deep learning for +surgical phase recognition in RAMIE to provide intraoperative support to +surgeons. To achieve this, we have developed a new surgical phase recognition +dataset comprising 27 videos. Using this dataset, we conducted a comparative +analysis of state-of-the-art surgical phase recognition models. To more +effectively capture the temporal dynamics of this complex procedure, we +developed a novel deep learning model featuring an encoder-decoder structure +with causal hierarchical attention, which demonstrates superior performance +compared to existing models. + +
+
+ comment: Accepted for presentation at the SPIE Medical Imaging Conference, + 2025 +
+
+
+
+
+ + ☆ INFP: Audio-Driven Interactive Head Generation in Dyadic Conversations + + +
+ Imagine having a conversation with a socially intelligent agent. It can +attentively listen to your words and offer visual and linguistic feedback +promptly. This seamless interaction allows for multiple rounds of conversation +to flow smoothly and naturally. In pursuit of actualizing it, we propose INFP, +a novel audio-driven head generation framework for dyadic interaction. Unlike +previous head generation works that only focus on single-sided communication, +or require manual role assignment and explicit role switching, our model drives +the agent portrait dynamically alternates between speaking and listening state, +guided by the input dyadic audio. Specifically, INFP comprises a Motion-Based +Head Imitation stage and an Audio-Guided Motion Generation stage. The first +stage learns to project facial communicative behaviors from real-life +conversation videos into a low-dimensional motion latent space, and use the +motion latent codes to animate a static image. The second stage learns the +mapping from the input dyadic audio to motion latent codes through denoising, +leading to the audio-driven head generation in interactive scenarios. To +facilitate this line of research, we introduce DyConv, a large scale dataset of +rich dyadic conversations collected from the Internet. Extensive experiments +and visualizations demonstrate superior performance and effectiveness of our +method. Project Page: https://grisoon.github.io/INFP/. + +
+
+
+
+
+ + ☆ Mask of truth: model sensitivity to unexpected regions of medical images + + +
+ The development of larger models for medical image analysis has led to +increased performance. However, it also affected our ability to explain and +validate model decisions. Models can use non-relevant parts of images, also +called spurious correlations or shortcuts, to obtain high performance on +benchmark datasets but fail in real-world scenarios. In this work, we challenge +the capacity of convolutional neural networks (CNN) to classify chest X-rays +and eye fundus images while masking out clinically relevant parts of the image. +We show that all models trained on the PadChest dataset, irrespective of the +masking strategy, are able to obtain an Area Under the Curve (AUC) above +random. Moreover, the models trained on full images obtain good performance on +images without the region of interest (ROI), even superior to the one obtained +on images only containing the ROI. We also reveal a possible spurious +correlation in the Chaksu dataset while the performances are more aligned with +the expectation of an unbiased model. We go beyond the performance analysis +with the usage of the explainability method SHAP and the analysis of +embeddings. We asked a radiology resident to interpret chest X-rays under +different masking to complement our findings with clinical knowledge. Our code +is available at https://github.com/TheoSourget/MMC_Masking and +https://github.com/TheoSourget/MMC_Masking_EyeFundus + +
+
+
+
+
+ + ☆ PriorMotion: Generative Class-Agnostic Motion Prediction with + Raster-Vector Motion Field Priors + + +
+ Reliable perception of spatial and motion information is crucial for safe
+autonomous navigation. Traditional approaches typically fall into two
+categories: object-centric and class-agnostic methods. While object-centric
+methods often struggle with missed detections, leading to inaccuracies in
+motion prediction, many class-agnostic methods focus heavily on encoder design,
+often overlooking important priors like rigidity and temporal consistency,
+leading to suboptimal performance, particularly with sparse LiDAR data in
+distant regions. To address these issues, we propose $\textbf{PriorMotion}$, a
+generative framework that extracts rasterized and vectorized scene
+representations to model spatio-temporal priors. Our model comprises a BEV
+encoder, a Raster-Vector prior Encoder, and a Spatio-Temporal prior Generator,
+improving both spatial and temporal consistency in motion prediction.
+Additionally, we introduce a standardized evaluation protocol for
+class-agnostic motion prediction. Experiments on the nuScenes dataset show that
+PriorMotion achieves state-of-the-art performance, with further validation on
+advanced FMCW LiDAR confirming its robustness.
+
+
+
+ comment: 8 pages, 6 figures +
+
+
+
+
+ + ☆ IF-MDM: Implicit Face Motion Diffusion Model for High-Fidelity Realtime + Talking Head Generation CVPR 2025 + + +
+ We introduce a novel approach for high-resolution talking head generation +from a single image and audio input. Prior methods using explicit face models, +like 3D morphable models (3DMM) and facial landmarks, often fall short in +generating high-fidelity videos due to their lack of appearance-aware motion +representation. While generative approaches such as video diffusion models +achieve high video quality, their slow processing speeds limit practical +application. Our proposed model, Implicit Face Motion Diffusion Model (IF-MDM), +employs implicit motion to encode human faces into appearance-aware compressed +facial latents, enhancing video generation. Although implicit motion lacks the +spatial disentanglement of explicit models, which complicates alignment with +subtle lip movements, we introduce motion statistics to help capture +fine-grained motion information. Additionally, our model provides motion +controllability to optimize the trade-off between motion intensity and visual +quality during inference. IF-MDM supports real-time generation of 512x512 +resolution videos at up to 45 frames per second (fps). Extensive evaluations +demonstrate its superior performance over existing diffusion and explicit face +models. The code will be released publicly, available alongside supplementary +materials. The video results can be found on +https://bit.ly/ifmdm_supplementary. + +
+
+ comment: underreview in CVPR 2025 +
+
+
+
+
+ + ☆ Blind Underwater Image Restoration using Co-Operational Regressor + Networks + + +
+ The exploration of underwater environments is essential for applications such
+as biological research, archaeology, and infrastructure maintenance. However,
+underwater imaging is challenging due to the water's unique properties,
+including scattering, absorption, color distortion, and reduced visibility. To
+address such visual degradations, a variety of approaches have been proposed,
+ranging from basic signal processing methods to deep learning models; however,
+none of them has proven to be consistently successful. In this paper, we
+propose a novel machine learning model, Co-Operational Regressor Networks
+(CoRe-Nets), designed to achieve the best possible underwater image
+restoration. A CoRe-Net consists of two co-operating networks: the Apprentice
+Regressor (AR), responsible for image transformation, and the Master Regressor
+(MR), which evaluates the Peak Signal-to-Noise Ratio (PSNR) of the images
+generated by the AR and feeds it back to the AR. CoRe-Nets are built on
+Self-Organized Operational Neural Networks (Self-ONNs), which offer a superior
+learning capability by modulating nonlinearity in kernel transformations. The
+effectiveness of the proposed model is demonstrated on the benchmark Large
+Scale Underwater Image (LSUI) dataset. Leveraging the joint learning
+capabilities of the two cooperating networks, the proposed model achieves
+state-of-the-art restoration performance with significantly reduced
+computational complexity and often presents such results that can even surpass
+the visual quality of the ground truth with a 2-pass application. Our results
+and the optimized PyTorch implementation of the proposed approach are now
+publicly shared on GitHub.
+
+
+
+ comment: 11 pages +
+
+
+
+
+ + ☆ LaserGuider: A Laser Based Physical Backdoor Attack against Deep Neural + Networks + + +
+ Backdoor attacks embed hidden associations between triggers and targets in +deep neural networks (DNNs), causing them to predict the target when a trigger +is present while maintaining normal behavior otherwise. Physical backdoor +attacks, which use physical objects as triggers, are feasible but lack remote +control, temporal stealthiness, flexibility, and mobility. To overcome these +limitations, in this work, we propose a new type of backdoor triggers utilizing +lasers that feature long-distance transmission and instant-imaging properties. +Based on the laser-based backdoor triggers, we present a physical backdoor +attack, called LaserGuider, which possesses remote control ability and achieves +high temporal stealthiness, flexibility, and mobility. We also introduce a +systematic approach to optimize laser parameters for improving attack +effectiveness. Our evaluation on traffic sign recognition DNNs, critical in +autonomous vehicles, demonstrates that LaserGuider with three different +laser-based triggers achieves over 90% attack success rate with negligible +impact on normal inputs. Additionally, we release LaserMark, the first dataset +of real world traffic signs stamped with physical laser spots, to support +further research in backdoor attacks and defenses. + +
+
+ comment: In Proceedings of the 23rd International Conference on Applied + Cryptography and Network Security (ACNS), Munich, Germany, 23-26 June, 2025 +
+
+
+
+
+ + ☆ UNCOVER: Unknown Class Object Detection for Autonomous Vehicles in + Real-time + + +
+ Autonomous driving (AD) operates in open-world scenarios, where encountering +unknown objects is inevitable. However, standard object detectors trained on a +limited number of base classes tend to ignore any unknown objects, posing +potential risks on the road. To address this, it is important to learn a +generic rather than a class specific objectness from objects seen during +training. We therefore introduce an occupancy prediction together with bounding +box regression. It learns to score the objectness by calculating the ratio of +the predicted area occupied by actual objects. To enhance its generalizability, +we increase the object diversity by exploiting data from other domains via +Mosaic and Mixup augmentation. The objects outside the AD training classes are +classified as a newly added out-of-distribution (OOD) class. Our solution +UNCOVER, for UNknown Class Object detection for autonomous VEhicles in +Real-time, excels at achieving both real-time detection and high recall of +unknown objects on challenging AD benchmarks. To further attain very low false +positive rates, particularly for close objects, we introduce a post-hoc +filtering step that utilizes geometric cues extracted from the depth map, +typically available within the AD system. + +
+
+
+
+
+ + ☆ Exploring Fully Convolutional Networks for the Segmentation of + Hyperspectral Imaging Applied to Advanced Driver Assistance Systems + + +
+ Advanced Driver Assistance Systems (ADAS) are designed with the main purpose +of increasing the safety and comfort of vehicle occupants. Most of current +computer vision-based ADAS perform detection and tracking tasks quite +successfully under regular conditions, but are not completely reliable, +particularly under adverse weather and changing lighting conditions, neither in +complex situations with many overlapping objects. In this work we explore the +use of hyperspectral imaging (HSI) in ADAS on the assumption that the distinct +near infrared (NIR) spectral reflectances of different materials can help to +better separate the objects in a driving scene. In particular, this paper +describes some experimental results of the application of fully convolutional +networks (FCN) to the image segmentation of HSI for ADAS applications. More +specifically, our aim is to investigate to what extent the spatial features +codified by convolutional filters can be helpful to improve the performance of +HSI segmentation systems. With that aim, we use the HSI-Drive v1.1 dataset, +which provides a set of labelled images recorded in real driving conditions +with a small-size snapshot NIR-HSI camera. Finally, we analyze the +implementability of such a HSI segmentation system by prototyping the developed +FCN model together with the necessary hyperspectral cube preprocessing stage +and characterizing its performance on an MPSoC. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2411.19274 +
+
+
+
+
+ + ☆ HyperDefect-YOLO: Enhance YOLO with HyperGraph Computation for + Industrial Defect Detection + + +
+ In the manufacturing industry, defect detection is an essential but
+challenging task aiming to detect defects generated in the process of
+production. Though traditional YOLO models present good performance in
+defect detection, they still have limitations in capturing high-order feature
+interrelationships, which hinders defect detection in complex scenarios and
+across scales. To this end, we introduce hypergraph computation into YOLO
+framework, dubbed HyperDefect-YOLO (HD-YOLO), to improve representative ability
+and semantic exploitation. HD-YOLO consists of Defect Aware Module (DAM) and
+Mixed Graph Network (MGNet) in the backbone, which specialize for perception
+and extraction of defect features. To effectively aggregate multi-scale
+features, we propose HyperGraph Aggregation Network (HGANet) which combines
+hypergraph and attention mechanism to aggregate multi-scale features.
+Cross-Scale Fusion (CSF) is proposed to adaptively fuse and handle features
+instead of simple concatenation and convolution. Finally, we propose Semantic
+Aware Module (SAM) in the neck to enhance semantic exploitation for accurately
+localizing defects with different sizes in the disturbed background. HD-YOLO
+undergoes rigorous evaluation on public HRIPCB and NEU-DET datasets with
+significant improvements compared to state-of-the-art methods. We also evaluate
+HD-YOLO on self-built MINILED dataset collected in real industrial scenarios to
+demonstrate the effectiveness of the proposed method. The source codes are at
+https://github.com/Jay-zzcoder/HD-YOLO.
+
+
+
+ comment: under review +
+
+
+
+
+ + ♻ ☆ Switti: Designing Scale-Wise Transformers for Text-to-Image Synthesis + + +
+ This work presents Switti, a scale-wise transformer for text-to-image +generation. Starting from existing next-scale prediction AR models, we first +explore them for T2I generation and propose architectural modifications to +improve their convergence and overall performance. We then argue that +scale-wise transformers do not require causality and propose a non-causal +counterpart facilitating ~11% faster sampling and lower memory usage while also +achieving slightly better generation quality. Furthermore, we reveal that +classifier-free guidance at high-resolution scales is often unnecessary and can +even degrade performance. By disabling guidance at these scales, we achieve an +additional sampling acceleration of ~20% and improve the generation of +fine-grained details. Extensive human preference studies and automated +evaluations show that Switti outperforms existing T2I AR models and competes +with state-of-the-art T2I diffusion models while being up to 7 times faster. + +
+
+ comment: 20 pages, 22 figures +
+
+
+
+
+ + ♻ ☆ Map It Anywhere (MIA): Empowering Bird's Eye View Mapping using + Large-scale Public Data NeurIPS 2024 + + +
+ Top-down Bird's Eye View (BEV) maps are a popular representation for ground +robot navigation due to their richness and flexibility for downstream tasks. +While recent methods have shown promise for predicting BEV maps from +First-Person View (FPV) images, their generalizability is limited to small +regions captured by current autonomous vehicle-based datasets. In this context, +we show that a more scalable approach towards generalizable map prediction can +be enabled by using two large-scale crowd-sourced mapping platforms, Mapillary +for FPV images and OpenStreetMap for BEV semantic maps. We introduce Map It +Anywhere (MIA), a data engine that enables seamless curation and modeling of +labeled map prediction data from existing open-source map platforms. Using our +MIA data engine, we display the ease of automatically collecting a dataset of +1.2 million pairs of FPV images & BEV maps encompassing diverse geographies, +landscapes, environmental factors, camera models & capture scenarios. We +further train a simple camera model-agnostic model on this data for BEV map +prediction. Extensive evaluations using established benchmarks and our dataset +show that the data curated by MIA enables effective pretraining for +generalizable BEV map prediction, with zero-shot performance far exceeding +baselines trained on existing datasets by 35%. Our analysis highlights the +promise of using large-scale public maps for developing & testing generalizable +BEV perception, paving the way for more robust autonomous navigation. Website: +https://mapitanywhere.github.io/ + +
+
+ comment: Accepted at the 38th Conference on Neural Information Processing + Systems (NeurIPS 2024) Track on Datasets and Benchmarks. Website: + https://mapitanywhere.github.io/ +
+
+
+
+
+ + ♻ ☆ Negative Token Merging: Image-based Adversarial Feature Guidance + + +
+ Text-based adversarial guidance using a negative prompt has emerged as a +widely adopted approach to steer diffusion models away from producing undesired +concepts. While useful, performing adversarial guidance using text alone can be +insufficient to capture complex visual concepts or avoid specific visual +elements like copyrighted characters. In this paper, for the first time we +explore an alternate modality in this direction by performing adversarial +guidance directly using visual features from a reference image or other images +in a batch. We introduce negative token merging (NegToMe), a simple but +effective training-free approach which performs adversarial guidance through +images by selectively pushing apart matching visual features between reference +and generated images during the reverse diffusion process. By simply adjusting +the used reference, NegToMe enables a diverse range of applications. Notably, +when using other images in same batch as reference, we find that NegToMe +significantly enhances output diversity (e.g., racial, gender, visual) by +guiding features of each image away from others. Similarly, when used w.r.t. +copyrighted reference images, NegToMe reduces visual similarity to copyrighted +content by 34.57%. NegToMe is simple to implement using just few-lines of code, +uses only marginally higher (<4%) inference time and is compatible with +different diffusion architectures, including those like Flux, which don't +natively support the use of a negative prompt. Code is available at +https://negtome.github.io + +
+
+
+
+
+ + ♻ ☆ Learning to Reconstruct Accelerated MRI Through K-space Cold Diffusion + without Noise + + +
+ Deep learning-based MRI reconstruction models have achieved superior +performance these days. Most recently, diffusion models have shown remarkable +performance in image generation, in-painting, super-resolution, image editing +and more. As a generalized diffusion model, cold diffusion further broadens the +scope and considers models built around arbitrary image transformations such as +blurring, down-sampling, etc. In this paper, we propose a k-space cold +diffusion model that performs image degradation and restoration in k-space +without the need for Gaussian noise. We provide comparisons with multiple deep +learning-based MRI reconstruction models and perform tests on a well-known +large open-source MRI dataset. Our results show that this novel way of +performing degradation can generate high-quality reconstruction images for +accelerated MRI. + +
+
+ comment: 21 pages, 5 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ Regularization by Neural Style Transfer for MRI Field-Transfer + Reconstruction with Limited Data + + +
+ Recent advances in MRI reconstruction have achieved remarkable success with +deep learning-based models. However, most methods depend on large-scale, +task-specific datasets, leaving reconstruction in data-limited settings as a +critical but underexplored challenge. Regularization by denoising (RED) is a +general pipeline that incorporates a denoiser as a prior for image +reconstruction, showing promising results in various image processing tasks, +including denoising, deblurring, and super-resolution. In this work, we propose +a regularization by neural style transfer (RNST) method to further leverage the +priors from the neural transfer and denoising engine. RNST effectively +reconstructs high-quality images from noisy, low-quality inputs across varying +image styles, even with limited data. We validate RNST on clinical MRI scans, +demonstrating its ability to significantly improve image quality. These +findings underline the potential of RNST for MRI field-transfer reconstruction +and its promise in addressing reconstruction tasks in data-constrained +scenarios. + +
+
+ comment: 31 pages, 9 figures, 3 tables, 1 algorithm chart +
+
+
+
+
+ + ♻ ☆ MUSE-VL: Modeling Unified VLM through Semantic Discrete Encoding + + +
+ We introduce MUSE-VL, a Unified Vision-Language Model through Semantic +discrete Encoding for multimodal understanding and generation. Recently, the +research community has begun exploring unified models for visual generation and +understanding. However, existing vision tokenizers (e.g., VQGAN) only consider +low-level information, which makes it difficult to align with texture semantic +features. This results in high training complexity and necessitates a large +amount of training data to achieve optimal performance. Additionally, their +performance is still far from dedicated understanding models. This paper +proposes Semantic Discrete Encoding (SDE), which effectively aligns the +information of visual tokens and language tokens by adding semantic constraints +to the visual tokenizer. This greatly reduces training difficulty and improves +the performance of the unified model. The proposed model significantly +surpasses the previous state-of-the-art in various vision-language benchmarks +and achieves better performance than dedicated understanding models. + +
+
+
+
+
+ + ♻ ☆ SciFIBench: Benchmarking Large Multimodal Models for Scientific Figure + Interpretation NeurIPS 2024 + + +
+ Large multimodal models (LMMs) have proven flexible and generalisable across +many tasks and fields. Although they have strong potential to aid scientific +research, their capabilities in this domain are not well characterised. A key +aspect of scientific research is the ability to understand and interpret +figures, which serve as a rich, compressed source of complex information. In +this work, we present SciFIBench, a scientific figure interpretation benchmark +consisting of 2000 questions split between two tasks across 8 categories. The +questions are curated from arXiv paper figures and captions, using adversarial +filtering to find hard negatives and human verification for quality control. We +evaluate 28 LMMs on SciFIBench, finding it to be a challenging benchmark. +Finally, we investigate the alignment and reasoning faithfulness of the LMMs on +augmented question sets from our benchmark. We release SciFIBench to encourage +progress in this domain. + +
+
+ comment: Accepted at NeurIPS 2024 (Datasets and Benchmarks Track) +
+
+
+
+
+ + ♻ ☆ GeoPos: A Minimal Positional Encoding for Enhanced Fine-Grained Details + in Image Synthesis Using Convolutional Neural Networks WACV 2025 + + +
+ The enduring inability of image generative models to recreate intricate
+geometric features, such as those present in human hands and fingers, has been
+an ongoing problem in image generation for nearly a decade. While strides have
+been made by increasing model sizes and diversifying training datasets, this
+issue remains prevalent across all models, from denoising diffusion models to
+Generative Adversarial Networks (GAN), pointing to a fundamental shortcoming in
+the underlying architectures. In this paper, we demonstrate how this problem
+can be mitigated by augmenting convolution layers' geometric capabilities
+through providing them with a single input channel incorporating the relative
+n-dimensional Cartesian coordinate system. We show this drastically improves
+quality of images generated by Diffusion Models, GANs, and Variational
+AutoEncoders (VAE).
+
+
+
+ comment: Accepted at WACV 2025. Contains 19 pages, 15 figures, and 9 tables +
+
+
+
+
+ + ♻ ☆ DynMF: Neural Motion Factorization for Real-time Dynamic View Synthesis + with 3D Gaussian Splatting + + +
+ Accurately and efficiently modeling dynamic scenes and motions is considered
+a challenging task due to temporal dynamics and motion complexity. To
+address these challenges, we propose DynMF, a compact and efficient
+representation that decomposes a dynamic scene into a few neural trajectories.
+We argue that the per-point motions of a dynamic scene can be decomposed into a
+small set of explicit or learned trajectories. Our carefully designed neural
+framework consisting of a tiny set of learned basis queried only in time allows
+for rendering speed similar to 3D Gaussian Splatting, surpassing 120 FPS, while
+at the same time, requiring only double the storage compared to static scenes.
+Our neural representation adequately constrains the inherently underconstrained
+motion field of a dynamic scene leading to effective and fast optimization.
+This is done by binding each point to motion coefficients that enforce the
+per-point sharing of basis trajectories. By carefully applying a sparsity loss
+to the motion coefficients, we are able to disentangle the motions that
+comprise the scene, independently control them, and generate novel motion
+combinations that have never been seen before. We can reach state-of-the-art
+render quality within just 5 minutes of training and in less than half an hour,
+we can synthesize novel views of dynamic scenes with superior photorealistic
+quality. Our representation is interpretable, efficient, and expressive enough
+to offer real-time view synthesis of complex dynamic scene motions, in
+monocular and multi-view scenarios.
+
+
+
+ comment: Project page: https://agelosk.github.io/dynmf/ +
+
+
+
+
+ + ♻ ☆ LoSA: Long-Short-range Adapter for Scaling End-to-End Temporal Action + Localization WACV 2025 + + +
+ Temporal Action Localization (TAL) involves localizing and classifying action +snippets in an untrimmed video. The emergence of large video foundation models +has led RGB-only video backbones to outperform previous methods needing both +RGB and optical flow modalities. Leveraging these large models is often limited +to training only the TAL head due to the prohibitively large GPU memory +required to adapt the video backbone for TAL. To overcome this limitation, we +introduce LoSA, the first memory-and-parameter-efficient backbone adapter +designed specifically for TAL to handle untrimmed videos. LoSA specializes for +TAL by introducing Long-Short-range Adapters that adapt the intermediate layers +of the video backbone over different temporal ranges. These adapters run +parallel to the video backbone to significantly reduce memory footprint. LoSA +also includes Long-Short-range Gated Fusion that strategically combines the +output of these adapters from the video backbone layers to enhance the video +features provided to the TAL head. Experiments show that LoSA significantly +outperforms all existing methods on standard TAL benchmarks, THUMOS-14 and +ActivityNet-v1.3, by scaling end-to-end backbone adaptation to +billion-parameter-plus models like VideoMAEv2~(ViT-g) and leveraging them +beyond head-only transfer learning. + +
+
+ comment: WACV 2025 Accepted +
+
+
+
+
+ + ♻ ☆ Unsupervised Denoising for Signal-Dependent and Row-Correlated Imaging + Noise WACV + + +
+ Accurate analysis of microscopy images is hindered by the presence of noise.
+This noise is usually signal-dependent and often additionally correlated along
+rows or columns of pixels. Current self- and unsupervised denoisers can address
+signal-dependent noise, but none can reliably remove noise that is also row- or
+column-correlated. Here, we present the first fully unsupervised deep
+learning-based denoiser capable of handling imaging noise that is
+row-correlated as well as signal-dependent. Our approach uses a Variational
+Autoencoder (VAE) with a specially designed autoregressive decoder. This
+decoder is capable of modeling row-correlated and signal-dependent noise but is
+incapable of independently modeling underlying clean signal. The VAE therefore
+produces latent variables containing only clean signal information, and these
+are mapped back into image space using a proposed second decoder network. Our
+method does not require a pre-trained noise model and can be trained from
+scratch using unpaired noisy data. We benchmark our approach on microscopy
+datasets from a range of imaging modalities and sensor types, each with row-
+or column-correlated, signal-dependent noise, and show that it outperforms
+existing self- and unsupervised denoisers.
+
+
+
+ comment: Accepted in IEEE/CVF Winter Conference on Applications of Computer + Vision (WACV) 2025 +
+
+
+
+
+ + ♻ ☆ Textual Knowledge Matters: Cross-Modality Co-Teaching for Generalized + Visual Class Discovery ECCV2024 + + +
+ In this paper, we study the problem of Generalized Category Discovery (GCD), +which aims to cluster unlabeled data from both known and unknown categories +using the knowledge of labeled data from known categories. Current GCD methods +rely on only visual cues, which however neglect the multi-modality perceptive +nature of human cognitive processes in discovering novel visual categories. To +address this, we propose a two-phase TextGCD framework to accomplish +multi-modality GCD by exploiting powerful Visual-Language Models. TextGCD +mainly includes a retrieval-based text generation (RTG) phase and a +cross-modality co-teaching (CCT) phase. First, RTG constructs a visual lexicon +using category tags from diverse datasets and attributes from Large Language +Models, generating descriptive texts for images in a retrieval manner. Second, +CCT leverages disparities between textual and visual modalities to foster +mutual learning, thereby enhancing visual GCD. In addition, we design an +adaptive class aligning strategy to ensure the alignment of category +perceptions between modalities as well as a soft-voting mechanism to integrate +multi-modality cues. Experiments on eight datasets show the large superiority +of our approach over state-of-the-art methods. Notably, our approach +outperforms the best competitor, by 7.7% and 10.8% in All accuracy on +ImageNet-1k and CUB, respectively. + +
+
+ comment: Accepted by ECCV2024 +
+
+
+
+
+ + ♻ ☆ Enhancing Novel Object Detection via Cooperative Foundational Models WACV 2025 + + +
+ In this work, we address the challenging and emergent problem of novel object +detection (NOD), focusing on the accurate detection of both known and novel +object categories during inference. Traditional object detection algorithms are +inherently closed-set, limiting their capability to handle NOD. We present a +novel approach to transform existing closed-set detectors into open-set +detectors. This transformation is achieved by leveraging the complementary +strengths of pre-trained foundational models, specifically CLIP and SAM, +through our cooperative mechanism. Furthermore, by integrating this mechanism +with state-of-the-art open-set detectors such as GDINO, we establish new +benchmarks in object detection performance. Our method achieves 17.42 mAP in +novel object detection and 42.08 mAP for known objects on the challenging LVIS +dataset. Adapting our approach to the COCO OVD split, we surpass the current +state-of-the-art by a margin of 7.2 $ \text{AP}_{50} $ for novel classes. Our +code is available at https://rohit901.github.io/coop-foundation-models/ . + +
+
+ comment: Accepted at WACV 2025 +
+
+
+
+
+ + ♻ ☆ HydraViT: Stacking Heads for a Scalable ViT NeurIPS'24 + + +
+ The architecture of Vision Transformers (ViTs), particularly the Multi-head +Attention (MHA) mechanism, imposes substantial hardware demands. Deploying ViTs +on devices with varying constraints, such as mobile phones, requires multiple +models of different sizes. However, this approach has limitations, such as +training and storing each required model separately. This paper introduces +HydraViT, a novel approach that addresses these limitations by stacking +attention heads to achieve a scalable ViT. By repeatedly changing the size of +the embedded dimensions throughout each layer and their corresponding number of +attention heads in MHA during training, HydraViT induces multiple subnetworks. +Thereby, HydraViT achieves adaptability across a wide spectrum of hardware +environments while maintaining performance. Our experimental results +demonstrate the efficacy of HydraViT in achieving a scalable ViT with up to 10 +subnetworks, covering a wide range of resource constraints. HydraViT achieves +up to 5 p.p. more accuracy with the same GMACs and up to 7 p.p. more accuracy +with the same throughput on ImageNet-1K compared to the baselines, making it an +effective solution for scenarios where hardware availability is diverse or +varies over time. Source code available at https://github.com/ds-kiel/HydraViT. + +
+
+ comment: Accepted at NeurIPS'24, please cite the conference version +
+
+
+
+
+ + ♻ ☆ Fab-ME: A Vision State-Space and Attention-Enhanced Framework for Fabric + Defect Detection + + +
+ Effective defect detection is critical for ensuring the quality, +functionality, and economic value of textile products. However, existing +methods face challenges in achieving high accuracy, real-time performance, and +efficient global information extraction. To address these issues, we propose +Fab-ME, an advanced framework based on YOLOv8s, specifically designed for the +accurate detection of 20 fabric defect types. Our contributions include the +introduction of the cross-stage partial bottleneck with two convolutions (C2F) +vision state-space (C2F-VMamba) module, which integrates visual state-space +(VSS) blocks into the YOLOv8s feature fusion network neck, enhancing the +capture of intricate details and global context while maintaining high +processing speeds. Additionally, we incorporate an enhanced multi-scale channel +attention (EMCA) module into the final layer of the feature extraction network, +significantly improving sensitivity to small targets. Experimental results on +the Tianchi fabric defect detection dataset demonstrate that Fab-ME achieves a +3.5% improvement in mAP@0.5 compared to the original YOLOv8s, validating its +effectiveness for precise and efficient fabric defect detection. + +
+
+ comment: 6 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ DreamLCM: Towards High-Quality Text-to-3D Generation via Latent + Consistency Model ACM MM 2024 + + +
+ Recently, the text-to-3D task has developed rapidly due to the appearance of +the SDS method. However, the SDS method always generates 3D objects with poor +quality due to the over-smooth issue. This issue is attributed to two factors: +1) the DDPM single-step inference produces poor guidance gradients; 2) the +randomness from the input noises and timesteps averages the details of the 3D +contents. In this paper, to address the issue, we propose DreamLCM which +incorporates the Latent Consistency Model (LCM). DreamLCM leverages the +powerful image generation capabilities inherent in LCM, enabling generating +consistent and high-quality guidance, i.e., predicted noises or images. Powered +by the improved guidance, the proposed method can provide accurate and detailed +gradients to optimize the target 3D models. In addition, we propose two +strategies to enhance the generation quality further. Firstly, we propose a +guidance calibration strategy, utilizing Euler Solver to calibrate the guidance +distribution to accelerate 3D models to converge. Secondly, we propose a dual +timestep strategy, increasing the consistency of guidance and optimizing 3D +models from geometry to appearance in DreamLCM. Experiments show that DreamLCM +achieves state-of-the-art results in both generation quality and training +efficiency. The code is available at https://github.com/1YimingZhong/DreamLCM. + +
+
+ comment: 15 pages, 9 figures, ACM MM 2024 +
+
+
+
+
+ + ♻ ☆ Identity-Preserving Text-to-Video Generation by Frequency Decomposition + + +
+ Identity-preserving text-to-video (IPT2V) generation aims to create +high-fidelity videos with consistent human identity. It is an important task in +video generation but remains an open problem for generative models. This paper +pushes the technical frontier of IPT2V in two directions that have not been +resolved in literature: (1) A tuning-free pipeline without tedious case-by-case +finetuning, and (2) A frequency-aware heuristic identity-preserving DiT-based +control scheme. We propose ConsisID, a tuning-free DiT-based controllable IPT2V +model to keep human identity consistent in the generated video. Inspired by +prior findings in frequency analysis of diffusion transformers, it employs +identity-control signals in the frequency domain, where facial features can be +decomposed into low-frequency global features and high-frequency intrinsic +features. First, from a low-frequency perspective, we introduce a global facial +extractor, which encodes reference images and facial key points into a latent +space, generating features enriched with low-frequency information. These +features are then integrated into shallow layers of the network to alleviate +training challenges associated with DiT. Second, from a high-frequency +perspective, we design a local facial extractor to capture high-frequency +details and inject them into transformer blocks, enhancing the model's ability +to preserve fine-grained features. We propose a hierarchical training strategy +to leverage frequency information for identity preservation, transforming a +vanilla pre-trained video generation model into an IPT2V model. Extensive +experiments demonstrate that our frequency-aware heuristic scheme provides an +optimal control solution for DiT-based models. Thanks to this scheme, our +ConsisID generates high-quality, identity-preserving videos, making strides +towards more effective IPT2V. + +
+
+ comment: 12 pages, 8 figures, Code: https://github.com/PKU-YuanGroup/ConsisID +
+
+
+
+
+ + ♻ ☆ Camouflaged Object Tracking: A Benchmark + + +
+ Visual tracking has seen remarkable advancements, largely driven by the
+availability of large-scale training datasets that have enabled the development
+of highly accurate and robust algorithms. While significant progress has been
+made in tracking general objects, research on more challenging scenarios, such
+as tracking camouflaged objects, remains limited. Camouflaged objects, which
+blend seamlessly with their surroundings or other objects, present unique
+challenges for detection and tracking in complex environments. This challenge
+is particularly critical in applications such as military, security,
+agriculture, and marine monitoring, where precise tracking of camouflaged
+objects is essential. To address this gap, we introduce the Camouflaged Object
+Tracking Dataset (COTD), a specialized benchmark designed specifically for
+evaluating camouflaged object tracking methods. The COTD dataset comprises 200
+sequences and approximately 80,000 frames, each annotated with detailed
+bounding boxes. Our evaluation of 20 existing tracking algorithms reveals
+significant deficiencies in their performance with camouflaged objects. To
+address these issues, we propose a novel tracking framework, HiPTrack-MLS,
+which demonstrates promising results in improving tracking performance for
+camouflaged objects. COTD and code are available at
+https://github.com/openat25/HIPTrack-MLS.
+
+
+
+
+
+
+ + ♻ ☆ Calib3D: Calibrating Model Preferences for Reliable 3D Scene + Understanding WACV 2025 + + +
+ Safety-critical 3D scene understanding tasks necessitate not only accurate +but also confident predictions from 3D perception models. This study introduces +Calib3D, a pioneering effort to benchmark and scrutinize the reliability of 3D +scene understanding models from an uncertainty estimation viewpoint. We +comprehensively evaluate 28 state-of-the-art models across 10 diverse 3D +datasets, uncovering insightful phenomena that cope with both the aleatoric and +epistemic uncertainties in 3D scene understanding. We discover that despite +achieving impressive levels of accuracy, existing models frequently fail to +provide reliable uncertainty estimates -- a pitfall that critically undermines +their applicability in safety-sensitive contexts. Through extensive analysis of +key factors such as network capacity, LiDAR representations, rasterization +resolutions, and 3D data augmentation techniques, we correlate these aspects +directly with the model calibration efficacy. Furthermore, we introduce DeptS, +a novel depth-aware scaling approach aimed at enhancing 3D model calibration. +Extensive experiments across a wide range of configurations validate the +superiority of our method. We hope this work could serve as a cornerstone for +fostering reliable 3D scene understanding. Code and benchmark toolkit are +publicly available. + +
+
+ comment: WACV 2025; 26 pages, 8 figures, 12 tables; Code at + https://github.com/ldkong1205/Calib3D +
+
+
+
+
+ + ♻ ☆ Cross-domain and Cross-dimension Learning for Image-to-Graph + Transformers + + +
+ Direct image-to-graph transformation is a challenging task that involves +solving object detection and relationship prediction in a single model. Due to +this task's complexity, large training datasets are rare in many domains, +making the training of deep-learning methods challenging. This data sparsity +necessitates transfer learning strategies akin to the state-of-the-art in +general computer vision. In this work, we introduce a set of methods enabling +cross-domain and cross-dimension learning for image-to-graph transformers. We +propose (1) a regularized edge sampling loss to effectively learn object +relations in multiple domains with different numbers of edges, (2) a domain +adaptation framework for image-to-graph transformers aligning image- and +graph-level features from different domains, and (3) a projection function that +allows using 2D data for training 3D transformers. We demonstrate our method's +utility in cross-domain and cross-dimension experiments, where we utilize +labeled data from 2D road networks for simultaneous learning in vastly +different target domains. Our method consistently outperforms standard transfer +learning and self-supervised pretraining on challenging benchmarks, such as +retinal or whole-brain vessel graph extraction. + +
+
+
+
+
+ + ♻ ☆ LIME: Localized Image Editing via Attention Regularization in Diffusion + Models WACV'25 + + +
+ Diffusion models (DMs) have gained prominence due to their ability to +generate high-quality varied images with recent advancements in text-to-image +generation. The research focus is now shifting towards the controllability of +DMs. A significant challenge within this domain is localized editing, where +specific areas of an image are modified without affecting the rest of the +content. This paper introduces LIME for localized image editing in diffusion +models. LIME does not require user-specified regions of interest (RoI) or +additional text input, but rather employs features from pre-trained methods and +a straightforward clustering method to obtain precise editing mask. Then, by +leveraging cross-attention maps, it refines these segments for finding regions +to obtain localized edits. Finally, we propose a novel cross-attention +regularization technique that penalizes unrelated cross-attention scores in the +RoI during the denoising steps, ensuring localized edits. Our approach, without +re-training, fine-tuning and additional user inputs, consistently improves the +performance of existing methods in various editing benchmarks. The project page +can be found at https://enisimsar.github.io/LIME/. + +
+
+ comment: WACV'25 +
+
+
+
+
+ + ♻ ☆ MetricGold: Leveraging Text-To-Image Latent Diffusion Models for Metric + Depth Estimation + + +
+ Recovering metric depth from a single image remains a fundamental challenge +in computer vision, requiring both scene understanding and accurate scaling. +While deep learning has advanced monocular depth estimation, current models +often struggle with unfamiliar scenes and layouts, particularly in zero-shot +scenarios and when predicting scale-ergodic metric depth. We present +MetricGold, a novel approach that harnesses generative diffusion model's rich +priors to improve metric depth estimation. Building upon recent advances in +MariGold, DDVM and Depth Anything V2 respectively, our method combines latent +diffusion, log-scaled metric depth representation, and synthetic data training. +MetricGold achieves efficient training on a single RTX 3090 within two days +using photo-realistic synthetic data from HyperSIM, VirtualKitti, and +TartanAir. Our experiments demonstrate robust generalization across diverse +datasets, producing sharper and higher quality metric depth estimates compared +to existing approaches. + +
+
+
+
+
+ + ♻ ☆ Spatio-Temporal Context Prompting for Zero-Shot Action Detection WACV2025 + + +
+ Spatio-temporal action detection encompasses the tasks of localizing and +classifying individual actions within a video. Recent works aim to enhance this +process by incorporating interaction modeling, which captures the relationship +between people and their surrounding context. However, these approaches have +primarily focused on fully-supervised learning, and the current limitation lies +in the lack of generalization capability to recognize unseen action categories. +In this paper, we aim to adapt the pretrained image-language models to detect +unseen actions. To this end, we propose a method which can effectively leverage +the rich knowledge of visual-language models to perform Person-Context +Interaction. Meanwhile, our Context Prompting module will utilize contextual +information to prompt labels, thereby enhancing the generation of more +representative text features. Moreover, to address the challenge of recognizing +distinct actions by multiple people at the same timestamp, we design the +Interest Token Spotting mechanism which employs pretrained visual knowledge to +find each person's interest context tokens, and then these tokens will be used +for prompting to generate text features tailored to each individual. To +evaluate the ability to detect unseen actions, we propose a comprehensive +benchmark on J-HMDB, UCF101-24, and AVA datasets. The experiments show that our +method achieves superior results compared to previous approaches and can be +further extended to multi-action videos, bringing it closer to real-world +applications. The code and data can be found in +https://webber2933.github.io/ST-CLIP-project-page. + +
+
+ comment: Accepted by WACV2025. Project page: + https://webber2933.github.io/ST-CLIP-project-page +
+
+
+
+
+ + ♻ ☆ Molmo and PixMo: Open Weights and Open Data for State-of-the-Art + Vision-Language Models + + +
+ Today's most advanced vision-language models (VLMs) remain proprietary. The +strongest open-weight models rely heavily on synthetic data from proprietary +VLMs to achieve good performance, effectively distilling these closed VLMs into +open ones. As a result, the community has been missing foundational knowledge +about how to build performant VLMs from scratch. We present Molmo, a new family +of VLMs that are state-of-the-art in their class of openness. Our key +contribution is a collection of new datasets called PixMo, including a dataset +of highly detailed image captions for pre-training, a free-form image Q&A +dataset for fine-tuning, and an innovative 2D pointing dataset, all collected +without the use of external VLMs. The success of our approach relies on careful +modeling choices, a well-tuned training pipeline, and, most critically, the +quality of our newly collected datasets. Our best-in-class 72B model not only +outperforms others in the class of open weight and data models, but also +outperforms larger proprietary models including Claude 3.5 Sonnet, and Gemini +1.5 Pro and Flash, second only to GPT-4o based on both academic benchmarks and +on a large human evaluation. Our model weights, new datasets, and source code +are available at https://molmo.allenai.org/blog. + +
+
+ comment: Updated with ablations and more technical details +
+
+
+
+
+ + ♻ ☆ ELSA: Evaluating Localization of Social Activities in Urban Streets + using Open-Vocabulary Detection + + +
+ Existing Open Vocabulary Detection (OVD) models exhibit a number of +challenges. They often struggle with semantic consistency across diverse +inputs, and are often sensitive to slight variations in input phrasing, leading +to inconsistent performance. The calibration of their predictive confidence, +especially in complex multi-label scenarios, remains suboptimal, frequently +resulting in overconfident predictions that do not accurately reflect their +context understanding. To understand these limitations, multi-label detection +benchmarks are needed. A particularly challenging domain for such benchmarking +is social activities. Due to the lack of multi-label benchmarks for social +interactions, in this work we present ELSA: Evaluating Localization of Social +Activities. ELSA draws on theoretical frameworks in urban sociology and design +and uses in-the-wild street-level imagery, where the size of groups and the +types of activities vary significantly. ELSA includes more than 900 manually +annotated images with more than 4,300 multi-labeled bounding boxes for +individual and group activities. We introduce a novel confidence score +computation method NLSE and a novel Dynamic Box Aggregation (DBA) algorithm to +assess semantic consistency in overlapping predictions. We report our results +on the widely-used SOTA models Grounding DINO, Detic, OWL, and MDETR. Our +evaluation protocol considers semantic stability and localization accuracy and +further exposes the limitations of existing approaches. + +
+
+
+
+
+ + ♻ ☆ Excretion Detection in Pigsties Using Convolutional and Transformerbased + Deep Neural Networks + + +
+ Animal excretions in form of urine puddles and feces are a significant source +of emissions in livestock farming. Automated detection of soiled floor in barns +can contribute to improved management processes but also the derived +information can be used to model emission dynamics. Previous research +approaches to determine the puddle area require manual detection of the puddle +in the barn. While humans can detect animal excretions on thermal images of a +livestock barn, automated approaches using thresholds fail due to other objects +of the same temperature, such as the animals themselves. In addition, various +parameters such as the type of housing, animal species, age, sex, weather and +unknown factors can influence the type and shape of excretions. Due to this +heterogeneity, a method for automated detection of excretions must therefore be +not only be accurate but also robust to varying conditions. These requirements +can be met by using contemporary deep learning models from the field of +artificial intelligence. This work is the first to investigate the suitability +of different deep learning models for the detection of excretions in pigsties, +thereby comparing established convolutional architectures with recent +transformer-based approaches. The detection models Faster R-CNN, YOLOv8, DETR +and DAB-DETR are compared and statistically assessed on two created training +datasets representing two pig houses. We apply a method derived from nested +cross-validation and report on the results in terms of eight common detection +metrics. Our work demonstrates that all investigated deep learning models are +generally suitable for reliably detecting excretions with an average precision +of over 90%. The models also show robustness on out of distribution data that +possesses differences from the conditions in the training data, however, with +expected slight decreases in the overall detection performance. + +
+
+ comment: Keywords: Artificial Intelligence, Object detection, Pig, Urine
+ puddle, Thermal IR data, CNN vs Transformer, Precision Livestock Farming;
+ Stats: 54 pages, 13 figures, 1 graphical abstract
+
+
+
+
+ + ♻ ☆ FG-MDM: Towards Zero-Shot Human Motion Generation via ChatGPT-Refined + Descriptions + + +
+ Recently, significant progress has been made in text-based motion generation, +enabling the generation of diverse and high-quality human motions that conform +to textual descriptions. However, generating motions beyond the distribution of +original datasets remains challenging, i.e., zero-shot generation. By adopting +a divide-and-conquer strategy, we propose a new framework named Fine-Grained +Human Motion Diffusion Model (FG-MDM) for zero-shot human motion generation. +Specifically, we first parse previous vague textual annotations into +fine-grained descriptions of different body parts by leveraging a large +language model. We then use these fine-grained descriptions to guide a +transformer-based diffusion model, which further adopts a design of part +tokens. FG-MDM can generate human motions beyond the scope of original datasets +owing to descriptions that are closer to motion essence. Our experimental +results demonstrate the superiority of FG-MDM over previous methods in +zero-shot settings. We will release our fine-grained textual annotations for +HumanML3D and KIT. + +
+
+ comment: Project Page: https://sx0207.github.io/fg-mdm/ +
+
+
+
+
+ + ♻ ☆ Align3R: Aligned Monocular Depth Estimation for Dynamic Videos + + +
+ Recent developments in monocular depth estimation methods enable high-quality +depth estimation of single-view images but fail to estimate consistent video +depth across different frames. Recent works address this problem by applying a +video diffusion model to generate video depth conditioned on the input video, +which is training-expensive and can only produce scale-invariant depth values +without camera poses. In this paper, we propose a novel video-depth estimation +method called Align3R to estimate temporal consistent depth maps for a dynamic +video. Our key idea is to utilize the recent DUSt3R model to align estimated +monocular depth maps of different timesteps. First, we fine-tune the DUSt3R +model with additional estimated monocular depth as inputs for the dynamic +scenes. Then, we apply optimization to reconstruct both depth maps and camera +poses. Extensive experiments demonstrate that Align3R estimates consistent +video depth and camera poses for a monocular video with superior performance +than baseline methods. + +
+
+ comment: Project Page: https://igl-hkust.github.io/Align3R.github.io/ +
+
+
+
+
+ + ♻ ☆ C2P-CLIP: Injecting Category Common Prompt in CLIP to Enhance + Generalization in Deepfake Detection + + +
+ This work focuses on AIGC detection to develop universal detectors capable of +identifying various types of forgery images. Recent studies have found large +pre-trained models, such as CLIP, are effective for generalizable deepfake +detection along with linear classifiers. However, two critical issues remain +unresolved: 1) understanding why CLIP features are effective on deepfake +detection through a linear classifier; and 2) exploring the detection potential +of CLIP. In this study, we delve into the underlying mechanisms of CLIP's +detection capabilities by decoding its detection features into text and +performing word frequency analysis. Our finding indicates that CLIP detects +deepfakes by recognizing similar concepts (Fig. \ref{fig:fig1} a). Building on +this insight, we introduce Category Common Prompt CLIP, called C2P-CLIP, which +integrates the category common prompt into the text encoder to inject +category-related concepts into the image encoder, thereby enhancing detection +performance (Fig. \ref{fig:fig1} b). Our method achieves a 12.41\% improvement +in detection accuracy compared to the original CLIP, without introducing +additional parameters during testing. Comprehensive experiments conducted on +two widely-used datasets, encompassing 20 generation models, validate the +efficacy of the proposed method, demonstrating state-of-the-art performance. +The code is available at +\url{https://github.com/chuangchuangtan/C2P-CLIP-DeepfakeDetection} + +
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ CC-OCR: A Comprehensive and Challenging OCR Benchmark for Evaluating + Large Multimodal Models in Literacy + + +
+ Large Multimodal Models (LMMs) have demonstrated impressive performance on
+recognizing document images with natural language instructions. However, it
+remains unclear to what extent these capabilities extend to literacy tasks with
+rich structure and fine-grained visual challenges. The current landscape lacks
+a comprehensive benchmark to effectively measure the literate capabilities of
+LMMs. Existing benchmarks are often limited by narrow scenarios and specified
+tasks. To this end, we introduce CC-OCR, a comprehensive benchmark that
+possesses a diverse range of scenarios, tasks, and challenges. CC-OCR comprises
+four OCR-centric tracks: multi-scene text reading, multilingual text reading,
+document parsing, and key information extraction. It includes 39 subsets with
+7,058 fully annotated images, of which 41% are sourced from real applications,
+being released for the first time. Furthermore, we evaluate nine prominent LMMs
+and reveal both the strengths and weaknesses of these models, particularly in
+text grounding, multi-orientation, and hallucination of repetition. CC-OCR aims
+to comprehensively evaluate the capabilities of LMMs on OCR-centered tasks,
+driving advancement in LMMs.
+
+
+
+ comment: 23 pages, 14 figures; The code will released at + https://github.com/QwenLM/CC-OCR +
+
+
+
+
+ + ♻ ☆ Divide, Ensemble and Conquer: The Last Mile on Unsupervised Domain + Adaptation for Semantic Segmentation + + +
+ The last mile of unsupervised domain adaptation (UDA) for semantic +segmentation is the challenge of solving the syn-to-real domain gap. Recent UDA +methods have progressed significantly, yet they often rely on strategies +customized for synthetic single-source datasets (e.g., GTA5), which limits +their generalisation to multi-source datasets. Conversely, synthetic +multi-source datasets hold promise for advancing the last mile of UDA but +remain underutilized in current research. Thus, we propose DEC, a flexible UDA +framework for multi-source datasets. Following a divide-and-conquer strategy, +DEC simplifies the task by categorizing semantic classes, training models for +each category, and fusing their outputs by an ensemble model trained +exclusively on synthetic datasets to obtain the final segmentation mask. DEC +can integrate with existing UDA methods, achieving state-of-the-art performance +on Cityscapes, BDD100K, and Mapillary Vistas, significantly narrowing the +syn-to-real domain gap. + +
+
+ comment: Accepted by TIV +
+
+
+
+
+ + ♻ ☆ SEAL: Semantic Attention Learning for Long Video Representation + + +
+ Long video understanding presents challenges due to the inherent high +computational complexity and redundant temporal information. An effective +representation for long videos must process such redundancy efficiently while +preserving essential contents for downstream tasks. This paper introduces +SEmantic Attention Learning (SEAL), a novel unified representation for long +videos. To reduce computational complexity, long videos are decomposed into +three distinct types of semantic entities: scenes, objects, and actions, +allowing models to operate on a handful of entities rather than a large number +of frames or pixels. To further address redundancy, we propose an attention +learning module that balances token relevance with diversity formulated as a +subset selection optimization problem. Our representation is versatile, +enabling applications across various long video understanding tasks. Extensive +experiments show that SEAL significantly outperforms state-of-the-art methods +in video question answering and temporal grounding tasks and benchmarks +including LVBench, MovieChat-1K, and Ego4D. + +
+
+
+
+
+ + ♻ ☆ Looking at Model Debiasing through the Lens of Anomaly Detection WACV + + +
+ It is widely recognized that deep neural networks are sensitive to bias in +the data. This means that during training these models are likely to learn +spurious correlations between data and labels, resulting in limited +generalization abilities and low performance. In this context, model debiasing +approaches can be devised aiming at reducing the model's dependency on such +unwanted correlations, either leveraging the knowledge of bias information or +not. In this work, we focus on the latter and more realistic scenario, showing +the importance of accurately predicting the bias-conflicting and bias-aligned +samples to obtain compelling performance in bias mitigation. On this ground, we +propose to conceive the problem of model bias from an out-of-distribution +perspective, introducing a new bias identification method based on anomaly +detection. We claim that when data is mostly biased, bias-conflicting samples +can be regarded as outliers with respect to the bias-aligned distribution in +the feature space of a biased model, thus allowing for precisely detecting them +with an anomaly detection method. Coupling the proposed bias identification +approach with bias-conflicting data upsampling and augmentation in a two-step +strategy, we reach state-of-the-art performance on synthetic and real benchmark +datasets. Ultimately, our proposed approach shows that the data bias issue does +not necessarily require complex debiasing methods, given that an accurate bias +identification procedure is defined. Source code is available at +https://github.com/Malga-Vision/MoDAD + +
+
+ comment: 13 pages, 8 figures; Accepted at IEEE/CVF Winter Conference on + Applications of Computer Vision (WACV) 2025 +
+
+
+
+
+ + ♻ ☆ MC-LLaVA: Multi-Concept Personalized Vision-Language Model + + +
+ Current vision-language models (VLMs) show exceptional abilities across +diverse tasks including visual question answering. To enhance user experience +in practical applications, recent studies investigate VLM personalization to +understand user-provided concepts. However, existing studies mainly focus on +single-concept personalization, neglecting the existence and interplay of +multiple concepts, which limits the real-world applicability of personalized +VLMs. In this paper, we propose the first multi-concept personalization method +named MC-LLaVA along with a high-quality multi-concept personalization dataset. +Specifically, MC-LLaVA uses a joint training strategy incorporating multiple +concepts in a single training step, allowing VLMs to perform accurately in +multi-concept personalization. To reduce the cost of joint training, MC-LLaVA +leverages visual token information for concept token initialization, yielding +improved concept representation and accelerating joint training. To advance +multi-concept personalization research, we further contribute a high-quality +dataset. We carefully collect images from various movies that contain multiple +characters and manually generate the multi-concept question-answer samples. Our +dataset features diverse movie types and question-answer types. We conduct +comprehensive qualitative and quantitative experiments to demonstrate that +MC-LLaVA can achieve impressive multi-concept personalized responses, paving +the way for VLMs to become better user-specific assistants. The code and +dataset will be publicly available at https://github.com/arctanxarc/MC-LLaVA. + +
+
+
+
+
+ + ♻ ☆ DeiSAM: Segment Anything with Deictic Prompting NeurIPS 2024 + + +
+ Large-scale, pre-trained neural networks have demonstrated strong +capabilities in various tasks, including zero-shot image segmentation. To +identify concrete objects in complex scenes, humans instinctively rely on +deictic descriptions in natural language, i.e., referring to something +depending on the context such as "The object that is on the desk and behind the +cup.". However, deep learning approaches cannot reliably interpret such deictic +representations due to their lack of reasoning capabilities in complex +scenarios. To remedy this issue, we propose DeiSAM -- a combination of large +pre-trained neural networks with differentiable logic reasoners -- for deictic +promptable segmentation. Given a complex, textual segmentation description, +DeiSAM leverages Large Language Models (LLMs) to generate first-order logic +rules and performs differentiable forward reasoning on generated scene graphs. +Subsequently, DeiSAM segments objects by matching them to the logically +inferred image regions. As part of our evaluation, we propose the Deictic +Visual Genome (DeiVG) dataset, containing paired visual input and complex, +deictic textual prompts. Our empirical results demonstrate that DeiSAM is a +substantial improvement over purely data-driven baselines for deictic +promptable segmentation. + +
+
+ comment: Published as a conference paper at NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ A Stitch in Time Saves Nine: Small VLM is a Precise Guidance for + Accelerating Large VLMs + + +
+ Vision-language models (VLMs) have shown remarkable success across various +multi-modal tasks, yet large VLMs encounter significant efficiency challenges +due to processing numerous visual tokens. A promising approach to accelerating +large VLM inference is using partial information, such as attention maps from +specific layers, to assess token importance and prune less essential tokens. +However, our study reveals three key insights: (i) Partial attention +information is insufficient for accurately identifying critical visual tokens, +resulting in suboptimal performance, especially at low token retention ratios; +(ii) Global attention information, such as the attention map aggregated across +all layers, more effectively preserves essential tokens and maintains +comparable performance under aggressive pruning. However, the attention maps +from all layers requires a full inference pass, which increases computational +load and is therefore impractical in existing methods; and (iii) The global +attention map aggregated from a small VLM closely resembles that of a large +VLM, suggesting an efficient alternative. Based on these findings, we introduce +a \textbf{training-free} method, \underline{\textbf{S}}mall VLM +\underline{\textbf{G}}uidance for accelerating \underline{\textbf{L}}arge VLMs +(\textbf{SGL}). Specifically, we employ the attention map aggregated from a +small VLM to guide visual token pruning in a large VLM. Additionally, an early +exiting mechanism is developed to fully use the small VLM's predictions, +dynamically invoking the larger VLM only when necessary, yielding a superior +trade-off between accuracy and computation. Extensive evaluations across 11 +benchmarks demonstrate the effectiveness and generalizability of SGL, achieving +up to 91\% pruning ratio for visual tokens while retaining competitive +performance. + +
+
+
+
+
+ + ♻ ☆ Memory-efficient Continual Learning with Neural Collapse Contrastive WACV 2025 + + +
+ Contrastive learning has significantly improved representation quality, +enhancing knowledge transfer across tasks in continual learning (CL). However, +catastrophic forgetting remains a key challenge, as contrastive based methods +primarily focus on "soft relationships" or "softness" between samples, which +shift with changing data distributions and lead to representation overlap +across tasks. Recently, the newly identified Neural Collapse phenomenon has +shown promise in CL by focusing on "hard relationships" or "hardness" between +samples and fixed prototypes. However, this approach overlooks "softness", +crucial for capturing intra-class variability, and this rigid focus can also +pull old class representations toward current ones, increasing forgetting. +Building on these insights, we propose Focal Neural Collapse Contrastive +(FNC2), a novel representation learning loss that effectively balances both +soft and hard relationships. Additionally, we introduce the Hardness-Softness +Distillation (HSD) loss to progressively preserve the knowledge gained from +these relationships across tasks. Our method outperforms state-of-the-art +approaches, particularly in minimizing memory reliance. Remarkably, even +without the use of memory, our approach rivals rehearsal-based methods, +offering a compelling solution for data privacy concerns. + +
+
+ comment: Accepted at WACV 2025 +
+
+
+
+
+ + ♻ ☆ SAT-HMR: Real-Time Multi-Person 3D Mesh Estimation via Scale-Adaptive + Tokens + + +
+ We propose a one-stage framework for real-time multi-person 3D human mesh +estimation from a single RGB image. While current one-stage methods, which +follow a DETR-style pipeline, achieve state-of-the-art (SOTA) performance with +high-resolution inputs, we observe that this particularly benefits the +estimation of individuals in smaller scales of the image (e.g., those far from +the camera), but at the cost of significantly increased computation overhead. +To address this, we introduce scale-adaptive tokens that are dynamically +adjusted based on the relative scale of each individual in the image within the +DETR framework. Specifically, individuals in smaller scales are processed at +higher resolutions, larger ones at lower resolutions, and background regions +are further distilled. These scale-adaptive tokens more efficiently encode the +image features, facilitating subsequent decoding to regress the human mesh, +while allowing the model to allocate computational resources more effectively +and focus on more challenging cases. Experiments show that our method preserves +the accuracy benefits of high-resolution processing while substantially +reducing computational cost, achieving real-time inference with performance +comparable to SOTA methods. + +
+
+ comment: 16 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ Learning on Model Weights using Tree Experts + + +
+ The increasing availability of public models begs the question: can we train +neural networks that use other networks as input? Such models allow us to study +different aspects of a given neural network, for example, determining the +categories in a model's training dataset. However, machine learning on model +weights is challenging as they often exhibit significant variation unrelated to +the models' semantic properties (nuisance variation). Here, we identify a key +property of real-world models: most public models belong to a small set of +Model Trees, where all models within a tree are fine-tuned from a common +ancestor (e.g., a foundation model). Importantly, we find that within each tree +there is less nuisance variation between models. Concretely, while learning +across Model Trees requires complex architectures, even a linear classifier +trained on a single model layer often works within trees. While effective, +these linear classifiers are computationally expensive, especially when dealing +with larger models that have many parameters. To address this, we introduce +Probing Experts (ProbeX), a theoretically motivated and lightweight method. +Notably, ProbeX is the first probing method specifically designed to learn from +the weights of a single hidden model layer. We demonstrate the effectiveness of +ProbeX by predicting the categories in a model's training dataset based only on +its weights. Excitingly, ProbeX can also map the weights of Stable Diffusion +into a shared weight-language embedding space, enabling zero-shot model +classification. + +
+
+ comment: Project page: https://horwitz.ai/probex/ +
+
+
+
+
+ + ♻ ☆ Words in Motion: Extracting Interpretable Control Vectors for Motion + Transformers + + +
+ Transformer-based models generate hidden states that are difficult to +interpret. In this work, we aim to interpret these hidden states and control +them at inference, with a focus on motion forecasting. We use linear probes to +measure neural collapse towards interpretable motion features in hidden states. +High probing accuracy implies meaningful directions and distances between +hidden states of opposing features, which we use to fit interpretable control +vectors for activation steering at inference. To optimize our control vectors, +we use sparse autoencoders with fully-connected, convolutional, MLPMixer layers +and various activation functions. Notably, we show that enforcing sparsity in +hidden states leads to a more linear relationship between control vector +temperatures and forecasts. Our approach enables mechanistic interpretability +and zero-shot generalization to unseen dataset characteristics with negligible +computational overhead. Our implementation is available at +https://github.com/kit-mrt/future-motion + +
+
+ comment: Add autoencoders with convolutional, MLPMixer layers, and JumpReLU + activations +
+
+
+
+
+ + ♻ ☆ The Hatching-Box: A Novel System for Automated Monitoring and + Quantification of \textit{Drosophila melanogaster} Developmental Behavior + + +
+ In this paper we propose the Hatching-Box, a novel imaging and analysis +system to automatically monitor and quantify the developmental behavior of +Drosophila in standard rearing vials and during regular rearing routines, +rendering explicit experiments obsolete. This is achieved by combining custom +tailored imaging hardware with dedicated detection and tracking algorithms, +enabling the quantification of larvae, filled/empty pupae and flies over +multiple days. Given the affordable and reproducible design of the Hatching-Box +in combination with our generic client/server-based software, the system can +easily be scaled to monitor an arbitrary number of rearing vials +simultaneously. We evaluated our system on a curated image dataset comprising +nearly 470,000 annotated objects and performed several studies on real world +experiments. We successfully reproduced results from well-established circadian +experiments by comparing the eclosion periods of wild type flies to the clock +mutants $\textit{per}^{short}$, $\textit{per}^{long}$ and $\textit{per}^0$ +without involvement of any manual labor. Furthermore, we show that the +Hatching-Box is able to extract additional information about group behavior as +well as to reconstruct the whole life-cycle of the individual specimens. These +results not only demonstrate the applicability of our system for long-term +experiments but also indicate its benefits for automated monitoring in the +general cultivation process. + 
+
+ comment: 17 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ VGGHeads: 3D Multi Head Alignment with a Large-Scale Synthetic Dataset + + +
+ Human head detection, keypoint estimation, and 3D head model fitting are +essential tasks with many applications. However, traditional real-world +datasets often suffer from bias, privacy, and ethical concerns, and they have +been recorded in laboratory environments, which makes it difficult for trained +models to generalize. Here, we introduce VGGHeads -- a large-scale synthetic +dataset generated with diffusion models for human head detection and 3D mesh +estimation. Our dataset comprises over 1 million high-resolution images, each +annotated with detailed 3D head meshes, facial landmarks, and bounding boxes. +Using this dataset, we introduce a new model architecture capable of +simultaneous head detection and head mesh reconstruction from a single image in +a single step. Through extensive experimental evaluations, we demonstrate that +models trained on our synthetic data achieve strong performance on real images. +Furthermore, the versatility of our dataset makes it applicable across a broad +spectrum of tasks, offering a general and comprehensive representation of human +heads. + 
+
+
+
+
+ + ♻ ☆ Transferring disentangled representations: bridging the gap between + synthetic and real images + + +
+ Developing meaningful and efficient representations that separate the +fundamental structure of the data generation mechanism is crucial in +representation learning. However, Disentangled Representation Learning has not +fully shown its potential on real images, because of correlated generative +factors, their resolution and limited access to ground truth labels. +Specifically on the latter, we investigate the possibility of leveraging +synthetic data to learn general-purpose disentangled representations applicable +to real data, discussing the effect of fine-tuning and what properties of +disentanglement are preserved after the transfer. We provide an extensive +empirical study to address these issues. In addition, we propose a new +interpretable intervention-based metric, to measure the quality of factors +encoding in the representation. Our results indicate that some level of +disentanglement, transferring a representation from synthetic to real data, is +possible and effective. + +
+
+
+
+
+ + ♻ ☆ FPANet: Frequency-based Video Demoireing using Frame-level Post + Alignment + + +
+ Moire patterns, created by the interference between overlapping grid patterns +in the pixel space, degrade the visual quality of images and videos. Therefore, +removing such patterns~(demoireing) is crucial, yet remains a challenge due to +their complexities in sizes and distortions. Conventional methods mainly tackle +this task by only exploiting the spatial domain of the input images, limiting +their capabilities in removing large-scale moire patterns. Therefore, this work +proposes FPANet, an image-video demoireing network that learns filters in both +frequency and spatial domains, improving the restoration quality by removing +various sizes of moire patterns. To further enhance, our model takes multiple +consecutive frames, learning to extract frame-invariant content features and +outputting better quality temporally consistent images. We demonstrate the +effectiveness of our proposed method with a publicly available large-scale +dataset, observing that ours outperforms the state-of-the-art approaches in +terms of image and video quality metrics and visual experience. + +
+
+ comment: Accepted version, to appear in Neural Networks +
+
+
+
+
+ + ♻ ☆ Memories are One-to-Many Mapping Alleviators in Talking Face Generation + + +
+ Talking face generation aims at generating photo-realistic video portraits of +a target person driven by input audio. Due to its nature of one-to-many mapping +from the input audio to the output video (e.g., one speech content may have +multiple feasible visual appearances), learning a deterministic mapping like +previous works brings ambiguity during training, and thus causes inferior +visual results. Although this one-to-many mapping could be alleviated in part +by a two-stage framework (i.e., an audio-to-expression model followed by a +neural-rendering model), it is still insufficient since the prediction is +produced without enough information (e.g., emotions, wrinkles, etc.). In this +paper, we propose MemFace to complement the missing information with an +implicit memory and an explicit memory that follow the sense of the two stages +respectively. More specifically, the implicit memory is employed in the +audio-to-expression model to capture high-level semantics in the +audio-expression shared space, while the explicit memory is employed in the +neural-rendering model to help synthesize pixel-level details. Our experimental +results show that our proposed MemFace surpasses all the state-of-the-art +results across multiple scenarios consistently and significantly. + +
+
+ comment: IEEE Transactions on Pattern Analysis and Machine Intelligence + (2024). Project page: see https://memoryface.github.io +
+
+
+
+
+ + ♻ ☆ LUDVIG: Learning-free Uplifting of 2D Visual features to Gaussian + Splatting scenes + + +
+ We address the problem of extending the capabilities of vision foundation +models such as DINO, SAM, and CLIP, to 3D tasks. Specifically, we introduce a +novel method to uplift 2D image features into 3D Gaussian Splatting scenes. +Unlike traditional approaches that rely on minimizing a reconstruction loss, +our method employs a simpler and more efficient feature aggregation technique, +augmented by a graph diffusion mechanism. Graph diffusion enriches features +from a given model, such as CLIP, by leveraging pairwise similarities that +encode 3D geometry or similarities induced by another embedding like DINOv2. +Our approach achieves performance comparable to the state of the art on +multiple downstream tasks while delivering significant speed-ups. Notably, we +obtain competitive segmentation results using generic DINOv2 features, despite +DINOv2 not being trained on millions of annotated segmentation masks like SAM. +When applied to CLIP features, our method demonstrates strong performance in +open-vocabulary, language-based object detection tasks, highlighting the +versatility of our approach. + +
+
+
+
+
+ + ♻ ☆ SPIN: Spacecraft Imagery for Navigation + + +
+ The scarcity of data acquired under actual space operational conditions poses +a significant challenge for developing learning-based visual navigation +algorithms crucial for autonomous spacecraft navigation. This data shortage is +primarily due to the prohibitive costs and inherent complexities of space +operations. While existing datasets, predominantly relying on +computer-simulated data, have partially addressed this gap, they present +notable limitations. Firstly, these datasets often utilize proprietary image +generation tools, restricting the evaluation of navigation methods in novel, +unseen scenarios. Secondly, they provide limited ground-truth data, typically +focusing solely on the spacecraft's translation and rotation relative to the +camera. To address these limitations, we present SPIN (SPacecraft Imagery for +Navigation), an open-source spacecraft image generation tool designed to +support a wide range of visual navigation scenarios in space, with a particular +focus on relative navigation tasks. SPIN provides multiple modalities of +ground-truth data and allows researchers to employ custom 3D models of +satellites, define specific camera-relative poses, and adjust settings such as +camera parameters or environmental illumination conditions. We also propose a +method for exploiting our tool as a data augmentation module. We validate our +tool on the spacecraft pose estimation task by training with a SPIN-generated +replica of SPEED+, reaching a 47% average error reduction on SPEED+ testbed +data (that simulates realistic space conditions), further reducing it to a 60% +error reduction when using SPIN as a data augmentation method. Both the SPIN +tool (and source code) and our SPIN-generated version of SPEED+ will be +publicly released upon paper acceptance on GitHub. +https://github.com/vpulab/SPIN + +
+
+
+
+
+ + ♻ ☆ Scaling Laws for Task-Optimized Models of the Primate Visual Ventral + Stream + + +
+ When trained on large-scale object classification datasets, certain +artificial neural network models begin to approximate core object recognition +(COR) behaviors and neural response patterns in the primate visual ventral +stream (VVS). While recent machine learning advances suggest that scaling model +size, dataset size, and compute resources improve task performance, the impact +of scaling on brain alignment remains unclear. In this study, we explore +scaling laws for modeling the primate VVS by systematically evaluating over 600 +models trained under controlled conditions on benchmarks spanning V1, V2, V4, +IT and COR behaviors. We observe that while behavioral alignment continues to +scale with larger models, neural alignment saturates. This observation remains +true across model architectures and training datasets, even though models with +stronger inductive bias and datasets with higher-quality images are more +compute-efficient. Increased scaling is especially beneficial for higher-level +visual areas, where small models trained on few samples exhibit only poor +alignment. Finally, we develop a scaling recipe, indicating that a greater +proportion of compute should be allocated to data samples over model size. Our +results suggest that while scaling alone might suffice for alignment with human +core object recognition behavior, it will not yield improved models of the +brain's visual ventral stream with current architectures and datasets, +highlighting the need for novel strategies in building brain-like models. + +
+
+ comment: 10 pages for the main paper, 23 pages in total. 7 main figures and 7 + supplementary figures. Code, model weights, and benchmark results can be + accessed at https://github.com/epflneuroailab/scaling-primate-vvs - In + version 2, Figure 7 and the related discussion are added, and the appendix is + updated +
+
+
+
+
+ + ♻ ☆ Unsupervised Modality-Transferable Video Highlight Detection with + Representation Activation Sequence Learning + + +
+ Identifying highlight moments of raw video materials is crucial for improving +the efficiency of editing videos that are pervasive on internet platforms. +However, the extensive work of manually labeling footage has created obstacles +to applying supervised methods to videos of unseen categories. The absence of +an audio modality that contains valuable cues for highlight detection in many +videos also makes it difficult to use multimodal strategies. In this paper, we +propose a novel model with cross-modal perception for unsupervised highlight +detection. The proposed model learns representations with visual-audio level +semantics from image-audio pair data via a self-reconstruction task. To achieve +unsupervised highlight detection, we investigate the latent representations of +the network and propose the representation activation sequence learning (RASL) +module with k-point contrastive learning to learn significant representation +activations. To connect the visual modality with the audio modality, we use the +symmetric contrastive learning (SCL) module to learn the paired visual and +audio representations. Furthermore, an auxiliary task of masked feature vector +sequence (FVS) reconstruction is simultaneously conducted during pretraining +for representation enhancement. During inference, the cross-modal pretrained +model can generate representations with paired visual-audio semantics given +only the visual modality. The RASL module is used to output the highlight +scores. The experimental results show that the proposed framework achieves +superior performance compared to other state-of-the-art approaches. + +
+
+ comment: Accepted by IEEE Transactions on Image Processing, 2024 +
+
+
+
+
+ + ♻ ☆ LRSAA: Large-scale Remote Sensing Image Target Recognition and Automatic + Annotation + + +
+ This paper presents a method for object recognition and automatic labeling in +large-area remote sensing images called LRSAA. The method integrates YOLOv11 +and MobileNetV3-SSD object detection algorithms through ensemble learning to +enhance model performance. Furthermore, it employs Poisson disk sampling +segmentation techniques and the EIOU metric to optimize the training and +inference processes of segmented images, followed by the integration of +results. This approach not only reduces the demand for computational resources +but also achieves a good balance between accuracy and speed. The source code +for this project has been made publicly available on +https://github.com/anaerovane/LRSAA. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2411.07802 +
+
+
+
+
+ + ♻ ☆ SCMM: Calibrating Cross-modal Representations for Text-Based Person + Search + + +
+ Text-Based Person Search (TBPS) is a crucial task that enables accurate +retrieval of target individuals from large-scale galleries with only a given +textual caption. For cross-modal TBPS tasks, it is critical to obtain +well-distributed representation in the common embedding space to reduce the +inter-modal gap. Furthermore, learning detailed image-text correspondences is +essential to discriminate similar targets and enable fine-grained search. To +address these challenges, we present a simple yet effective method named Sew +Calibration and Masked Modeling (SCMM) that calibrates cross-modal +representations by learning compact and well-aligned embeddings. SCMM is +distinguished by two novel losses to provide fine-grained cross-modal +representations: 1) a Sew calibration loss that takes the quality of textual +captions as guidance and aligns features between image and text modalities, and +2) a Masked Caption Modeling (MCM) loss that leverages a masked caption +prediction task to establish detailed and generic relationships between textual +and visual parts. The dual-pronged strategy refines feature alignment and +enriches cross-modal correspondences, enabling the accurate distinction of +similar individuals. Consequently, its streamlined dual-encoder architecture +avoids complex branches and interactions and facilitates high-speed inference +suitable for real-time requirements. This high-speed inference is essential for +resource-limited applications often demanding real-time processing. Extensive +experiments on three popular TBPS benchmarks demonstrate the superiority of +SCMM, achieving top results with 73.81%, 64.25%, and 57.35% Rank-1 accuracy on +CUHK-PEDES, ICFG-PEDES, and RSTPReID, respectively. We hope SCMM's scalable and +cost-effective design will serve as a strong baseline and facilitate future +research in this field. + 
+
+ comment: 10 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Continual Low-Rank Scaled Dot-product Attention + + +
+ Transformers are widely used for their ability to capture data relations in +sequence processing, with great success for a wide range of static tasks. +However, the computational and memory footprint of their main component, i.e., +the Scaled Dot-product Attention, is commonly overlooked. This makes their +adoption in applications involving stream data processing with constraints in +response latency, computational and memory resources infeasible. Some works +have proposed methods to lower the computational cost of transformers, i.e. +low-rank approximations, sparsity in attention, and efficient formulations for +Continual Inference. In this paper, we introduce a new formulation of the +Scaled Dot-product Attention based on the Nyström approximation that is +suitable for Continual Inference. In experiments on Online Audio Classification +and Online Action Detection tasks, the proposed Continual Scaled Dot-product +Attention can lower the number of operations by up to three orders of magnitude +compared to the original Transformers while retaining the predictive +performance of competing models. + 
+
+ comment: 11 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Improving Fine-Grained Control via Aggregation of Multiple Diffusion + Models + + +
+ While many diffusion models perform well when controlling for a particular +aspect among style, character, and interaction, they struggle with fine-grained +control due to dataset limitations and intricate model architecture design. +This paper introduces a novel algorithm, Aggregation of Multiple Diffusion +Models (AMDM), which synthesizes features from multiple diffusion models into a +specified model, activating specific features for fine-grained control. +Experimental results demonstrate that AMDM significantly improves fine-grained +control without training, proving its effectiveness. Additionally, it reveals +that diffusion models initially focus on features such as position, attributes, +and style, with later stages improving generation quality and consistency. AMDM +offers a new perspective for tackling the challenges of fine-grained +conditional control generation in diffusion models: We can fully utilize +existing or develop new conditional diffusion models that control specific +aspects, and then aggregate them using the AMDM algorithm. This eliminates the +need for constructing complex datasets, designing intricate model +architectures, and incurring high training costs. Code is available at: +https://github.com/Hammour-steak/AMDM. + 
+
+
+
+
+ + ♻ ☆ LoRA of Change: Learning to Generate LoRA for the Editing Instruction + from A Single Before-After Image Pair + + +
+ In this paper, we propose the LoRA of Change (LoC) framework for image +editing with visual instructions, i.e., before-after image pairs. Compared to +the ambiguities, insufficient specificity, and diverse interpretations of +natural language, visual instructions can accurately reflect users' intent. +Building on the success of LoRA in text-based image editing and generation, we +dynamically learn an instruction-specific LoRA to encode the "change" in a +before-after image pair, enhancing the interpretability and reusability of our +model. Furthermore, generalizable models for image editing with visual +instructions typically require quad data, i.e., a before-after image pair, +along with query and target images. Due to the scarcity of such quad data, +existing models are limited to a narrow range of visual instructions. To +overcome this limitation, we introduce the LoRA Reverse optimization technique, +enabling large-scale training with paired data alone. Extensive qualitative and +quantitative experiments demonstrate that our model produces high-quality +images that align with user intent and support a broad spectrum of real-world +visual instructions. + +
+
+
+
+
+ + ♻ ☆ Practical Operator Sketching Framework for Accelerating Iterative + Data-Driven Solutions in Inverse Problems + + +
+ We propose a new operator-sketching paradigm for designing efficient +iterative data-driven reconstruction (IDR) schemes, e.g. Plug-and-Play +algorithms and deep unrolling networks. These IDR schemes are currently the +state-of-the-art solutions for imaging inverse problems. However, for +high-dimensional imaging tasks, especially X-ray CT and MRI imaging, these IDR +schemes typically become inefficient both in terms of computation, due to the +need of computing multiple times the high-dimensional forward and adjoint +operators. In this work, we explore and propose a universal dimensionality +reduction framework for accelerating IDR schemes in solving imaging inverse +problems, based on leveraging the sketching techniques from stochastic +optimization. Using this framework, we derive a number of accelerated IDR +schemes, such as the plug-and-play multi-stage sketched gradient (PnP-MS2G) and +sketching-based primal-dual (LSPD and Sk-LSPD) deep unrolling networks. +Meanwhile, for fully accelerating PnP schemes when the denoisers are +computationally expensive, we provide novel stochastic lazy denoising schemes +(Lazy-PnP and Lazy-PnP-EQ), leveraging the ProxSkip scheme in optimization and +equivariant image denoisers, which can massively accelerate the PnP algorithms +with improved practicality. We provide theoretical analysis for recovery +guarantees of instances of the proposed framework. Our numerical experiments on +natural image processing and tomographic image reconstruction demonstrate the +remarkable effectiveness of our sketched IDR schemes. + +
+
+
+
+
+
+
+
+ + Artificial Intelligence 151 + +
+
+
+ + ☆ PaintScene4D: Consistent 4D Scene Generation from Text Prompts + + +
+ Recent advances in diffusion models have revolutionized 2D and 3D content +creation, yet generating photorealistic dynamic 4D scenes remains a significant +challenge. Existing dynamic 4D generation methods typically rely on distilling +knowledge from pre-trained 3D generative models, often fine-tuned on synthetic +object datasets. Consequently, the resulting scenes tend to be object-centric +and lack photorealism. While text-to-video models can generate more realistic +scenes with motion, they often struggle with spatial understanding and provide +limited control over camera viewpoints during rendering. To address these +limitations, we present PaintScene4D, a novel text-to-4D scene generation +framework that departs from conventional multi-view generative models in favor +of a streamlined architecture that harnesses video generative models trained on +diverse real-world datasets. Our method first generates a reference video using +a video generation model, and then employs a strategic camera array selection +for rendering. We apply a progressive warping and inpainting technique to +ensure both spatial and temporal consistency across multiple viewpoints. +Finally, we optimize multi-view images using a dynamic renderer, enabling +flexible camera control based on user preferences. Adopting a training-free +architecture, our PaintScene4D efficiently produces realistic 4D scenes that +can be viewed from arbitrary trajectories. The code will be made publicly +available. Our project page is at https://paintscene4d.github.io/ + +
+
+ comment: Project page: https://paintscene4d.github.io/ +
+
+
+
+
+ + ☆ QUEEN: QUantized Efficient ENcoding of Dynamic Gaussians for Streaming + Free-viewpoint Videos NeurIPS 2024 + + +
+ Online free-viewpoint video (FVV) streaming is a challenging problem, which +is relatively under-explored. It requires incremental on-the-fly updates to a +volumetric representation, fast training and rendering to satisfy real-time +constraints and a small memory footprint for efficient transmission. If +achieved, it can enhance user experience by enabling novel applications, e.g., +3D video conferencing and live volumetric video broadcast, among others. In +this work, we propose a novel framework for QUantized and Efficient ENcoding +(QUEEN) for streaming FVV using 3D Gaussian Splatting (3D-GS). QUEEN directly +learns Gaussian attribute residuals between consecutive frames at each +time-step without imposing any structural constraints on them, allowing for +high quality reconstruction and generalizability. To efficiently store the +residuals, we further propose a quantization-sparsity framework, which contains +a learned latent-decoder for effectively quantizing attribute residuals other +than Gaussian positions and a learned gating module to sparsify position +residuals. We propose to use the Gaussian viewspace gradient difference vector +as a signal to separate the static and dynamic content of the scene. It acts as +a guide for effective sparsity learning and speeds up training. On diverse FVV +benchmarks, QUEEN outperforms the state-of-the-art online FVV methods on all +metrics. Notably, for several highly dynamic scenes, it reduces the model size +to just 0.7 MB per frame while training in under 5 sec and rendering at 350 +FPS. Project website is at https://research.nvidia.com/labs/amri/projects/queen + +
+
+ comment: Accepted at NeurIPS 2024, Project website: + https://research.nvidia.com/labs/amri/projects/queen +
+
+
+
+
+ + ☆ VisionZip: Longer is Better but Not Necessary in Vision Language Models + + +
+ Recent advancements in vision-language models have enhanced performance by +increasing the length of visual tokens, making them much longer than text +tokens and significantly raising computational costs. However, we observe that +the visual tokens generated by popular vision encoders, such as CLIP and +SigLIP, contain significant redundancy. To address this, we introduce +VisionZip, a simple yet effective method that selects a set of informative +tokens for input to the language model, reducing visual token redundancy and +improving efficiency while maintaining model performance. The proposed +VisionZip can be widely applied to image and video understanding tasks and is +well-suited for multi-turn dialogues in real-world scenarios, where previous +methods tend to underperform. Experimental results show that VisionZip +outperforms the previous state-of-the-art method by at least 5% performance +gains across nearly all settings. Moreover, our method significantly enhances +model inference speed, improving the prefilling time by 8x and enabling the +LLaVA-Next 13B model to infer faster than the LLaVA-Next 7B model while +achieving better results. Furthermore, we analyze the causes of this redundancy +and encourage the community to focus on extracting better visual features +rather than merely increasing token length. Our code is available at +https://github.com/dvlab-research/VisionZip . + +
+
+ comment: 2 columns, 28 pages, 15 figures, 18 tables +
+
+
+
+
+ + ☆ Code-as-Monitor: Constraint-aware Visual Programming for Reactive and + Proactive Robotic Failure Detection + + +
+ Automatic detection and prevention of open-set failures are crucial in +closed-loop robotic systems. Recent studies often struggle to simultaneously +identify unexpected failures reactively after they occur and prevent +foreseeable ones proactively. To this end, we propose Code-as-Monitor (CaM), a +novel paradigm leveraging the vision-language model (VLM) for both open-set +reactive and proactive failure detection. The core of our method is to +formulate both tasks as a unified set of spatio-temporal constraint +satisfaction problems and use VLM-generated code to evaluate them for real-time +monitoring. To enhance the accuracy and efficiency of monitoring, we further +introduce constraint elements that abstract constraint-related entities or +their parts into compact geometric elements. This approach offers greater +generality, simplifies tracking, and facilitates constraint-aware visual +programming by leveraging these elements as visual prompts. Experiments show +that CaM achieves a 28.7% higher success rate and reduces execution time by +31.8% under severe disturbances compared to baselines across three simulators +and a real-world setting. Moreover, CaM can be integrated with open-loop +control policies to form closed-loop systems, enabling long-horizon tasks in +cluttered scenes with dynamic environments. + +
+
+ comment: Project page: https://zhoues.github.io/Code-as-Monitor/ +
+
+
+
+
+ + ☆ EgoPlan-Bench2: A Benchmark for Multimodal Large Language Model Planning + in Real-World Scenarios + + +
+ The advent of Multimodal Large Language Models, leveraging the power of Large +Language Models, has recently demonstrated superior multimodal understanding +and reasoning abilities, heralding a new era for artificial general +intelligence. However, achieving AGI necessitates more than just comprehension +and reasoning. A crucial capability required is effective planning in diverse +scenarios, which involves making reasonable decisions based on complex +environments to solve real-world problems. Despite its importance, the planning +abilities of current MLLMs in varied scenarios remain underexplored. In this +paper, we introduce EgoPlan-Bench2, a rigorous and comprehensive benchmark +designed to assess the planning capabilities of MLLMs across a wide range of +real-world scenarios. EgoPlan-Bench2 encompasses everyday tasks spanning 4 +major domains and 24 detailed scenarios, closely aligned with human daily life. +EgoPlan-Bench2 is constructed through a semi-automatic process utilizing +egocentric videos, complemented by manual verification. Grounded in a +first-person perspective, it mirrors the way humans approach problem-solving in +everyday life. We evaluate 21 competitive MLLMs and provide an in-depth +analysis of their limitations, revealing that they face significant challenges +in real-world planning. To further improve the planning proficiency of current +MLLMs, we propose a training-free approach using multimodal Chain-of-Thought +(CoT) prompting through investigating the effectiveness of various multimodal +prompts in complex planning. Our approach enhances the performance of GPT-4V by +10.24 on EgoPlan-Bench2 without additional training. Our work not only sheds +light on the current limitations of MLLMs in planning, but also provides +insights for future enhancements in this critical area. We have made data and +code available at https://qiulu66.github.io/egoplanbench2/. + +
+
+ comment: Code & data are available at: + https://qiulu66.github.io/egoplanbench2/ +
+
+
+
+
+ + ☆ Moto: Latent Motion Token as the Bridging Language for Robot + Manipulation + + +
+ Recent developments in Large Language Models pre-trained on extensive corpora +have shown significant success in various natural language processing tasks +with minimal fine-tuning. This success offers new promise for robotics, which +has long been constrained by the high cost of action-labeled data. We ask: +given the abundant video data containing interaction-related knowledge +available as a rich "corpus", can a similar generative pre-training approach be +effectively applied to enhance robot learning? The key challenge is to identify +an effective representation for autoregressive pre-training that benefits robot +manipulation tasks. Inspired by the way humans learn new skills through +observing dynamic environments, we propose that effective robotic learning +should emphasize motion-related knowledge, which is closely tied to low-level +actions and is hardware-agnostic, facilitating the transfer of learned motions +to actual robot actions. To this end, we introduce Moto, which converts video +content into latent Motion Token sequences by a Latent Motion Tokenizer, +learning a bridging "language" of motion from videos in an unsupervised manner. +We pre-train Moto-GPT through motion token autoregression, enabling it to +capture diverse visual motion knowledge. After pre-training, Moto-GPT +demonstrates the promising ability to produce semantically interpretable motion +tokens, predict plausible motion trajectories, and assess trajectory +rationality through output likelihood. To transfer learned motion priors to +real robot actions, we implement a co-fine-tuning strategy that seamlessly +bridges latent motion token prediction and real robot control. Extensive +experiments show that the fine-tuned Moto-GPT exhibits superior robustness and +efficiency on robot manipulation benchmarks, underscoring its effectiveness in +transferring knowledge from video data to downstream visual manipulation tasks. + +
+
+ comment: Project released at: https://chenyi99.github.io/moto/ +
+
+
+
+
+ + ☆ Marvel: Accelerating Safe Online Reinforcement Learning with Finetuned + Offline Policy + + +
+ The high costs and risks involved in extensive environment interactions +hinder the practical application of current online safe reinforcement learning +(RL) methods. While offline safe RL addresses this by learning policies from +static datasets, the performance therein is usually limited due to reliance on +data quality and challenges with out-of-distribution (OOD) actions. Inspired by +recent successes in offline-to-online (O2O) RL, it is crucial to explore +whether offline safe RL can be leveraged to facilitate faster and safer online +policy learning, a direction that has yet to be fully investigated. To fill +this gap, we first demonstrate that naively applying existing O2O algorithms +from standard RL would not work well in the safe RL setting due to two unique +challenges: \emph{erroneous Q-estimations}, resulted from offline-online +objective mismatch and offline cost sparsity, and \emph{Lagrangian mismatch}, +resulted from difficulties in aligning Lagrange multipliers between offline and +online policies. To address these challenges, we introduce \textbf{Marvel}, a +novel framework for O2O safe RL, comprising two key components that work in +concert: \emph{Value Pre-Alignment} to align the Q-functions with the +underlying truth before online learning, and \emph{Adaptive PID Control} to +effectively adjust the Lagrange multipliers during online finetuning. Extensive +experiments demonstrate that Marvel significantly outperforms existing +baselines in both reward maximization and safety constraint satisfaction. By +introducing the first policy-finetuning based framework for O2O safe RL, which +is compatible with many offline and online safe RL methods, our work has the +great potential to advance the field towards more efficient and practical safe +RL solutions. + +
+
+
+
+
+ + ☆ Florence-VL: Enhancing Vision-Language Models with Generative Vision + Encoder and Depth-Breadth Fusion + + +
+ We present Florence-VL, a new family of multimodal large language models +(MLLMs) with enriched visual representations produced by Florence-2, a +generative vision foundation model. Unlike the widely used CLIP-style vision +transformer trained by contrastive learning, Florence-2 can capture different +levels and aspects of visual features, which are more versatile to be adapted +to diverse downstream tasks. We propose a novel feature-fusion architecture and +an innovative training recipe that effectively integrates Florence-2's visual +features into pretrained LLMs, such as Phi 3.5 and LLama 3. In particular, we +propose "depth-breath fusion (DBFusion)" to fuse the visual features extracted +from different depths and under multiple prompts. Our model training is +composed of end-to-end pretraining of the whole model followed by finetuning of +the projection layer and the LLM, on a carefully designed recipe of diverse +open-source datasets that include high-quality image captions and +instruction-tuning pairs. Our quantitative analysis and visualization of +Florence-VL's visual features show its advantages over popular vision encoders +on vision-language alignment, where the enriched depth and breath play +important roles. Florence-VL achieves significant improvements over existing +state-of-the-art MLLMs across various multi-modal and vision-centric benchmarks +covering general VQA, perception, hallucination, OCR, Chart, +knowledge-intensive understanding, etc. To facilitate future research, our +models and the complete training recipe are open-sourced. +https://github.com/JiuhaiChen/Florence-VL + +
+
+
+
+
+ + ☆ FedDUAL: A Dual-Strategy with Adaptive Loss and Dynamic Aggregation for + Mitigating Data Heterogeneity in Federated Learning + + +
+ Federated Learning (FL) marks a transformative approach to distributed model +training by combining locally optimized models from various clients into a +unified global model. While FL preserves data privacy by eliminating +centralized storage, it encounters significant challenges such as performance +degradation, slower convergence, and reduced robustness of the global model due +to the heterogeneity in client data distributions. Among the various forms of +data heterogeneity, label skew emerges as a particularly formidable and +prevalent issue, especially in domains such as image classification. To address +these challenges, we begin with comprehensive experiments to pinpoint the +underlying issues in the FL training process. Based on our findings, we then +introduce an innovative dual-strategy approach designed to effectively resolve +these issues. First, we introduce an adaptive loss function for client-side +training, meticulously crafted to preserve previously acquired knowledge while +maintaining an optimal equilibrium between local optimization and global model +coherence. Secondly, we develop a dynamic aggregation strategy for aggregating +client models at the server. This approach adapts to each client's unique +learning patterns, effectively addressing the challenges of diverse data across +the network. Our comprehensive evaluation, conducted across three diverse +real-world datasets, coupled with theoretical convergence guarantees, +demonstrates the superior efficacy of our method compared to several +established state-of-the-art approaches. + +
+
+
+
+
+ + ☆ Targeting the Core: A Simple and Effective Method to Attack RAG-based + Agents via Direct LLM Manipulation + + +
+ AI agents, powered by large language models (LLMs), have transformed +human-computer interactions by enabling seamless, natural, and context-aware +communication. While these advancements offer immense utility, they also +inherit and amplify inherent safety risks such as bias, fairness, +hallucinations, privacy breaches, and a lack of transparency. This paper +investigates a critical vulnerability: adversarial attacks targeting the LLM +core within AI agents. Specifically, we test the hypothesis that a deceptively +simple adversarial prefix, such as \textit{Ignore the document}, can compel +LLMs to produce dangerous or unintended outputs by bypassing their contextual +safeguards. Through experimentation, we demonstrate a high attack success rate +(ASR), revealing the fragility of existing LLM defenses. These findings +emphasize the urgent need for robust, multi-layered security measures tailored +to mitigate vulnerabilities at the LLM level and within broader agent-based +architectures. + +
+
+
+
+
+ + ☆ Establishing Task Scaling Laws via Compute-Efficient Model Ladders + + +
+ We develop task scaling laws and model ladders to predict the individual task +performance of pretrained language models (LMs) in the overtrained setting. +Standard power laws for language modeling loss cannot accurately model task +performance. Therefore, we leverage a two-step prediction approach: first use +model and data size to predict a task-specific loss, and then use this task +loss to predict task performance. We train a set of small-scale "ladder" +models, collect data points to fit the parameterized functions of the two +prediction steps, and make predictions for two target models: a 7B model +trained to 4T tokens and a 13B model trained to 5T tokens. Training the ladder +models only costs 1% of the compute used for the target models. On four +multiple-choice tasks written in ranked classification format, we can predict +the accuracy of both target models within 2 points of absolute error. We have +higher prediction error on four other tasks (average absolute error 6.9) and +find that these are often tasks with higher variance in task metrics. We also +find that using less compute to train fewer ladder models tends to deteriorate +predictions. Finally, we empirically show that our design choices and the +two-step approach lead to superior performance in establishing scaling laws. + +
+
+
+
+
+ + ☆ Probabilistic Gaussian Superposition for Efficient 3D Occupancy + Prediction + + +
+ 3D semantic occupancy prediction is an important task for robust +vision-centric autonomous driving, which predicts fine-grained geometry and +semantics of the surrounding scene. Most existing methods leverage dense +grid-based scene representations, overlooking the spatial sparsity of the +driving scenes. Although 3D semantic Gaussian serves as an object-centric +sparse alternative, most of the Gaussians still describe the empty region with +low efficiency. To address this, we propose a probabilistic Gaussian +superposition model which interprets each Gaussian as a probability +distribution of its neighborhood being occupied and conforms to probabilistic +multiplication to derive the overall geometry. Furthermore, we adopt the exact +Gaussian mixture model for semantics calculation to avoid unnecessary +overlapping of Gaussians. To effectively initialize Gaussians in non-empty +region, we design a distribution-based initialization module which learns the +pixel-aligned occupancy distribution instead of the depth of surfaces. We +conduct extensive experiments on nuScenes and KITTI-360 datasets and our +GaussianFormer-2 achieves state-of-the-art performance with high efficiency. +Code: https://github.com/huang-yh/GaussianFormer. + +
+
+ comment: Code is available at: https://github.com/huang-yh/GaussianFormer +
+
+
+
+
+ + ☆ EmbodiedOcc: Embodied 3D Occupancy Prediction for Vision-based Online + Scene Understanding + + +
+ 3D occupancy prediction provides a comprehensive description of the +surrounding scenes and has become an essential task for 3D perception. Most +existing methods focus on offline perception from one or a few views and cannot +be applied to embodied agents which demands to gradually perceive the scene +through progressive embodied exploration. In this paper, we formulate an +embodied 3D occupancy prediction task to target this practical scenario and +propose a Gaussian-based EmbodiedOcc framework to accomplish it. We initialize +the global scene with uniform 3D semantic Gaussians and progressively update +local regions observed by the embodied agent. For each update, we extract +semantic and structural features from the observed image and efficiently +incorporate them via deformable cross-attention to refine the regional +Gaussians. Finally, we employ Gaussian-to-voxel splatting to obtain the global +3D occupancy from the updated 3D Gaussians. Our EmbodiedOcc assumes an unknown +(i.e., uniformly distributed) environment and maintains an explicit global +memory of it with 3D Gaussians. It gradually gains knowledge through local +refinement of regional Gaussians, which is consistent with how humans +understand new scenes through embodied exploration. We reorganize an +EmbodiedOcc-ScanNet benchmark based on local annotations to facilitate the +evaluation of the embodied 3D occupancy prediction task. Experiments +demonstrate that our EmbodiedOcc outperforms existing local prediction methods +and accomplishes the embodied occupancy prediction with high accuracy and +strong expandability. Our code is available at: +https://github.com/YkiWu/EmbodiedOcc. + +
+
+ comment: Code: https://github.com/YkiWu/EmbodiedOcc +
+
+
+
+
+ + ☆ Discriminative Fine-tuning of LVLMs + + +
+ Contrastively-trained Vision-Language Models (VLMs) like CLIP have become the +de facto approach for discriminative vision-language representation learning. +However, these models have limited language understanding, often exhibiting a +"bag of words" behavior. At the same time, Large Vision-Language Models +(LVLMs), which combine vision encoders with LLMs, have been shown capable of +detailed vision-language reasoning, yet their autoregressive nature renders +them less suitable for discriminative tasks. + In this work, we propose to combine "the best of both worlds": a new training +approach for discriminative fine-tuning of LVLMs that results in strong +discriminative and compositional capabilities. Essentially, our approach +converts a generative LVLM into a discriminative one, unlocking its capability +for powerful image-text discrimination combined with enhanced language +understanding. + Our contributions include: (1) A carefully designed training/optimization +framework that utilizes image-text pairs of variable length and granularity for +training the model with both contrastive and next-token prediction losses. This +is accompanied by ablation studies that justify the necessity of our +framework's components. (2) A parameter-efficient adaptation method using a +combination of soft prompting and LoRA adapters. (3) Significant improvements +over state-of-the-art CLIP-like models of similar size, including standard +image-text retrieval benchmarks and notable gains in compositionality. + +
+
+ comment: Preprint. The first two authors contributed equally +
+
+
+
+
+ + ☆ Machine Theory of Mind for Autonomous Cyber-Defence + + +
+ Intelligent autonomous agents hold much potential for the domain of +cyber-security. However, due to many state-of-the-art approaches relying on +uninterpretable black-box models, there is growing demand for methods that +offer stakeholders clear and actionable insights into their latent beliefs and +motivations. To address this, we evaluate Theory of Mind (ToM) approaches for +Autonomous Cyber Operations. Upon learning a robust prior, ToM models can +predict an agent's goals, behaviours, and contextual beliefs given only a +handful of past behaviour observations. In this paper, we introduce a novel +Graph Neural Network (GNN)-based ToM architecture tailored for cyber-defence, +Graph-In, Graph-Out (GIGO)-ToM, which can accurately predict both the targets +and attack trajectories of adversarial cyber agents over arbitrary computer +network topologies. To evaluate the latter, we propose a novel extension of the +Wasserstein distance for measuring the similarity of graph-based probability +distributions. Whereas the standard Wasserstein distance lacks a fixed +reference scale, we introduce a graph-theoretic normalization factor that +enables a standardized comparison between networks of different sizes. We +furnish this metric, which we term the Network Transport Distance (NTD), with a +weighting function that emphasizes predictions according to custom node +features, allowing network operators to explore arbitrary strategic +considerations. Benchmarked against a Graph-In, Dense-Out (GIDO)-ToM +architecture in an abstract cyber-defence environment, our empirical +evaluations show that GIGO-ToM can accurately predict the goals and behaviours +of various unseen cyber-attacking agents across a range of network topologies, +as well as learn embeddings that can effectively characterize their policies. + +
+
+ comment: 29 pages, 17 figures, 12 tables +
+
+
+
+
+ + ☆ Artificial intelligence and the internal processes of creativity + + +
+ Artificial intelligence (AI) systems capable of generating creative outputs +are reshaping our understanding of creativity. This shift presents an +opportunity for creativity researchers to reevaluate the key components of the +creative process. In particular, the advanced capabilities of AI underscore the +importance of studying the internal processes of creativity. This paper +explores the neurobiological machinery that underlies these internal processes +and describes the experiential component of creativity. It is concluded that +although the products of artificial and human creativity can be similar, the +internal processes are different. The paper also discusses how AI may +negatively affect the internal processes of human creativity, such as the +development of skills, the integration of knowledge, and the diversity of +ideas. + +
+
+
+
+
+ + ☆ BhashaVerse : Translation Ecosystem for Indian Subcontinent Languages + + +
+ This paper focuses on developing translation models and related applications +for 36 Indian languages, including Assamese, Awadhi, Bengali, Bhojpuri, Braj, +Bodo, Dogri, English, Konkani, Gondi, Gujarati, Hindi, Hinglish, Ho, Kannada, +Kangri, Kashmiri (Arabic and Devanagari), Khasi, Mizo, Magahi, Maithili, +Malayalam, Marathi, Manipuri (Bengali and Meitei), Nepali, Oriya, Punjabi, +Sanskrit, Santali, Sinhala, Sindhi (Arabic and Devanagari), Tamil, Tulu, +Telugu, and Urdu. Achieving this requires parallel and other types of corpora +for all 36 * 36 language pairs, addressing challenges like script variations, +phonetic differences, and syntactic diversity. For instance, languages like +Kashmiri and Sindhi, which use multiple scripts, demand script normalization +for alignment, while low-resource languages such as Khasi and Santali require +synthetic data augmentation to ensure sufficient coverage and quality. + To address these challenges, this work proposes strategies for corpus +creation by leveraging existing resources, developing parallel datasets, +generating domain-specific corpora, and utilizing synthetic data techniques. +Additionally, it evaluates machine translation across various dimensions, +including standard and discourse-level translation, domain-specific +translation, reference-based and reference-free evaluation, error analysis, and +automatic post-editing. By integrating these elements, the study establishes a +comprehensive framework to improve machine translation quality and enable +better cross-lingual communication in India's linguistically diverse ecosystem. + +
+
+
+
+
+ + ☆ RMD: A Simple Baseline for More General Human Motion Generation via + Training-free Retrieval-Augmented Motion Diffuse + + +
+ While motion generation has made substantial progress, its practical +application remains constrained by dataset diversity and scale, limiting its +ability to handle out-of-distribution scenarios. To address this, we propose a +simple and effective baseline, RMD, which enhances the generalization of motion +generation through retrieval-augmented techniques. Unlike previous +retrieval-based methods, RMD requires no additional training and offers three +key advantages: (1) the external retrieval database can be flexibly replaced; +(2) body parts from the motion database can be reused, with an LLM facilitating +splitting and recombination; and (3) a pre-trained motion diffusion model +serves as a prior to improve the quality of motions obtained through retrieval +and direct combination. Without any training, RMD achieves state-of-the-art +performance, with notable advantages on out-of-distribution data. + +
+
+
+
+
+ + ☆ Retrieval-Augmented Machine Translation with Unstructured Knowledge + + +
+ Retrieval-augmented generation (RAG) introduces additional information to +enhance large language models (LLMs). In machine translation (MT), previous +work typically retrieves in-context examples from paired MT corpora, or +domain-specific knowledge from knowledge graphs, to enhance models' MT ability. +However, a large amount of world knowledge is organized in unstructured +documents, and might not be fully paired across different languages. In this +paper, we study retrieval-augmented MT using unstructured documents. +Specifically, we build RAGtrans, the first benchmark to train and evaluate +LLMs' retrieval-augmented MT ability. RAGtrans contains 79K MT samples +collected via GPT-4o and human translators. Besides, documents from different +languages are also provided to supply the knowledge to these samples. Based on +RAGtrans, we further propose a multi-task training method to teach LLMs how to +use information from multilingual documents during their translation. The +method uses existing multilingual corpora to create auxiliary training +objectives without additional labeling requirements. Extensive experiments show +that the method improves LLMs by 1.58-3.09 BLEU and 1.00-2.03 COMET scores. + +
+
+
+
+
+ + ☆ Action Mapping for Reinforcement Learning in Continuous Environments + with Constraints + + +
+ Deep reinforcement learning (DRL) has had success across various domains, but +applying it to environments with constraints remains challenging due to poor +sample efficiency and slow convergence. Recent literature explored +incorporating model knowledge to mitigate these problems, particularly through +the use of models that assess the feasibility of proposed actions. However, +integrating feasibility models efficiently into DRL pipelines in environments +with continuous action spaces is non-trivial. We propose a novel DRL training +strategy utilizing action mapping that leverages feasibility models to +streamline the learning process. By decoupling the learning of feasible actions +from policy optimization, action mapping allows DRL agents to focus on +selecting the optimal action from a reduced feasible action set. We demonstrate +through experiments that action mapping significantly improves training +performance in constrained environments with continuous action spaces, +especially with imperfect feasibility models. + +
+
+
+
+
+ + ☆ GRAM: Generalization in Deep RL with a Robust Adaptation Module + + +
+ The reliable deployment of deep reinforcement learning in real-world settings +requires the ability to generalize across a variety of conditions, including +both in-distribution scenarios seen during training as well as novel +out-of-distribution scenarios. In this work, we present a framework for +dynamics generalization in deep reinforcement learning that unifies these two +distinct types of generalization within a single architecture. We introduce a +robust adaptation module that provides a mechanism for identifying and reacting +to both in-distribution and out-of-distribution environment dynamics, along +with a joint training pipeline that combines the goals of in-distribution +adaptation and out-of-distribution robustness. Our algorithm GRAM achieves +strong generalization performance across in-distribution and +out-of-distribution scenarios upon deployment, which we demonstrate on a +variety of realistic simulated locomotion tasks with a quadruped robot. + +
+
+
+
+
+ + ☆ The Hyperfitting Phenomenon: Sharpening and Stabilizing LLMs for + Open-Ended Text Generation ICLR + + +
+ This paper introduces the counter-intuitive generalization results of +overfitting pre-trained large language models (LLMs) on very small datasets. In +the setting of open-ended text generation, it is well-documented that LLMs tend +to generate repetitive and dull sequences, a phenomenon that is especially +apparent when generating using greedy decoding. This issue persists even with +state-of-the-art LLMs containing billions of parameters, trained via next-token +prediction on large datasets. We find that by further fine-tuning these models +to achieve a near-zero training loss on a small set of samples -- a process we +refer to as hyperfitting -- the long-sequence generative capabilities are +greatly enhanced. Greedy decoding with these Hyperfitted models even outperform +Top-P sampling over long-sequences, both in terms of diversity and human +preferences. This phenomenon extends to LLMs of various sizes, different +domains, and even autoregressive image generation. We further find this +phenomena to be distinctly different from that of Grokking and double descent. +Surprisingly, our experiments indicate that hyperfitted models rarely fall into +repeating sequences they were trained on, and even explicitly blocking these +sequences results in high-quality output. All hyperfitted models produce +extremely low-entropy predictions, often allocating nearly all probability to a +single token. + +
+
+ comment: Under review at ICLR +
+
+
+
+
+ + ☆ Densing Law of LLMs + + +
+ Large Language Models (LLMs) have emerged as a milestone in artificial
+intelligence, and their performance can improve as the model size increases.
+However, this scaling brings great challenges to training and inference
+efficiency, particularly for deploying LLMs in resource-constrained
+environments, and the scaling trend is becoming increasingly unsustainable.
+This paper introduces the concept of ``\textit{capacity density}'' as a new
+metric to evaluate the quality of the LLMs across different scales and
+describes the trend of LLMs in terms of both effectiveness and efficiency. To
+calculate the capacity density of a given target LLM, we first introduce a set
+of reference models and develop a scaling law to predict the downstream
+performance of these reference models based on their parameter sizes. We then
+define the \textit{effective parameter size} of the target LLM as the parameter
+size required by a reference model to achieve equivalent performance, and
+formalize the capacity density as the ratio of the effective parameter size to
+the actual parameter size of the target LLM. Capacity density provides a
+unified framework for assessing both model effectiveness and efficiency. Our
+further analysis of recent open-source base LLMs reveals an empirical law (the
+densing law) that the capacity density of LLMs grows exponentially over time.
+More specifically, using some widely used benchmarks for evaluation, the
+capacity density of LLMs doubles approximately every three months. The law
+provides new perspectives to guide future LLM development, emphasizing the
+importance of improving capacity density to achieve optimal results with
+minimal computational overhead.
+
+</p>
+
+
+
+
+ + ☆ T2I-FactualBench: Benchmarking the Factuality of Text-to-Image Models + with Knowledge-Intensive Concepts + + +
+ Evaluating the quality of synthesized images remains a significant challenge +in the development of text-to-image (T2I) generation. Most existing studies in +this area primarily focus on evaluating text-image alignment, image quality, +and object composition capabilities, with comparatively fewer studies +addressing the evaluation of the factuality of T2I models, particularly when +the concepts involved are knowledge-intensive. To mitigate this gap, we present +T2I-FactualBench in this work - the largest benchmark to date in terms of the +number of concepts and prompts specifically designed to evaluate the factuality +of knowledge-intensive concept generation. T2I-FactualBench consists of a +three-tiered knowledge-intensive text-to-image generation framework, ranging +from the basic memorization of individual knowledge concepts to the more +complex composition of multiple knowledge concepts. We further introduce a +multi-round visual question answering (VQA) based evaluation framework to +assess the factuality of three-tiered knowledge-intensive text-to-image +generation tasks. Experiments on T2I-FactualBench indicate that current +state-of-the-art (SOTA) T2I models still leave significant room for +improvement. + +
+
+
+
+
+ + ☆ SIDA: Social Media Image Deepfake Detection, Localization and + Explanation with Large Multimodal Model + + +
+ The rapid advancement of generative models in creating highly realistic +images poses substantial risks for misinformation dissemination. For instance, +a synthetic image, when shared on social media, can mislead extensive audiences +and erode trust in digital content, resulting in severe repercussions. Despite +some progress, academia has not yet created a large and diversified deepfake +detection dataset for social media, nor has it devised an effective solution to +address this issue. In this paper, we introduce the Social media Image +Detection dataSet (SID-Set), which offers three key advantages: (1) extensive +volume, featuring 300K AI-generated/tampered and authentic images with +comprehensive annotations, (2) broad diversity, encompassing fully synthetic +and tampered images across various classes, and (3) elevated realism, with +images that are predominantly indistinguishable from genuine ones through mere +visual inspection. Furthermore, leveraging the exceptional capabilities of +large multimodal models, we propose a new image deepfake detection, +localization, and explanation framework, named SIDA (Social media Image +Detection, localization, and explanation Assistant). SIDA not only discerns the +authenticity of images, but also delineates tampered regions through mask +prediction and provides textual explanations of the model's judgment criteria. +Compared with state-of-the-art deepfake detection models on SID-Set and other +benchmarks, extensive experiments demonstrate that SIDA achieves superior +performance among diversified settings. The code, model, and dataset will be +released. + +
+
+
+
+
+ + ☆ PoTable: Programming Standardly on Table-based Reasoning Like a Human + Analyst + + +
+ Table-based reasoning has garnered substantial research interest, +particularly in its integration with Large Language Model (LLM) which has +revolutionized the general reasoning paradigm. Numerous LLM-based studies +introduce symbolic tools (e.g., databases, Python) as assistants to extend +human-like abilities in structured table understanding and complex arithmetic +computations. However, these studies can be improved better in simulating human +cognitive behavior when using symbolic tools, as they still suffer from +limitations of non-standard logical splits and constrained operation pools. In +this study, we propose PoTable as a novel table-based reasoning method that +simulates a human tabular analyst, which integrates a Python interpreter as the +real-time executor accompanied by an LLM-based operation planner and code +generator. Specifically, PoTable follows a human-like logical stage split and +extends the operation pool into an open-world space without any constraints. +Through planning and executing in each distinct stage, PoTable standardly +completes the entire reasoning process and produces superior reasoning results +along with highly accurate, steply commented and completely executable +programs. Accordingly, the effectiveness and explainability of PoTable are +fully demonstrated. Extensive experiments over three evaluation datasets from +two public benchmarks on two backbones show the outstanding performance of our +approach. In particular, GPT-based PoTable achieves over 4% higher absolute +accuracy than runner-ups on all evaluation datasets. + +
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ☆ Enhancing Whole Slide Image Classification through Supervised + Contrastive Domain Adaptation + + +
+ Domain shift in the field of histopathological imaging is a common phenomenon +due to the intra- and inter-hospital variability of staining and digitization +protocols. The implementation of robust models, capable of creating generalized +domains, represents a need to be solved. In this work, a new domain adaptation +method to deal with the variability between histopathological images from +multiple centers is presented. In particular, our method adds a training +constraint to the supervised contrastive learning approach to achieve domain +adaptation and improve inter-class separability. Experiments performed on +domain adaptation and classification of whole-slide images of six skin cancer +subtypes from two centers demonstrate the method's usefulness. The results +reflect superior performance compared to not using domain adaptation after +feature extraction or staining normalization. + +
+
+ comment: Accepted in CASEIB 2024 +
+
+
+
+
+ + ☆ Transient Multi-Agent Path Finding for Lifelong Navigation in Dense + Environments ICAPS 2025 + + +
+ Multi-Agent Path Finding (MAPF) deals with finding conflict-free paths for a +set of agents from an initial configuration to a given target configuration. +The Lifelong MAPF (LMAPF) problem is a well-studied online version of MAPF in +which an agent receives a new target when it reaches its current target. The +common approach for solving LMAPF is to treat it as a sequence of MAPF +problems, periodically replanning from the agents' current configurations to +their current targets. A significant drawback in this approach is that in MAPF +the agents must reach a configuration in which all agents are at their targets +simultaneously, which is needlessly restrictive for LMAPF. Techniques have been +proposed to indirectly mitigate this drawback. We describe cases where these +mitigation techniques fail. As an alternative, we propose to solve LMAPF +problems by solving a sequence of modified MAPF problems, in which the +objective is for each agent to eventually visit its target, but not necessarily +for all agents to do so simultaneously. We refer to this MAPF variant as +Transient MAPF (TMAPF) and propose several algorithms for solving it based on +existing MAPF algorithms. A limited experimental evaluation identifies some +cases where using a TMAPF algorithm instead of a MAPF algorithm with an LMAPF +framework can improve the system throughput significantly. + +
+
+ comment: Submitted to The 35th International Conference on Automated Planning + and Scheduling (ICAPS 2025) +
+
+
+
+
+ + ☆ CLINICSUM: Utilizing Language Models for Generating Clinical Summaries + from Patient-Doctor Conversations + + +
+ This paper presents ClinicSum, a novel framework designed to automatically
+generate clinical summaries from patient-doctor conversations. It utilizes a
+two-module architecture: a retrieval-based filtering module that extracts
+Subjective, Objective, Assessment, and Plan (SOAP) information from
+conversation transcripts, and an inference module powered by fine-tuned
+Pre-trained Language Models (PLMs), which leverage the extracted SOAP data to
+generate abstracted clinical summaries. To fine-tune the PLM, we created a
+training dataset consisting of 1,473 conversation-summary pairs by
+consolidating two publicly available datasets, FigShare and MTS-Dialog, with
+ground truth summaries validated by Subject Matter Experts (SMEs). ClinicSum's
+effectiveness is evaluated through both automatic metrics (e.g., ROUGE,
+BERTScore) and expert human assessments. Results show that ClinicSum
+outperforms state-of-the-art PLMs, demonstrating superior precision, recall,
+and F-1 scores in automatic evaluations and receiving high preference from SMEs
+in human assessment, making it a robust solution for automated clinical
+summarization.
+
+</p>
+
+ comment: accepted at the 2024 IEEE International Conference on Big Data
+  Workshop on Big Data and AI for Healthcare
+</p>
+
+
+
+
+ + ☆ DEIM: DETR with Improved Matching for Fast Convergence + + +
+ We introduce DEIM, an innovative and efficient training framework designed to +accelerate convergence in real-time object detection with Transformer-based +architectures (DETR). To mitigate the sparse supervision inherent in one-to-one +(O2O) matching in DETR models, DEIM employs a Dense O2O matching strategy. This +approach increases the number of positive samples per image by incorporating +additional targets, using standard data augmentation techniques. While Dense +O2O matching speeds up convergence, it also introduces numerous low-quality +matches that could affect performance. To address this, we propose the +Matchability-Aware Loss (MAL), a novel loss function that optimizes matches +across various quality levels, enhancing the effectiveness of Dense O2O. +Extensive experiments on the COCO dataset validate the efficacy of DEIM. When +integrated with RT-DETR and D-FINE, it consistently boosts performance while +reducing training time by 50%. Notably, paired with RT-DETRv2, DEIM achieves +53.2% AP in a single day of training on an NVIDIA 4090 GPU. Additionally, +DEIM-trained real-time models outperform leading real-time object detectors, +with DEIM-D-FINE-L and DEIM-D-FINE-X achieving 54.7% and 56.5% AP at 124 and 78 +FPS on an NVIDIA T4 GPU, respectively, without the need for additional data. We +believe DEIM sets a new baseline for advancements in real-time object +detection. Our code and pre-trained models are available at +https://github.com/ShihuaHuang95/DEIM. + +
+
+ comment: Exceeding all existing real-time object detectors, including YOLOv11 + and D-FINE +
+
+
+
+
+ + ☆ HyperMARL: Adaptive Hypernetworks for Multi-Agent RL + + +
+ Balancing individual specialisation and shared behaviours is a critical +challenge in multi-agent reinforcement learning (MARL). Existing methods +typically focus on encouraging diversity or leveraging shared representations. +Full parameter sharing (FuPS) improves sample efficiency but struggles to learn +diverse behaviours when required, while no parameter sharing (NoPS) enables +diversity but is computationally expensive and sample inefficient. To address +these challenges, we introduce HyperMARL, a novel approach using hypernetworks +to balance efficiency and specialisation. HyperMARL generates agent-specific +actor and critic parameters, enabling agents to adaptively exhibit diverse or +homogeneous behaviours as needed, without modifying the learning objective or +requiring prior knowledge of the optimal diversity. Furthermore, HyperMARL +decouples agent-specific and state-based gradients, which empirically +correlates with reduced policy gradient variance, potentially offering insights +into its ability to capture diverse behaviours. Across MARL benchmarks +requiring homogeneous, heterogeneous, or mixed behaviours, HyperMARL +consistently matches or outperforms FuPS, NoPS, and diversity-focused methods, +achieving NoPS-level diversity with a shared architecture. These results +highlight the potential of hypernetworks as a versatile approach to the +trade-off between specialisation and shared behaviours in MARL. + +
+
+
+
+
+ + ☆ Customize Segment Anything Model for Multi-Modal Semantic Segmentation + with Mixture of LoRA Experts + + +
+ The recent Segment Anything Model (SAM) represents a significant breakthrough +in scaling segmentation models, delivering strong performance across various +downstream applications in the RGB modality. However, directly applying SAM to +emerging visual modalities, such as depth and event data results in suboptimal +performance in multi-modal segmentation tasks. In this paper, we make the first +attempt to adapt SAM for multi-modal semantic segmentation by proposing a +Mixture of Low-Rank Adaptation Experts (MoE-LoRA) tailored for different input +visual modalities. By training only the MoE-LoRA layers while keeping SAM's +weights frozen, SAM's strong generalization and segmentation capabilities can +be preserved for downstream tasks. Specifically, to address cross-modal +inconsistencies, we propose a novel MoE routing strategy that adaptively +generates weighted features across modalities, enhancing multi-modal feature +integration. Additionally, we incorporate multi-scale feature extraction and +fusion by adapting SAM's segmentation head and introducing an auxiliary +segmentation head to combine multi-scale features for improved segmentation +performance effectively. Extensive experiments were conducted on three +multi-modal benchmarks: DELIVER, MUSES, and MCubeS. The results consistently +demonstrate that the proposed method significantly outperforms state-of-the-art +approaches across diverse scenarios. Notably, under the particularly +challenging condition of missing modalities, our approach exhibits a +substantial performance gain, achieving an improvement of 32.15% compared to +existing methods. + +
+
+
+
+
+ + ☆ Relationships between Keywords and Strong Beats in Lyrical Music + + +
+ Artificial Intelligence (AI) song generation has emerged as a popular topic, +yet the focus on exploring the latent correlations between specific lyrical and +rhythmic features remains limited. In contrast, this pilot study particularly +investigates the relationships between keywords and rhythmically stressed +features such as strong beats in songs. It focuses on several key elements: +keywords or non-keywords, stressed or unstressed syllables, and strong or weak +beats, with the aim of uncovering insightful correlations. Experimental results +indicate that, on average, 80.8\% of keywords land on strong beats, whereas +62\% of non-keywords fall on weak beats. The relationship between stressed +syllables and strong or weak beats is weak, revealing that keywords have the +strongest relationships with strong beats. Additionally, the lyrics-rhythm +matching score, a key matching metric measuring keywords on strong beats and +non-keywords on weak beats across various time signatures, is 0.765, while the +matching score for syllable types is 0.495. This study demonstrates that word +types strongly align with their corresponding beat types, as evidenced by the +distinct patterns, whereas syllable types exhibit a much weaker alignment. This +disparity underscores the greater reliability of word types in capturing +rhythmic structures in music, highlighting their crucial role in effective +rhythmic matching and analysis. We also conclude that keywords that +consistently align with strong beats are more reliable indicators of +lyrics-rhythm associations, providing valuable insights for AI-driven song +generation through enhanced structural analysis. Furthermore, our development +of tailored Lyrics-Rhythm Matching (LRM) metrics maximizes lyrical alignments +with corresponding beat stresses, and our novel LRM file format captures +critical lyrical and rhythmic information without needing original sheet music. + +
+
+ comment: Accepted by IEEE BigData 2024 +
+
+
+
+
+ + ☆ Directed Structural Adaptation to Overcome Statistical Conflicts and + Enable Continual Learning AAAI-2024 + + +
+ Adaptive networks today rely on overparameterized fixed topologies that
+cannot break through the statistical conflicts they encounter in the data they
+are exposed to, and are prone to "catastrophic forgetting" as the network
+attempts to reuse the existing structures to learn a new task. We propose a
+structural adaptation method, DIRAD, that can complexify as needed and in a
+directed manner without being limited by statistical conflicts within a
+dataset. We then extend this method and present the PREVAL framework, designed
+to prevent "catastrophic forgetting" in continual learning by detection of new
+data and assigning encountered data to suitable models adapted to process them,
+without needing task labels anywhere in the workflow. We show the reliability
+of the DIRAD in growing a network with high performance and orders-of-magnitude
+simpler than fixed topology networks; and demonstrate the proof-of-concept
+operation of PREVAL, in which continual adaptation to new tasks is observed
+while being able to detect and discern previously-encountered tasks.
+
+</p>
+
+ comment: Presented in Deployable AI (DAI) workshop at AAAI-2024 +
+
+
+
+
+ + ☆ Leveraging Large Language Models to Generate Course-specific + Semantically Annotated Learning Objects + + +
+ Background: Over the past few decades, the process and methodology of +automated question generation (AQG) have undergone significant transformations. +Recent progress in generative natural language models has opened up new +potential in the generation of educational content. + Objectives: This paper explores the potential of large language models (LLMs) +for generating computer science questions that are sufficiently annotated for +automatic learner model updates, are fully situated in the context of a +particular course, and address the cognitive dimension understand. + Methods: Unlike previous attempts that might use basic methods like ChatGPT, +our approach involves more targeted strategies such as retrieval-augmented +generation (RAG) to produce contextually relevant and pedagogically meaningful +learning objects. + Results and Conclusions: Our results show that generating structural, +semantic annotations works well. However, this success was not reflected in the +case of relational annotations. The quality of the generated questions often +did not meet educational standards, highlighting that although LLMs can +contribute to the pool of learning materials, their current level of +performance requires significant human intervention to refine and validate the +generated content. + +
+
+ comment: Accepted at Journal of Computer Assisted Learning (2024) +
+
+
+
+
+ + ☆ Bench-CoE: a Framework for Collaboration of Experts from Benchmark + + +
+ Large Language Models (LLMs) are key technologies driving intelligent systems
+to handle multiple tasks. To meet the demands of various tasks, an increasing
+number of LLMs-driven experts with diverse capabilities have been developed,
+accompanied by corresponding benchmarks to evaluate their performance. This
+paper proposes the Bench-CoE framework, which enables Collaboration of Experts
+(CoE) by effectively leveraging benchmark evaluations to achieve optimal
+performance across various tasks. Bench-CoE includes a set of expert models, a
+router for assigning tasks to corresponding experts, and a benchmark dataset
+for training the router. Moreover, we formulate Query-Level and Subject-Level
+approaches based on our framework, and analyze the merits and drawbacks of
+these two approaches. Finally, we conduct a series of experiments with varying
+data distributions on both language and multimodal tasks to validate that our
+proposed Bench-CoE outperforms any single model in terms of overall
+performance. We hope this method serves as a baseline for further research in
+this area. The code is available at
+\url{https://github.com/ZhangXJ199/Bench-CoE}.
+
+</p>
+
+ comment: The code is available at + \url{https://github.com/ZhangXJ199/Bench-CoE} +
+
+
+
+
+ + ☆ Frequency-Adaptive Low-Latency Object Detection Using Events and Frames + + +
+ Fusing Events and RGB images for object detection leverages the robustness of +Event cameras in adverse environments and the rich semantic information +provided by RGB cameras. However, two critical mismatches: low-latency Events +\textit{vs.}~high-latency RGB frames; temporally sparse labels in training +\textit{vs.}~continuous flow in inference, significantly hinder the +high-frequency fusion-based object detection. To address these challenges, we +propose the \textbf{F}requency-\textbf{A}daptive Low-Latency \textbf{O}bject +\textbf{D}etector (FAOD). FAOD aligns low-frequency RGB frames with +high-frequency Events through an Align Module, which reinforces cross-modal +style and spatial proximity to address the Event-RGB Mismatch. We further +propose a training strategy, Time Shift, which enforces the module to align the +prediction from temporally shifted Event-RGB pairs and their original +representation, that is, consistent with Event-aligned annotations. This +strategy enables the network to use high-frequency Event data as the primary +reference while treating low-frequency RGB images as supplementary information, +retaining the low-latency nature of the Event stream toward high-frequency +detection. Furthermore, we observe that these corrected Event-RGB pairs +demonstrate better generalization from low training frequency to higher +inference frequencies compared to using Event data alone. Extensive experiments +on the PKU-DAVIS-SOD and DSEC-Detection datasets demonstrate that our FAOD +achieves SOTA performance. Specifically, in the PKU-DAVIS-SOD Dataset, FAOD +achieves 9.8 points improvement in terms of the mAP in fully paired Event-RGB +data with only a quarter of the parameters compared to SODFormer, and even +maintains robust performance (only a 3 points drop in mAP) under 80$\times$ +Event-RGB frequency mismatch. + +
+
+
+
+
+ + ☆ If You Can't Use Them, Recycle Them: Optimizing Merging at Scale + Mitigates Performance Tradeoffs + + +
+ Model merging has shown great promise at combining expert models, but the
+benefit of merging is unclear when merging ``generalist'' models trained on
+many tasks. We explore merging in the context of large ($\sim100$B) models, by
+\textit{recycling} checkpoints that exhibit tradeoffs among different tasks.
+Such checkpoints are often created in the process of developing a frontier
+model, and many suboptimal ones are usually discarded. Given a pool of model
+checkpoints obtained from different training runs (e.g., different stages,
+objectives, hyperparameters, and data mixtures), which naturally show tradeoffs
+across different language capabilities (e.g., instruction following vs. code
+generation), we investigate whether merging can recycle such suboptimal models
+into a Pareto-optimal one. Our optimization algorithm tunes the weight of each
+checkpoint in a linear combination, resulting in a Pareto-optimal model that
+outperforms both individual models and merge-based baselines. Further analysis
+shows that good merges tend to include almost all checkpoints with
+non-zero weights, indicating that even seemingly bad initial checkpoints can
+contribute to good final merges.
+
+</p>
+
+ comment: 13 pages, 9 figures +
+
+
+
+
+ + ☆ Methodology for Online Estimation of Rheological Parameters in Polymer + Melts Using Deep Learning and Microfluidics + + +
+ Microfluidic devices are increasingly used in biological and chemical +experiments due to their cost-effectiveness for rheological estimation in +fluids. However, these devices often face challenges in terms of accuracy, +size, and cost. This study presents a methodology, integrating deep learning, +modeling and simulation to enhance the design of microfluidic systems, used to +develop an innovative approach for viscosity measurement of polymer melts. We +use synthetic data generated from the simulations to train a deep learning +model, which then identifies rheological parameters of polymer melts from +pressure drop and flow rate measurements in a microfluidic circuit, enabling +online estimation of fluid properties. By improving the accuracy and +flexibility of microfluidic rheological estimation, our methodology accelerates +the design and testing of microfluidic devices, reducing reliance on physical +prototypes, and offering significant contributions to the field. + +
+
+ comment: 12 pages, 6 figures, Winter Simulation Conference 2024 +
+
+
+
+
+ + ☆ Understanding Memorization in Generative Models via Sharpness in + Probability Landscapes + + +
+ In this paper, we introduce a geometric framework to analyze memorization in +diffusion models using the eigenvalues of the Hessian of the log probability +density. We propose that memorization arises from isolated points in the +learned probability distribution, characterized by sharpness in the probability +landscape, as indicated by large negative eigenvalues of the Hessian. Through +experiments on various datasets, we demonstrate that these eigenvalues +effectively detect and quantify memorization. Our approach provides a clear +understanding of memorization in diffusion models and lays the groundwork for +developing strategies to ensure secure and reliable generative models + +
+
+
+
+
+ + ☆ Monet: Mixture of Monosemantic Experts for Transformers + + +
+ Understanding the internal computations of large language models (LLMs) is
+crucial for aligning them with human values and preventing undesirable
+behaviors like toxic content generation. However, mechanistic interpretability
+is hindered by polysemanticity -- where individual neurons respond to multiple,
+unrelated concepts. While Sparse Autoencoders (SAEs) have attempted to
+disentangle these features through sparse dictionary learning, they have
+compromised LLM performance due to reliance on post-hoc reconstruction loss. To
+address this issue, we introduce Mixture of Monosemantic Experts for
+Transformers (Monet) architecture, which incorporates sparse dictionary
+learning directly into end-to-end Mixture-of-Experts pretraining. Our novel
+expert decomposition method enables scaling the expert count to 262,144 per
+layer while total parameters scale proportionally to the square root of the
+number of experts. Our analyses demonstrate mutual exclusivity of knowledge
+across experts and showcase the parametric knowledge encapsulated within
+individual experts. Moreover, Monet allows knowledge manipulation over domains,
+languages, and toxicity mitigation without degrading general performance. Our
+pursuit of transparent LLMs highlights the potential of scaling expert counts
+to enhance mechanistic interpretability and directly resect the internal
+knowledge to fundamentally adjust model behavior. The source code and
+pretrained checkpoints are available at https://github.com/dmis-lab/Monet.
+
+</p>
+
+
+
+
+ + ☆ Text Change Detection in Multilingual Documents Using Image Comparison + + +
+ Document comparison typically relies on optical character recognition (OCR) +as its core technology. However, OCR requires the selection of appropriate +language models for each document and the performance of multilingual or hybrid +models remains limited. To overcome these challenges, we propose text change +detection (TCD) using an image comparison model tailored for multilingual +documents. Unlike OCR-based approaches, our method employs word-level text +image-to-image comparison to detect changes. Our model generates bidirectional +change segmentation maps between the source and target documents. To enhance +performance without requiring explicit text alignment or scaling preprocessing, +we employ correlations among multi-scale attention features. We also construct +a benchmark dataset comprising actual printed and scanned word pairs in various +languages to evaluate our model. We validate our approach using our benchmark +dataset and public benchmarks Distorted Document Images and the LRDE Document +Binarization Dataset. We compare our model against state-of-the-art semantic +segmentation and change detection models, as well as to conventional OCR-based +models. + +
+
+ comment: 15 pages, 11 figures, 6 tables; accepted at WACV 2025
+</p>
+
+
+
+
+ + ☆ DeepFEA: Deep Learning for Prediction of Transient Finite Element + Analysis Solutions + + +
+ Finite Element Analysis (FEA) is a powerful but computationally intensive +method for simulating physical phenomena. Recent advancements in machine +learning have led to surrogate models capable of accelerating FEA. Yet there +are still limitations in developing surrogates of transient FEA models that can +simultaneously predict the solutions for both nodes and elements with +applicability on both the 2D and 3D domains. Motivated by this research gap, +this study proposes DeepFEA, a deep learning-based framework that leverages a +multilayer Convolutional Long Short-Term Memory (ConvLSTM) network branching +into two parallel convolutional neural networks to predict the solutions for +both nodes and elements of FEA models. The proposed network is optimized using +a novel adaptive learning algorithm, called Node-Element Loss Optimization +(NELO). NELO minimizes the error occurring at both branches of the network +enabling the prediction of solutions for transient FEA simulations. The +experimental evaluation of DeepFEA is performed on three datasets in the +context of structural mechanics, generated to serve as publicly available +reference datasets. The results show that DeepFEA can achieve less than 3% +normalized mean and root mean squared error for 2D and 3D simulation scenarios, +and inference times that are two orders of magnitude faster than FEA. In +contrast, relevant state-of-the-art methods face challenges with +multi-dimensional output and dynamic input prediction. Furthermore, DeepFEA's +robustness was demonstrated in a real-life biomedical scenario, confirming its +suitability for accurate and efficient predictions of FEA simulations. + +
+
+ comment: This work has been submitted to a journal for possible publication +
+
+
+
+
+ + ☆ Thermal and RGB Images Work Better Together in Wind Turbine Damage + Detection + + +
+ The inspection of wind turbine blades (WTBs) is crucial for ensuring their +structural integrity and operational efficiency. Traditional inspection methods +can be dangerous and inefficient, prompting the use of unmanned aerial vehicles +(UAVs) that access hard-to-reach areas and capture high-resolution imagery. In +this study, we address the challenge of enhancing defect detection on WTBs by +integrating thermal and RGB images obtained from UAVs. We propose a +multispectral image composition method that combines thermal and RGB imagery +through spatial coordinate transformation, key point detection, binary +descriptor creation, and weighted image overlay. Using a benchmark dataset of +WTB images annotated for defects, we evaluated several state-of-the-art object +detection models. Our results show that composite images significantly improve +defect detection efficiency. Specifically, the YOLOv8 model's accuracy +increased from 91% to 95%, precision from 89% to 94%, recall from 85% to 92%, +and F1-score from 87% to 93%. The number of false positives decreased from 6 to +3, and missed defects reduced from 5 to 2. These findings demonstrate that +integrating thermal and RGB imagery enhances defect detection on WTBs, +contributing to improved maintenance and reliability. + +
+
+ comment: Unmanned aerial vehicle, image composition, multispectral images, + green energy, data quality management, weighted overlay +
+
+
+
+
+ + ☆ Enhancing Mathematical Reasoning in LLMs with Background Operators + + +
+ We propose utilizing background operators for mathematical reasoning in large +language models (LLMs). To achieve this, we define a set of fundamental +mathematical predicates as the basic building blocks. For each mathematical +problem, we develop a Prolog solution that includes problem-specific predicates +and intermediate predicates derived from these background operators, ensuring +that each solution adheres to the defined operator set. We introduce the +MATH-Prolog corpus, which is derived from the counting and probability +categories of the MATH corpus. For efficient data augmentation, we apply K-fold +cross-validated self-training. This method incrementally generates new Prolog +solutions for each fold, incorporating those verified as correct into the +training set throughout the model training process. Our experimental results +demonstrate that 5-fold crossvalidated self-training effectively identifies +new, accurate Prolog solutions, achieving an accuracy of 84.6% on the +cross-validated set, and 84.8% on the test set during fine-tuning the +Meta-Llama-3.1-8B-Instruct model. This approach successfully uncovers new +solutions with fully computable inference steps for previously unseen problems. +Additionally, incorporating the background mathematical predicates into the +prompt enhances solution coverage. + +
+
+
+
+
+ + ☆ Pre-train, Align, and Disentangle: Empowering Sequential Recommendation + with Large Language Models + + +
+ Sequential recommendation (SR) aims to model the sequential dependencies in
+users' historical interactions to better capture their evolving interests.
+However, existing SR approaches primarily rely on collaborative data, which
+leads to limitations such as the cold-start problem and sub-optimal
+performance. Meanwhile, despite the success of large language models (LLMs),
+their application in industrial recommender systems is hindered by high
+inference latency, inability to capture all distribution statistics, and
+catastrophic forgetting. To this end, we propose a novel Pre-train, Align, and
+Disentangle (PAD) paradigm to empower recommendation models with LLMs.
+Specifically, we first pre-train both the SR and LLM models to get
+collaborative and textual embeddings. Next, a characteristic
+recommendation-anchored alignment loss is proposed using multi-kernel maximum
+mean discrepancy with Gaussian kernels. Finally, a triple-experts architecture,
+consisting of aligned and modality-specific experts with disentangled
+embeddings, is fine-tuned in a frequency-aware manner. Experiments conducted on
+three public datasets demonstrate the effectiveness of PAD, showing significant
+improvements and compatibility with various SR backbone models, especially on
+cold items. The implementation code and datasets will be publicly available.
+
+
+
+
+
+ + ☆ Missing Melodies: AI Music Generation and its "Nearly" Complete Omission + of the Global South + + +
+ Recent advances in generative AI have sparked renewed interest and expanded
+possibilities for music generation. However, the performance and versatility of
+these systems across musical genres are heavily influenced by the availability
+of training data. We conducted an extensive analysis of over one million hours
+of audio datasets used in AI music generation research and manually reviewed
+more than 200 papers from eleven prominent AI and music conferences and
+organizations (AAAI, ACM, EUSIPCO, EURASIP, ICASSP, ICML, IJCAI, ISMIR,
+NeurIPS, NIME, SMC) to identify a critical gap in the fair representation and
+inclusion of the musical genres of the Global South in AI research. Our
+findings reveal a stark imbalance: approximately 86% of the total dataset hours
+and over 93% of researchers focus primarily on music from the Global North.
+While around 40% of these datasets include some form of non-Western music,
+genres from the Global South account for only 14.6% of the data. Furthermore,
+approximately 51% of the papers surveyed concentrate on symbolic music
+generation, a method that often fails to capture the cultural nuances inherent
+in music from regions such as South Asia, the Middle East, and Africa. As AI
+increasingly shapes the creation and dissemination of music, the significant
+underrepresentation of music genres in datasets and research presents a serious
+threat to global musical diversity. We also propose some important steps to
+mitigate these risks and foster a more inclusive future for AI-driven music
+generation.
+
+
+ comment: Submitted to CACM, 12 pages, 2 figures +
+
+
+
+
+ + ☆ D-LORD for Motion Stylization + + +
+ This paper introduces a novel framework named D-LORD (Double Latent
+Optimization for Representation Disentanglement), which is designed for motion
+stylization (motion style transfer and motion retargeting). The primary
+objective of this framework is to separate the class and content information
+from a given motion sequence using a data-driven latent optimization approach.
+Here, class refers to person-specific style, such as a particular emotion or an
+individual's identity, while content relates to the style-agnostic aspect of an
+action, such as walking or jumping, as universally understood concepts. The key
+advantage of D-LORD is its ability to perform style transfer without needing
+paired motion data. Instead, it utilizes class and content labels during the
+latent optimization process. By disentangling the representation, the framework
+enables the transformation of one motion sequence's style to another's style
+using Adaptive Instance Normalization. The proposed D-LORD framework is
+designed with a focus on generalization, allowing it to handle different class
+and content labels for various applications. Additionally, it can generate
+diverse motion sequences when specific class and content labels are provided.
+The framework's efficacy is demonstrated through experimentation on three
+datasets: the CMU XIA dataset for motion style transfer, the MHAD dataset, and
+the RRIS Ability dataset for motion retargeting. Notably, this paper presents
+the first generalized framework for motion style transfer and motion
+retargeting, showcasing its potential contributions in this area.
+
+
+
+
+
+ + ☆ Practical Considerations for Agentic LLM Systems + + +
+ As the strength of Large Language Models (LLMs) has grown over recent years, +so too has interest in their use as the underlying models for autonomous +agents. Although LLMs demonstrate emergent abilities and broad expertise across +natural language domains, their inherent unpredictability makes the +implementation of LLM agents challenging, resulting in a gap between related +research and the real-world implementation of such systems. To bridge this gap, +this paper frames actionable insights and considerations from the research +community in the context of established application paradigms to enable the +construction and facilitate the informed deployment of robust LLM agents. +Namely, we position relevant research findings into four broad +categories--Planning, Memory, Tools, and Control Flow--based on common +practices in application-focused literature and highlight practical +considerations to make when designing agentic LLMs for real-world applications, +such as handling stochasticity and managing resources efficiently. While we do +not conduct empirical evaluations, we do provide the necessary background for +discussing critical aspects of agentic LLM designs, both in academia and +industry. + +
+
+ comment: 15 pages, 3 figures, 1 table +
+
+
+
+
+ + ☆ BodyMetric: Evaluating the Realism of HumanBodies in Text-to-Image + Generation + + +
+ Accurately generating images of human bodies from text remains a challenging +problem for state of the art text-to-image models. Commonly observed +body-related artifacts include extra or missing limbs, unrealistic poses, +blurred body parts, etc. Currently, evaluation of such artifacts relies heavily +on time-consuming human judgments, limiting the ability to benchmark models at +scale. We address this by proposing BodyMetric, a learnable metric that +predicts body realism in images. BodyMetric is trained on realism labels and +multi-modal signals including 3D body representations inferred from the input +image, and textual descriptions. In order to facilitate this approach, we +design an annotation pipeline to collect expert ratings on human body realism +leading to a new dataset for this task, namely, BodyRealism. Ablation studies +support our architectural choices for BodyMetric and the importance of +leveraging a 3D human body prior in capturing body-related artifacts in 2D +images. In comparison to concurrent metrics which evaluate general user +preference in images, BodyMetric specifically reflects body-related artifacts. +We demonstrate the utility of BodyMetric through applications that were +previously infeasible at scale. In particular, we use BodyMetric to benchmark +the generation ability of text-to-image models to produce realistic human +bodies. We also demonstrate the effectiveness of BodyMetric in ranking +generated images based on the predicted realism scores. + +
+
+
+
+
+ + ☆ Federated Learning in Mobile Networks: A Comprehensive Case Study on + Traffic Forecasting + + +
+ The increasing demand for efficient resource allocation in mobile networks +has catalyzed the exploration of innovative solutions that could enhance the +task of real-time cellular traffic prediction. Under these circumstances, +federated learning (FL) stands out as a distributed and privacy-preserving +solution to foster collaboration among different sites, thus enabling +responsive near-the-edge solutions. In this paper, we comprehensively study the +potential benefits of FL in telecommunications through a case study on +federated traffic forecasting using real-world data from base stations (BSs) in +Barcelona (Spain). Our study encompasses relevant aspects within the federated +experience, including model aggregation techniques, outlier management, the +impact of individual clients, personalized learning, and the integration of +exogenous sources of data. The performed evaluation is based on both prediction +accuracy and sustainability, thus showcasing the environmental impact of +employed FL algorithms in various settings. The findings from our study +highlight FL as a promising and robust solution for mobile traffic prediction, +emphasizing its twin merits as a privacy-conscious and environmentally +sustainable approach, while also demonstrating its capability to overcome data +heterogeneity and ensure high-quality predictions, marking a significant stride +towards its integration in mobile traffic management systems. + +
+
+
+
+
+ + ☆ Does your model understand genes? A benchmark of gene properties for + biological and text models + + +
+ The application of deep learning methods, particularly foundation models, in +biological research has surged in recent years. These models can be text-based +or trained on underlying biological data, especially omics data of various +types. However, comparing the performance of these models consistently has +proven to be a challenge due to differences in training data and downstream +tasks. To tackle this problem, we developed an architecture-agnostic +benchmarking approach that, instead of evaluating the models directly, +leverages entity representation vectors from each model and trains simple +predictive models for each benchmarking task. This ensures that all types of +models are evaluated using the same input and output types. Here we focus on +gene properties collected from professionally curated bioinformatics databases. +These gene properties are categorized into five major groups: genomic +properties, regulatory functions, localization, biological processes, and +protein properties. Overall, we define hundreds of tasks based on these +databases, which include binary, multi-label, and multi-class classification +tasks. We apply these benchmark tasks to evaluate expression-based models, +large language models, protein language models, DNA-based models, and +traditional baselines. Our findings suggest that text-based models and protein +language models generally outperform expression-based models in genomic +properties and regulatory functions tasks, whereas expression-based models +demonstrate superior performance in localization tasks. These results should +aid in the development of more informed artificial intelligence strategies for +biological understanding and therapeutic discovery. To ensure the +reproducibility and transparency of our findings, we have made the source code +and benchmark data publicly accessible for further investigation and expansion +at github.com/BiomedSciAI/gene-benchmark. + +
+
+
+
+
+ + ☆ ProtDAT: A Unified Framework for Protein Sequence Design from Any + Protein Text Description + + +
+ Protein design has become a critical method in advancing significant +potential for various applications such as drug development and enzyme +engineering. However, protein design methods utilizing large language models +with solely pretraining and fine-tuning struggle to capture relationships in +multi-modal protein data. To address this, we propose ProtDAT, a de novo +fine-grained framework capable of designing proteins from any descriptive +protein text input. ProtDAT builds upon the inherent characteristics of protein +data to unify sequences and text as a cohesive whole rather than separate +entities. It leverages an innovative multi-modal cross-attention, integrating +protein sequences and textual information for a foundational level and seamless +integration. Experimental results demonstrate that ProtDAT achieves the +state-of-the-art performance in protein sequence generation, excelling in +rationality, functionality, structural similarity, and validity. On 20,000 +text-sequence pairs from Swiss-Prot, it improves pLDDT by 6%, TM-score by 0.26, +and reduces RMSD by 1.2 {\AA}, highlighting its potential to advance protein +design. + +
+
+
+
+
+ + ☆ Automated Medical Report Generation for ECG Data: Bridging Medical Text + and Signal Processing with Deep Learning + + +
+ Recent advances in deep learning and natural language generation have +significantly improved image captioning, enabling automated, human-like +descriptions for visual content. In this work, we apply these captioning +techniques to generate clinician-like interpretations of ECG data. This study +leverages existing ECG datasets accompanied by free-text reports authored by +healthcare professionals (HCPs) as training data. These reports, while often +inconsistent, provide a valuable foundation for automated learning. We +introduce an encoder-decoder-based method that uses these reports to train +models to generate detailed descriptions of ECG episodes. This represents a +significant advancement in ECG analysis automation, with potential applications +in zero-shot classification and automated clinical decision support. + The model is tested on various datasets, including both 1- and 12-lead ECGs. +It significantly outperforms the state-of-the-art reference model by Qiu et +al., achieving a METEOR score of 55.53% compared to 24.51% achieved by the +reference model. Furthermore, several key design choices are discussed, +providing a comprehensive overview of current challenges and innovations in +this domain. + The source codes for this research are publicly available in our Git +repository https://git.zib.de/ableich/ecg-comment-generation-public + +
+
+
+
+
+ + ☆ Graph Neural Networks Need Cluster-Normalize-Activate Modules NeurIPS 2024 + + +
+ Graph Neural Networks (GNNs) are non-Euclidean deep learning models for +graph-structured data. Despite their successful and diverse applications, +oversmoothing prohibits deep architectures due to node features converging to a +single fixed point. This severely limits their potential to solve complex +tasks. To counteract this tendency, we propose a plug-and-play module +consisting of three steps: Cluster-Normalize-Activate (CNA). By applying CNA +modules, GNNs search and form super nodes in each layer, which are normalized +and activated individually. We demonstrate in node classification and property +prediction tasks that CNA significantly improves the accuracy over the +state-of-the-art. Particularly, CNA reaches 94.18% and 95.75% accuracy on Cora +and CiteSeer, respectively. It further benefits GNNs in regression tasks as +well, reducing the mean squared error compared to all baselines. At the same +time, GNNs with CNA require substantially fewer learnable parameters than +competing architectures. + +
+
+ comment: 17 pages, 6 figures, 6 tables, accepted at NeurIPS 2024 +
+
+
+
+
+ + ☆ ZipAR: Accelerating Autoregressive Image Generation through Spatial + Locality + + +
+ In this paper, we propose ZipAR, a training-free, plug-and-play parallel +decoding framework for accelerating auto-regressive (AR) visual generation. The +motivation stems from the observation that images exhibit local structures, and +spatially distant regions tend to have minimal interdependence. Given a +partially decoded set of visual tokens, in addition to the original next-token +prediction scheme in the row dimension, the tokens corresponding to spatially +adjacent regions in the column dimension can be decoded in parallel, enabling +the ``next-set prediction'' paradigm. By decoding multiple tokens +simultaneously in a single forward pass, the number of forward passes required +to generate an image is significantly reduced, resulting in a substantial +improvement in generation efficiency. Experiments demonstrate that ZipAR can +reduce the number of model forward passes by up to 91% on the Emu3-Gen model +without requiring any additional retraining. + +
+
+ comment: 11 pages +
+
+
+
+
+ + ☆ Expanding Deep Learning-based Sensing Systems with Multi-Source + Knowledge Transfer + + +
+ Expanding the existing sensing systems to provide high-quality deep learning
+models for more domains, such as new users or environments, is challenged by
+the limited labeled data and the data and device heterogeneities. While
+knowledge distillation methods could overcome label scarcity and device
+heterogeneity, they assume the teachers are fully reliable and overlook the
+data heterogeneity, which prevents the direct adoption of existing models. To
+address this problem, this paper proposes an efficient knowledge transfer
+framework, HaKT, to expand sensing systems. It first selects multiple
+high-quality models from the system at a low cost and then fuses their
+knowledge by assigning sample-wise weights to their predictions. Later, the
+fused knowledge is selectively injected into the customized models for new
+domains based on the knowledge quality. Extensive experiments on different
+tasks, modalities, and settings show that HaKT outperforms state-of-the-art
+baselines by at most 16.5% accuracy and saves up to 39% communication traffic.
+
+
+ comment: 14 pages, 8 figures +
+
+
+
+
+ + ☆ From Code to Play: Benchmarking Program Search for Games Using Large + Language Models + + +
+ Large language models (LLMs) have shown impressive capabilities in generating +program code, opening exciting opportunities for applying program synthesis to +games. In this work, we explore the potential of LLMs to directly synthesize +usable code for a wide range of gaming applications, focusing on two +programming languages, Python and Java. We use an evolutionary hill-climbing +algorithm, where the mutations and seeds of the initial programs are controlled +by LLMs. For Python, the framework covers various game-related tasks, including +five miniature versions of Atari games, ten levels of Baba is You, an +environment inspired by Asteroids, and a maze generation task. For Java, the +framework contains 12 games from the TAG tabletop games framework. Across 29 +tasks, we evaluated 12 language models for Python and 8 for Java. Our findings +suggest that the performance of LLMs depends more on the task than on model +size. While larger models generate more executable programs, these do not +always result in higher-quality solutions but are much more expensive. No model +has a clear advantage, although on any specific task, one model may be better. +Trying many models on a problem and using the best results across them is more +reliable than using just one. + +
+
+ comment: Submitted to Transactions on Games Special Issue on Large Language + Models and Games +
+
+
+
+
+ + ☆ INFP: Audio-Driven Interactive Head Generation in Dyadic Conversations + + +
+ Imagine having a conversation with a socially intelligent agent. It can +attentively listen to your words and offer visual and linguistic feedback +promptly. This seamless interaction allows for multiple rounds of conversation +to flow smoothly and naturally. In pursuit of actualizing it, we propose INFP, +a novel audio-driven head generation framework for dyadic interaction. Unlike +previous head generation works that only focus on single-sided communication, +or require manual role assignment and explicit role switching, our model drives +the agent portrait dynamically alternates between speaking and listening state, +guided by the input dyadic audio. Specifically, INFP comprises a Motion-Based +Head Imitation stage and an Audio-Guided Motion Generation stage. The first +stage learns to project facial communicative behaviors from real-life +conversation videos into a low-dimensional motion latent space, and use the +motion latent codes to animate a static image. The second stage learns the +mapping from the input dyadic audio to motion latent codes through denoising, +leading to the audio-driven head generation in interactive scenarios. To +facilitate this line of research, we introduce DyConv, a large scale dataset of +rich dyadic conversations collected from the Internet. Extensive experiments +and visualizations demonstrate superior performance and effectiveness of our +method. Project Page: https://grisoon.github.io/INFP/. + +
+
+
+
+
+ + ☆ SocialMind: LLM-based Proactive AR Social Assistive System with + Human-like Perception for In-situ Live Interactions + + +
+ Social interactions are fundamental to human life. The recent emergence of +large language models (LLMs)-based virtual assistants has demonstrated their +potential to revolutionize human interactions and lifestyles. However, existing +assistive systems mainly provide reactive services to individual users, rather +than offering in-situ assistance during live social interactions with +conversational partners. In this study, we introduce SocialMind, the first +LLM-based proactive AR social assistive system that provides users with in-situ +social assistance. SocialMind employs human-like perception leveraging +multi-modal sensors to extract both verbal and nonverbal cues, social factors, +and implicit personas, incorporating these social cues into LLM reasoning for +social suggestion generation. Additionally, SocialMind employs a multi-tier +collaborative generation strategy and proactive update mechanism to display +social suggestions on Augmented Reality (AR) glasses, ensuring that suggestions +are timely provided to users without disrupting the natural flow of +conversation. Evaluations on three public datasets and a user study with 20 +participants show that SocialMind achieves 38.3% higher engagement compared to +baselines, and 95% of participants are willing to use SocialMind in their live +social interactions. + +
+
+
+
+
+ + ☆ Considerations Influencing Offense-Defense Dynamics From Artificial + Intelligence + + +
+ The rapid advancement of artificial intelligence (AI) technologies presents +profound challenges to societal safety. As AI systems become more capable, +accessible, and integrated into critical services, the dual nature of their +potential is increasingly clear. While AI can enhance defensive capabilities in +areas like threat detection, risk assessment, and automated security +operations, it also presents avenues for malicious exploitation and large-scale +societal harm, for example through automated influence operations and cyber +attacks. Understanding the dynamics that shape AI's capacity to both cause harm +and enhance protective measures is essential for informed decision-making +regarding the deployment, use, and integration of advanced AI systems. This +paper builds on recent work on offense-defense dynamics within the realm of AI, +proposing a taxonomy to map and examine the key factors that influence whether +AI systems predominantly pose threats or offer protective benefits to society. +By establishing a shared terminology and conceptual foundation for analyzing +these interactions, this work seeks to facilitate further research and +discourse in this critical area. + +
+
+
+
+
+ + ☆ Deep-Unrolling Multidimensional Harmonic Retrieval Algorithms on + Neuromorphic Hardware + + +
+ This paper explores the potential of conversion-based neuromorphic algorithms +for highly accurate and energy-efficient single-snapshot multidimensional +harmonic retrieval (MHR). By casting the MHR problem as a sparse recovery +problem, we devise the currently proposed, deep-unrolling-based Structured +Learned Iterative Shrinkage and Thresholding (S-LISTA) algorithm to solve it +efficiently using complex-valued convolutional neural networks with +complex-valued activations, which are trained using a supervised regression +objective. Afterward, a novel method for converting the complex-valued +convolutional layers and activations into spiking neural networks (SNNs) is +developed. At the heart of this method lies the recently proposed Few Spikes +(FS) conversion, which is extended by modifying the neuron model's parameters +and internal dynamics to account for the inherent coupling between real and +imaginary parts in complex-valued computations. Finally, the converted SNNs are +mapped onto the SpiNNaker2 neuromorphic board, and a comparison in terms of +estimation accuracy and power efficiency between the original CNNs deployed on +an NVIDIA Jetson Xavier and the SNNs is being conducted. The measurement +results show that the converted SNNs achieve almost five-fold power efficiency +at moderate performance loss compared to the original CNNs. + +
+
+ comment: accepted to the 58th Asilomar Conference on Signals, Systems, and + Computers, Oct. 27th - Oct. 30th, 2024, Pacific Grove, CA +
+
+
+
+
+ + ☆ LaserGuider: A Laser Based Physical Backdoor Attack against Deep Neural + Networks + + +
+ Backdoor attacks embed hidden associations between triggers and targets in +deep neural networks (DNNs), causing them to predict the target when a trigger +is present while maintaining normal behavior otherwise. Physical backdoor +attacks, which use physical objects as triggers, are feasible but lack remote +control, temporal stealthiness, flexibility, and mobility. To overcome these +limitations, in this work, we propose a new type of backdoor triggers utilizing +lasers that feature long-distance transmission and instant-imaging properties. +Based on the laser-based backdoor triggers, we present a physical backdoor +attack, called LaserGuider, which possesses remote control ability and achieves +high temporal stealthiness, flexibility, and mobility. We also introduce a +systematic approach to optimize laser parameters for improving attack +effectiveness. Our evaluation on traffic sign recognition DNNs, critical in +autonomous vehicles, demonstrates that LaserGuider with three different +laser-based triggers achieves over 90% attack success rate with negligible +impact on normal inputs. Additionally, we release LaserMark, the first dataset +of real world traffic signs stamped with physical laser spots, to support +further research in backdoor attacks and defenses. + +
+
+ comment: In Proceedings of the 23rd International Conference on Applied + Cryptography and Network Security (ACNS), Munich, Germany, 23-26 June, 2025 +
+
+
+
+
+ + ☆ MTMT: Consolidating Multiple Thinking Modes to Form a Thought Tree for + Strengthening LLM + + +
+ Large language models (LLMs) have shown limitations in tasks requiring +complex logical reasoning and multi-step problem-solving. To address these +challenges, researchers have employed carefully designed prompts and +flowcharts, simulating human cognitive processes to enhance LLM performance, +such as the Chain of Thought approach. In this paper, we introduce MTMT +(Multi-thinking Modes Tree), a novel method that interacts with LLMs to +construct a thought tree, simulating various advanced cognitive processes, +including but not limited to association, counterfactual thinking, task +decomposition, and comparison. By breaking down the original complex task into +simpler sub-questions, MTMT facilitates easier problem-solving for LLMs, +enabling more effective utilization of the latent knowledge within LLMs. We +evaluate the performance of MTMT under different parameter configurations, +using GPT-4o mini as the base model. Our results demonstrate that integrating +multiple modes of thinking significantly enhances the ability of LLMs to handle +complex tasks. + +
+
+
+
+
+ + ☆ Exploring Fully Convolutional Networks for the Segmentation of + Hyperspectral Imaging Applied to Advanced Driver Assistance Systems + + +
+ Advanced Driver Assistance Systems (ADAS) are designed with the main purpose +of increasing the safety and comfort of vehicle occupants. Most of current +computer vision-based ADAS perform detection and tracking tasks quite +successfully under regular conditions, but are not completely reliable, +particularly under adverse weather and changing lighting conditions, neither in +complex situations with many overlapping objects. In this work we explore the +use of hyperspectral imaging (HSI) in ADAS on the assumption that the distinct +near infrared (NIR) spectral reflectances of different materials can help to +better separate the objects in a driving scene. In particular, this paper +describes some experimental results of the application of fully convolutional +networks (FCN) to the image segmentation of HSI for ADAS applications. More +specifically, our aim is to investigate to what extent the spatial features +codified by convolutional filters can be helpful to improve the performance of +HSI segmentation systems. With that aim, we use the HSI-Drive v1.1 dataset, +which provides a set of labelled images recorded in real driving conditions +with a small-size snapshot NIR-HSI camera. Finally, we analyze the +implementability of such a HSI segmentation system by prototyping the developed +FCN model together with the necessary hyperspectral cube preprocessing stage +and characterizing its performance on an MPSoC. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2411.19274 +
+
+
+
+
+ + ☆ A Data-Driven Framework for Discovering Fractional Differential + Equations in Complex Systems + + +
+ In complex physical systems, conventional differential equations often fall
+short in capturing non-local and memory effects, as they are limited to local
+dynamics and integer-order interactions. This study introduces a stepwise
+data-driven framework for discovering fractional differential equations (FDEs)
+directly from data. FDEs, known for their capacity to model non-local dynamics
+with fewer parameters than integer-order derivatives, can represent complex
+systems with long-range interactions. Our framework applies deep neural
+networks as surrogate models for denoising and reconstructing sparse and noisy
+observations while using Gaussian-Jacobi quadrature to handle the challenges
+posed by singularities in fractional derivatives. To optimize both the sparse
+coefficients and fractional order, we employ an alternating optimization
+approach that combines sparse regression with global optimization techniques.
+We validate the framework across various datasets, including synthetic
+anomalous diffusion data, experimental data on the creep behavior of frozen
+soils, and single-particle trajectories modeled by Lévy motion. Results
+demonstrate the framework's robustness in identifying the structure of FDEs
+across diverse noise levels and its capacity to capture integer-order dynamics,
+offering a flexible approach for modeling memory effects in complex systems.
+
+</p>
+
+
+
+
+ + ☆ Demonstration Selection for In-Context Learning via Reinforcement + Learning + + +
+ Diversity in demonstration selection is crucial for enhancing model +generalization, as it enables a broader coverage of structures and concepts. +However, constructing an appropriate set of demonstrations has remained a focal +point of research. This paper presents the Relevance-Diversity Enhanced +Selection (RDES), an innovative approach that leverages reinforcement learning +to optimize the selection of diverse reference demonstrations for text +classification tasks using Large Language Models (LLMs), especially in few-shot +prompting scenarios. RDES employs a Q-learning framework to dynamically +identify demonstrations that maximize both diversity and relevance to the +classification objective by calculating a diversity score based on label +distribution among selected demonstrations. This method ensures a balanced +representation of reference data, leading to improved classification accuracy. +Through extensive experiments on four benchmark datasets and involving 12 +closed-source and open-source LLMs, we demonstrate that RDES significantly +enhances classification accuracy compared to ten established baselines. +Furthermore, we investigate the incorporation of Chain-of-Thought (CoT) +reasoning in the reasoning process, which further enhances the model's +predictive performance. The results underscore the potential of reinforcement +learning to facilitate adaptive demonstration selection and deepen the +understanding of classification challenges. + +
+
+
+
+
+ + ☆ Augmenting Minds or Automating Skills: The Differential Role of Human + Capital in Generative AI's Impact on Creative Tasks + + +
+ Generative AI is rapidly reshaping creative work, raising critical questions +about its beneficiaries and societal implications. This study challenges +prevailing assumptions by exploring how generative AI interacts with diverse +forms of human capital in creative tasks. Through two random controlled +experiments in flash fiction writing and song composition, we uncover a +paradox: while AI democratizes access to creative tools, it simultaneously +amplifies cognitive inequalities. Our findings reveal that AI enhances general +human capital (cognitive abilities and education) by facilitating adaptability +and idea integration but diminishes the value of domain-specific expertise. We +introduce a novel theoretical framework that merges human capital theory with +the automation-augmentation perspective, offering a nuanced understanding of +human-AI collaboration. This framework elucidates how AI shifts the locus of +creative advantage from specialized expertise to broader cognitive +adaptability. Contrary to the notion of AI as a universal equalizer, our work +highlights its potential to exacerbate disparities in skill valuation, +reshaping workplace hierarchies and redefining the nature of creativity in the +AI era. These insights advance theories of human capital and automation while +providing actionable guidance for organizations navigating AI integration +amidst workforce inequalities. + +
+
+
+
+
+ + ☆ A Framework For Image Synthesis Using Supervised Contrastive Learning + + +
+ Text-to-image (T2I) generation aims at producing realistic images
+corresponding to text descriptions. Generative Adversarial Network (GAN) has
+proven to be successful in this task. Typical T2I GANs are 2 phase methods that
+first pretrain an inter-modal representation from aligned image-text pairs and
+then use GAN to train image generator on that basis. However, such
+representation ignores the inner-modal semantic correspondence, e.g. the images
+with same label. The semantic label a priori describes the inherent
+distribution pattern with underlying cross-image relationships, which is a
+supplement to the text description for understanding the full characteristics
+of image. In this paper, we propose a framework leveraging both inter- and
+inner-modal correspondence by label guided supervised contrastive learning. We
+extend the T2I GANs to two parameter-sharing contrast branches in both
+pretraining and generation phases. This integration effectively clusters the
+semantically similar image-text pair representations, thereby fostering the
+generation of higher-quality images. We demonstrate our framework on four novel
+T2I GANs by both single-object dataset CUB and multi-object dataset COCO,
+achieving significant improvements in the Inception Score (IS) and Frechet
+Inception Distance (FID) metrics of image generation evaluation. Notably, on
+more complex multi-object COCO, our framework improves FID by 30.1%, 27.3%,
+16.2% and 17.1% for AttnGAN, DM-GAN, SSA-GAN and GALIP, respectively. We also
+validate our superiority by comparing with other label guided T2I GANs. The
+results affirm the effectiveness and competitiveness of our approach in
+advancing the state-of-the-art GAN for T2I generation.
+
+</p>
+
+
+
+
+ + ☆ Chain-of-Thought in Large Language Models: Decoding, Projection, and + Activation + + +
+ Chain-of-Thought prompting has significantly enhanced the reasoning
+capabilities of large language models, with numerous studies exploring factors
+influencing its performance. However, the underlying mechanisms remain poorly
+understood. To further demystify the operational principles, this work examines
+three key aspects: decoding, projection, and activation, aiming to elucidate
+the changes that occur within models when employing Chain-of-Thought. Our
+findings reveal that LLMs effectively imitate exemplar formats while
+integrating them with their understanding of the question, exhibiting
+fluctuations in token logits during generation but ultimately producing a more
+concentrated logits distribution, and activating a broader set of neurons in
+the final layers, indicating more extensive knowledge retrieval compared to
+standard prompts. Our code and data will be publicly available when the paper
+is accepted.
+
+</p>
+
+
+
+
+ + ☆ Enhancing and Accelerating Diffusion-Based Inverse Problem Solving + through Measurements Optimization + + +
+ Diffusion models have recently demonstrated notable success in solving +inverse problems. However, current diffusion model-based solutions typically +require a large number of function evaluations (NFEs) to generate high-quality +images conditioned on measurements, as they incorporate only limited +information at each step. To accelerate the diffusion-based inverse +problem-solving process, we introduce \textbf{M}easurements +\textbf{O}ptimization (MO), a more efficient plug-and-play module for +integrating measurement information at each step of the inverse problem-solving +process. This method is comprehensively evaluated across eight diverse linear +and nonlinear tasks on the FFHQ and ImageNet datasets. By using MO, we +establish state-of-the-art (SOTA) performance across multiple tasks, with key +advantages: (1) it operates with no more than 100 NFEs, with phase retrieval on +ImageNet being the sole exception; (2) it achieves SOTA or near-SOTA results +even at low NFE counts; and (3) it can be seamlessly integrated into existing +diffusion model-based solutions for inverse problems, such as DPS +\cite{chung2022diffusion} and Red-diff \cite{mardani2023variational}. For +example, DPS-MO attains a peak signal-to-noise ratio (PSNR) of 28.71 dB on the +FFHQ 256 dataset for high dynamic range imaging, setting a new SOTA benchmark +with only 100 NFEs, whereas current methods require between 1000 and 4000 NFEs +for comparable performance. + +
+
+
+
+
+ + ☆ InfiniCube: Unbounded and Controllable Dynamic 3D Driving Scene + Generation with World-Guided Video Models + + +
+ We present InfiniCube, a scalable method for generating unbounded dynamic 3D +driving scenes with high fidelity and controllability. Previous methods for +scene generation either suffer from limited scales or lack geometric and +appearance consistency along generated sequences. In contrast, we leverage the +recent advancements in scalable 3D representation and video models to achieve +large dynamic scene generation that allows flexible controls through HD maps, +vehicle bounding boxes, and text descriptions. First, we construct a +map-conditioned sparse-voxel-based 3D generative model to unleash its power for +unbounded voxel world generation. Then, we re-purpose a video model and ground +it on the voxel world through a set of carefully designed pixel-aligned +guidance buffers, synthesizing a consistent appearance. Finally, we propose a +fast feed-forward approach that employs both voxel and pixel branches to lift +the dynamic videos to dynamic 3D Gaussians with controllable objects. Our +method can generate controllable and realistic 3D driving scenes, and extensive +experiments validate the effectiveness and superiority of our model. + +
+
+ comment: Project Page: https://research.nvidia.com/labs/toronto-ai/infinicube/ +
+
+
+
+
+ + ☆ Exploring AI Text Generation, Retrieval-Augmented Generation, and + Detection Technologies: a Comprehensive Overview + + +
+ The rapid development of Artificial Intelligence (AI) has led to the creation +of powerful text generation models, such as large language models (LLMs), which +are widely used for diverse applications. However, concerns surrounding +AI-generated content, including issues of originality, bias, misinformation, +and accountability, have become increasingly prominent. This paper offers a +comprehensive overview of AI text generators (AITGs), focusing on their +evolution, capabilities, and ethical implications. This paper also introduces +Retrieval-Augmented Generation (RAG), a recent approach that improves the +contextual relevance and accuracy of text generation by integrating dynamic +information retrieval. RAG addresses key limitations of traditional models, +including their reliance on static knowledge and potential inaccuracies in +handling real-world data. Additionally, the paper reviews detection tools that +help differentiate AI-generated text from human-written content and discusses +the ethical challenges these technologies pose. The paper explores future +directions for improving detection accuracy, supporting ethical AI development, +and increasing accessibility. The paper contributes to a more responsible and +reliable use of AI in content creation through these discussions. + +
+
+
+
+
+ + ☆ MIND: Effective Incorrect Assignment Detection through a Multi-Modal + Structure-Enhanced Language Model + + +
+ The rapid growth of academic publications has exacerbated the issue of author +name ambiguity in online digital libraries. Despite advances in name +disambiguation algorithms, cumulative errors continue to undermine the +reliability of academic systems. It is estimated that over 10% paper-author +assignments are rectified when constructing the million-scale WhoIsWho +benchmark. Existing endeavors to detect incorrect assignments are either +semantic-based or graph-based approaches, which fall short of making full use +of the rich text attributes of papers and implicit structural features defined +via the co-occurrence of paper attributes. To this end, this paper introduces a +structure-enhanced language model that combines key structural features from +graph-based methods with fine-grained semantic features from rich paper +attributes to detect incorrect assignments. The proposed model is trained with +a highly effective multi-modal multi-turn instruction tuning framework, which +incorporates task-guided instruction tuning, text-attribute modality, and +structural modality. Experimental results demonstrate that our model +outperforms previous approaches, achieving top performance on the leaderboard +of KDD Cup 2024. Our code has been publicly available. + +
+
+
+
+
+ + ☆ MT3DNet: Multi-Task learning Network for 3D Surgical Scene + Reconstruction + + +
+ In image-assisted minimally invasive surgeries (MIS), understanding surgical
+scenes is vital for real-time feedback to surgeons, skill evaluation, and
+improving outcomes through collaborative human-robot procedures. Within this
+context, the challenge lies in accurately detecting, segmenting, and estimating
+the depth of surgical scenes depicted in high-resolution images, while
+simultaneously reconstructing the scene in 3D and providing segmentation of
+surgical instruments along with detection labels for each instrument. To
+address this challenge, a novel Multi-Task Learning (MTL) network is proposed
+for performing these tasks concurrently. A key aspect of this approach involves
+overcoming the optimization hurdles associated with handling multiple tasks
+concurrently by integrating an Adversarial Weight Update into the MTL
+framework. The proposed MTL model achieves 3D reconstruction through the
+integration of segmentation, depth estimation, and object detection, thereby
+enhancing the understanding of surgical scenes, which marks a significant
+advancement compared to existing studies that lack 3D capabilities.
+Comprehensive experiments on the EndoVis2018 benchmark dataset underscore the
+adeptness of the model in efficiently addressing all three tasks, demonstrating
+the efficacy of the proposed techniques.
+
+</p>
+
+
+
+
+ + ☆ A Survey on Large Language Model-Based Social Agents in Game-Theoretic + Scenarios + + +
+ Game-theoretic scenarios have become pivotal in evaluating the social +intelligence of Large Language Model (LLM)-based social agents. While numerous +studies have explored these agents in such settings, there is a lack of a +comprehensive survey summarizing the current progress. To address this gap, we +systematically review existing research on LLM-based social agents within +game-theoretic scenarios. Our survey organizes the findings into three core +components: Game Framework, Social Agent, and Evaluation Protocol. The game +framework encompasses diverse game scenarios, ranging from choice-focusing to +communication-focusing games. The social agent part explores agents' +preferences, beliefs, and reasoning abilities. The evaluation protocol covers +both game-agnostic and game-specific metrics for assessing agent performance. +By reflecting on the current research and identifying future research +directions, this survey provides insights to advance the development and +evaluation of social agents in game-theoretic scenarios. + +
+
+
+
+
+ + ☆ Integrating Various Software Artifacts for Better LLM-based Bug + Localization and Program Repair + + +
+ LLMs have garnered considerable attention for their potential to streamline +Automated Program Repair (APR). LLM-based approaches can either insert the +correct code or directly generate patches when provided with buggy methods. +However, most of LLM-based APR methods rely on a single type of software +information, without fully leveraging different software artifacts. Despite +this, many LLM-based approaches do not explore which specific types of +information best assist in APR. Addressing this gap is crucial for advancing +LLM-based APR techniques. We propose DEVLoRe to use issue content (description +and message) and stack error traces to localize buggy methods, then rely on +debug information in buggy methods and issue content and stack error to +localize buggy lines and generate plausible patches which can pass all unit +tests. The results show that while issue content is particularly effective in +assisting LLMs with fault localization and program repair, different types of +software artifacts complement each other. By incorporating different artifacts, +DEVLoRe successfully locates 49.3% and 47.6% of single and non-single buggy +methods and generates 56.0% and 14.5% plausible patches for the Defects4J v2.0 +dataset, respectively. This outperforms current state-of-the-art APR methods. +The source code and experimental results of this work for replication are +available at https://github.com/XYZboom/DEVLoRe. + +
+
+ comment: 22 pages, 11 images, 9 tables, Manuscript submitted to a journal + (2024) +
+
+
+
+
+ + ☆ MISR: Measuring Instrumental Self-Reasoning in Frontier Models + + +
+ We propose a suite of tasks to evaluate the instrumental self-reasoning
+ability of large language model (LLM) agents. Instrumental self-reasoning
+ability could improve adaptability and enable self-modification, but it could
+also pose significant risks, such as enabling deceptive alignment. Prior work
+has only evaluated self-reasoning in non-agentic settings or in limited
+domains. In this paper, we propose evaluations for instrumental self-reasoning
+ability in agentic tasks in a wide range of scenarios, including
+self-modification, knowledge seeking, and opaque self-reasoning. We evaluate
+agents built using state-of-the-art LLMs, including commercial and open source
+systems. We find that instrumental self-reasoning ability emerges only in the
+most capable frontier models and that it is highly context-dependent. No model
+passes the most difficult versions of our evaluations, hence our evaluation
+can be used to measure increases in instrumental self-reasoning ability in
+future models. We open-source our evaluations at
+https://github.com/kaifronsdal/Self-Reasoning-Evals.
+
+</p>
+
+ comment: 10 pages, 65 page appendix, 5 figures +
+
+
+
+
+ + ☆ Using SlowFast Networks for Near-Miss Incident Analysis in Dashcam + Videos + + +
+ This paper classifies near-miss traffic videos using the SlowFast deep neural +network that mimics the characteristics of the slow and fast visual information +processed by two different streams from the M (Magnocellular) and P +(Parvocellular) cells of the human brain. The approach significantly improves +the accuracy of the traffic near-miss video analysis and presents insights into +human visual perception in traffic scenarios. Moreover, it contributes to +traffic safety enhancements and provides novel perspectives on the potential +cognitive errors in traffic accidents. + +
+
+ comment: Best Research Paper Award for Asia-Pacific Region, The 30th ITS World + Congress 2024 +
+
+
+
+
+ + ☆ A Noise is Worth Diffusion Guidance + + +
+ Diffusion models excel in generating high-quality images. However, current +diffusion models struggle to produce reliable images without guidance methods, +such as classifier-free guidance (CFG). Are guidance methods truly necessary? +Observing that noise obtained via diffusion inversion can reconstruct +high-quality images without guidance, we focus on the initial noise of the +denoising pipeline. By mapping Gaussian noise to `guidance-free noise', we +uncover that small low-magnitude low-frequency components significantly enhance +the denoising process, removing the need for guidance and thus improving both +inference throughput and memory. Expanding on this, we propose \ours, a novel +method that replaces guidance methods with a single refinement of the initial +noise. This refined noise enables high-quality image generation without +guidance, within the same diffusion pipeline. Our noise-refining model +leverages efficient noise-space learning, achieving rapid convergence and +strong performance with just 50K text-image pairs. We validate its +effectiveness across diverse metrics and analyze how refined noise can +eliminate the need for guidance. See our project page: +https://cvlab-kaist.github.io/NoiseRefine/. + +
+
+ comment: Project page: https://cvlab-kaist.github.io/NoiseRefine/ +
+
+
+
+
+ + ☆ Machine Learning-based Android Intrusion Detection System + + +
+ The android operating system is being installed in most of the smart devices. +The introduction of intrusions in such operating systems is rising at a +tremendous rate. With the introduction of such malicious data streams, the +smart devices are being subjected to various attacks like Phishing, Spyware, +SMS Fraud, Bots and Banking-Trojans and many such. The application of machine +learning classification algorithms for the security of android APK files is +used in this paper. Each apk data stream was marked to be either malicious or +non malicious on the basis of different parameters. The machine learning +classification techniques are then used to classify whether the newly installed +applications' signature falls within the malicious or non-malicious domain. If +it falls within the malicious category, appropriate action can be taken, and +the Android operating system can be shielded against illegal activities. + +
+
+
+
+
+ + ☆ Dual-Branch Subpixel-Guided Network for Hyperspectral Image + Classification + + +
+ Deep learning (DL) has been widely applied into hyperspectral image (HSI) +classification owing to its promising feature learning and representation +capabilities. However, limited by the spatial resolution of sensors, existing +DL-based classification approaches mainly focus on pixel-level spectral and +spatial information extraction through complex network architecture design, +while ignoring the existence of mixed pixels in actual scenarios. To tackle +this difficulty, we propose a novel dual-branch subpixel-guided network for HSI +classification, called DSNet, which automatically integrates subpixel +information and convolutional class features by introducing a deep autoencoder +unmixing architecture to enhance classification performance. DSNet is capable +of fully considering physically nonlinear properties within subpixels and +adaptively generating diagnostic abundances in an unsupervised manner to +achieve more reliable decision boundaries for class label distributions. The +subpixel fusion module is designed to ensure high-quality information fusion +across pixel and subpixel features, further promoting stable joint +classification. Experimental results on three benchmark datasets demonstrate +the effectiveness and superiority of DSNet compared with state-of-the-art +DL-based HSI classification approaches. The codes will be available at +https://github.com/hanzhu97702/DSNet, contributing to the remote sensing +community. + +
+
+
+
+
+ + ☆ Uniform Discretized Integrated Gradients: An effective attribution based + method for explaining large language models + + +
+ Integrated Gradients is a well-known technique for explaining deep learning +models. It calculates feature importance scores by employing a gradient based +approach computing gradients of the model output with respect to input features +and accumulating them along a linear path. While this works well for continuous +features spaces, it may not be the most optimal way to deal with discrete +spaces like word embeddings. For interpreting LLMs (Large Language Models), +there exists a need for a non-linear path where intermediate points, whose +gradients are to be computed, lie close to actual words in the embedding space. +In this paper, we propose a method called Uniform Discretized Integrated +Gradients (UDIG) based on a new interpolation strategy where we choose a +favorable nonlinear path for computing attribution scores suitable for +predictive language models. We evaluate our method on two types of NLP tasks- +Sentiment Classification and Question Answering against three metrics viz Log +odds, Comprehensiveness and Sufficiency. For sentiment classification, we have +used the SST2, IMDb and Rotten Tomatoes datasets for benchmarking and for +Question Answering, we have used the fine-tuned BERT model on SQuAD dataset. +Our approach outperforms the existing methods in almost all the metrics. + +
+
+
+
+
+ + ☆ A Unified Framework for Evaluating the Effectiveness and Enhancing the + Transparency of Explainable AI Methods in Real-World Applications + + +
+ The rapid advancement of deep learning has resulted in substantial +advancements in AI-driven applications; however, the "black box" characteristic +of these models frequently constrains their interpretability, transparency, and +reliability. Explainable artificial intelligence (XAI) seeks to elucidate AI +decision-making processes, guaranteeing that explanations faithfully represent +the model's rationale and correspond with human comprehension. Despite +comprehensive research in XAI, a significant gap persists in standardized +procedures for assessing the efficacy and transparency of XAI techniques across +many real-world applications. This study presents a unified XAI evaluation +framework incorporating extensive quantitative and qualitative criteria to +systematically evaluate the correctness, interpretability, robustness, +fairness, and completeness of explanations generated by AI models. The +framework prioritizes user-centric and domain-specific adaptations, hence +improving the usability and reliability of AI models in essential domains. To +address deficiencies in existing evaluation processes, we suggest defined +benchmarks and a systematic evaluation pipeline that includes data loading, +explanation development, and thorough method assessment. The suggested +framework's relevance and variety are evidenced by case studies in healthcare, +finance, agriculture, and autonomous systems. These provide a solid basis for +the equitable and dependable assessment of XAI methodologies. This paradigm +enhances XAI research by offering a systematic, flexible, and pragmatic method +to guarantee transparency and accountability in AI systems across many +real-world contexts. + +
+
+
+
+
+ + ☆ Weak-to-Strong Generalization Through the Data-Centric Lens + + +
+ The weak-to-strong generalization phenomenon is the driver for important +machine learning applications including highly data-efficient learning and, +most recently, performing superalignment. While decades of research have +resulted in numerous algorithms that produce strong empirical performance, +understanding what aspects of data enable weak-to-strong generalization has +been understudied. We propose a simple data-centric mechanism that +characterizes weak-to-strong generalization: the overlap density. Intuitively, +generalization tracks the number of points that contain overlaps, i.e., both +easy patterns (learnable by a weak model) and challenging patterns (only +learnable by a stronger model), as with such points, weak predictions can be +used to learn challenging patterns by stronger models. We provide a practical +overlap detection algorithm to find such points in datasets and leverage them +to learn, among multiple sources of data, which to query when seeking to +maximize overlap density and thereby enhance weak-to-strong generalization. We +present a theoretical result showing that the generalization benefit is a +function of the overlap density and a regret bound for our data selection +algorithm. Empirically, we validate the mechanism and the overlap detection +algorithm on a wide array of settings. + +
+
+ comment: 39 pages +
+
+
+
+
+ + ☆ AyutthayaAlpha: A Thai-Latin Script Transliteration Transformer + + +
+ This study introduces AyutthayaAlpha, an advanced transformer-based machine +learning model designed for the transliteration of Thai proper names into Latin +script. Our system achieves state-of-the-art performance with 82.32% +first-token accuracy and 95.24% first-three-token accuracy, while maintaining a +low character error rate of 0.0047. The complexity of Thai phonology, including +tonal features and vowel length distinctions, presents significant challenges +for accurate transliteration, which we address through a novel two-model +approach: AyutthayaAlpha-Small, based on the ByT5 architecture, and +AyutthayaAlpha-VerySmall, a computationally efficient variant that unexpectedly +outperforms its larger counterpart. Our research combines linguistic rules with +deep learning, training on a carefully curated dataset of 1.2 million +Thai-Latin name pairs, augmented through strategic upsampling to 2.7 million +examples. Extensive evaluations against existing transliteration methods and +human expert benchmarks demonstrate that AyutthayaAlpha not only achieves +superior accuracy but also effectively captures personal and cultural +preferences in name romanization. The system's practical applications extend to +cross-lingual information retrieval, international data standardization, and +identity verification systems, with particular relevance for government +databases, academic institutions, and global business operations. This work +represents a significant advance in bridging linguistic gaps between Thai and +Latin scripts, while respecting the cultural and personal dimensions of name +transliteration. + +
+
+
+
+
+ + Fine-Grained Sentiment Analysis of Electric Vehicle User Reviews: A + Bidirectional LSTM Approach to Capturing Emotional Intensity in Chinese Text + + +
+ The rapid expansion of the electric vehicle (EV) industry has highlighted the +importance of user feedback in improving product design and charging +infrastructure. Traditional sentiment analysis methods often oversimplify the +complexity of user emotions, limiting their effectiveness in capturing nuanced +sentiments and emotional intensities. This study proposes a Bidirectional Long +Short-Term Memory (Bi-LSTM) network-based sentiment scoring model to analyze +user reviews of EV charging infrastructure. By assigning sentiment scores +ranging from 0 to 5, the model provides a fine-grained understanding of +emotional expression. Leveraging a dataset of 43,678 reviews from PC Auto, the +study employs rigorous data cleaning and preprocessing, including tokenization +and stop word removal, to optimize input for deep learning. The Bi-LSTM model +demonstrates significant improvements over traditional approaches like SnowNLP +across key evaluation metrics, including Mean Squared Error (MSE), Mean +Absolute Error (MAE), and Explained Variance Score (EVS). These results +highlight the model's superior capability to capture nuanced sentiment +dynamics, offering valuable insights for targeted product and service +enhancements in the EV ecosystem. + +
+
+
+
+
+ + ♻ ☆ A method to benchmark high-dimensional process drift detection + + +
+ Process curves are multivariate finite time series data coming from
+manufacturing processes. This paper studies machine learning models that detect
+drifts in process curve datasets. A theoretic framework to synthetically
+generate process curves in a controlled way is introduced in order to benchmark
+machine learning algorithms for process drift detection. An evaluation score,
+called the temporal area under the curve, is introduced, which allows one to
+quantify how well machine learning models unveil curves belonging to drift
+segments. Finally, a benchmark study comparing popular machine learning
+approaches on synthetic data generated with the introduced framework is
+presented that shows that existing algorithms often struggle with datasets
+containing multiple drift segments.
+
+</p>
+
+
+
+
+ + ♻ ☆ SmallToLarge (S2L): Scalable Data Selection for Fine-tuning Large + Language Models by Summarizing Training Trajectories of Small Models + + +
+ Despite the effectiveness of data selection for large language models (LLMs) +during pretraining and instruction fine-tuning phases, improving data +efficiency in supervised fine-tuning (SFT) for specialized domains poses +significant challenges due to the complexity of fine-tuning data. To bridge +this gap, we introduce an effective and scalable data selection method for SFT, +SmallToLarge (S2L), which leverages training trajectories from small models to +guide the data selection for larger models. We demonstrate through extensive +experiments that S2L significantly improves data efficiency in SFT for +mathematical problem-solving, reducing the training data to just 11% of the +original MathInstruct dataset (Yue et al., 2023) to match full dataset +performance while outperforming state-of-the-art data selection algorithms by +an average of 4.7% across 6 in- and out-domain evaluation datasets. Remarkably, +selecting only 50K data for SFT, S2L achieves a 32.7% accuracy on the most +challenging MATH (Hendrycks et al., 2021) benchmark, improving Phi-2 (Li et +al., 2023b) by 16.6%. In clinical text summarization on the MIMIC-III dataset +(Johnson et al., 2016), S2L again outperforms training on the full dataset +using only 50% of the data. Notably, S2L can perform data selection using a +reference model 40x smaller than the target model, proportionally reducing the +cost of data selection. + +
+
+
+
+
+ + ♻ ☆ Negative Token Merging: Image-based Adversarial Feature Guidance + + +
+ Text-based adversarial guidance using a negative prompt has emerged as a +widely adopted approach to steer diffusion models away from producing undesired +concepts. While useful, performing adversarial guidance using text alone can be +insufficient to capture complex visual concepts or avoid specific visual +elements like copyrighted characters. In this paper, for the first time we +explore an alternate modality in this direction by performing adversarial +guidance directly using visual features from a reference image or other images +in a batch. We introduce negative token merging (NegToMe), a simple but +effective training-free approach which performs adversarial guidance through +images by selectively pushing apart matching visual features between reference +and generated images during the reverse diffusion process. By simply adjusting +the used reference, NegToMe enables a diverse range of applications. Notably, +when using other images in same batch as reference, we find that NegToMe +significantly enhances output diversity (e.g., racial, gender, visual) by +guiding features of each image away from others. Similarly, when used w.r.t. +copyrighted reference images, NegToMe reduces visual similarity to copyrighted +content by 34.57%. NegToMe is simple to implement using just few-lines of code, +uses only marginally higher (<4%) inference time and is compatible with +different diffusion architectures, including those like Flux, which don't +natively support the use of a negative prompt. Code is available at +https://negtome.github.io + +
+
+
+
+
+ + ♻ ☆ WaveletGPT: Wavelets Meet Large Language Models + + +
+ Large Language Models (LLMs) have ushered in a new wave of artificial +intelligence advancements impacting every scientific field and discipline. They +are trained on a simple objective: to predict the next token given the previous +context. We live in a world where most of the data around us, e.g., text, +audio, and music, has a multi-scale structure associated with it. This paper +infuses LLMs with traditional signal processing ideas, namely wavelets, during +pre-training to take advantage of the structure. Without adding \textbf{any +extra parameters} to a GPT-style LLM architecture, we achieve the same +pre-training performance almost twice as fast in text, raw audio, and symbolic +music. This is achieved by imposing a structure on intermediate embeddings. +When trained for the same number of training steps, we achieve significant +gains in performance, which is comparable to pre-training a larger neural +architecture. Our architecture allows every next token prediction access to +intermediate embeddings at different temporal resolutions in every Transformer +decoder block. This work will hopefully pave the way for incorporating +multi-rate signal processing ideas into traditional LLM pre-training. Further, +we showcase pushing model performance by improving internal structure instead +of just going after scale. + +
+
+ comment: 16 pages, 4 figures +
+
+
+
+
+
+            ♻ ☆ CNNSum: Exploring Long-Context Summarization with Large Language Models
+  in Chinese Novels
+
+
+  Large Language Models (LLMs) have been well-researched in many long-context
+tasks. However, due to high annotation costs, high-quality long-context summary
+datasets for training or evaluation are scarce, limiting further research. In
+this work, we introduce CNNSum, a new multi-scale Chinese long-context novel
+summarization benchmark, including four subsets, length covering
+16k\textasciitilde128k, 695 samples in total, the annotations are human-driven.
+We evaluate commercial and open-source models on CNNSum and conduct a detailed
+analysis. Based on the observations, we further conduct fine-tuning exploration
+with short-context summary data. In our study: (1) GPT-4o underperformed, due
+to excessive subjective commentary. (2) Currently, long-context summarization
+mainly relies on memory ability, small LLMs with stable longer context lengths
+are the most cost-effective. Using long data concatenated from short-context
+summaries makes a significant improvement. (3) Prompt templates may cause a
+large performance gap but can be mitigated through fine-tuning. (4) Fine-tuned
+Chat or Instruction versions may harm the Base model and further fine-tuning
+cannot bridge the performance gap. (5) While models with RoPE base scaling exhibit
+strong extrapolation potential, their performance may vary significantly when
+combined with other interpolation methods and need careful selection. (6)
+CNNSum provides more reliable and insightful evaluation results than other
+benchmarks. We release CNNSum to advance research in this field.
+
+
+
+
+
+
+ + ♻ ☆ Adversarial Attacks on Large Language Models in Medicine + + +
+ The integration of Large Language Models (LLMs) into healthcare applications +offers promising advancements in medical diagnostics, treatment +recommendations, and patient care. However, the susceptibility of LLMs to +adversarial attacks poses a significant threat, potentially leading to harmful +outcomes in delicate medical contexts. This study investigates the +vulnerability of LLMs to two types of adversarial attacks in three medical +tasks. Utilizing real-world patient data, we demonstrate that both open-source +and proprietary LLMs are susceptible to manipulation across multiple tasks. +This research further reveals that domain-specific tasks demand more +adversarial data in model fine-tuning than general domain tasks for effective +attack execution, especially for more capable models. We discover that while +integrating adversarial data does not markedly degrade overall model +performance on medical benchmarks, it does lead to noticeable shifts in +fine-tuned model weights, suggesting a potential pathway for detecting and +countering model attacks. This research highlights the urgent need for robust +security measures and the development of defensive mechanisms to safeguard LLMs +in medical applications, to ensure their safe and effective deployment in +healthcare settings. + +
+
+
+
+
+ + ♻ ☆ GeoPos: A Minimal Positional Encoding for Enhanced Fine-Grained Details + in Image Synthesis Using Convolutional Neural Networks WACV 2025 + + +
+  The enduring inability of image generative models to recreate intricate
+geometric features, such as those present in human hands and fingers has been
+an ongoing problem in image generation for nearly a decade. While strides have
+been made by increasing model sizes and diversifying training datasets, this
+issue remains prevalent across all models, from denoising diffusion models to
+Generative Adversarial Networks (GAN), pointing to a fundamental shortcoming in
+the underlying architectures. In this paper, we demonstrate how this problem
+can be mitigated by augmenting convolution layers' geometric capabilities
+through providing them with a single input channel incorporating the relative
+n-dimensional Cartesian coordinate system. We show this drastically improves
+quality of images generated by Diffusion Models, GANs, and Variational
+AutoEncoders (VAE).
+
+
+
+ comment: Accepted at WACV 2025. Contains 19 pages, 15 figures, and 9 tables +
+
+
+
+
+ + ♻ ☆ Introducing the Large Medical Model: State of the art healthcare cost + and risk prediction with transformers trained on patient event sequences + + +
+  With U.S. healthcare spending approaching $5T (NHE Fact Sheet 2024), and 25%
+of it estimated to be wasteful (Waste in the US the health care system:
+estimated costs and potential for savings, n.d.), the need to better predict
+risk and optimal patient care is evermore important. This paper introduces the
+Large Medical Model (LMM), a generative pre-trained transformer (GPT) designed
+to guide and predict the broad facets of patient care and healthcare
+administration. The model is trained on medical event sequences from over 140M
+longitudinal patient claims records with a specialized vocabulary built from
+medical terminology systems and demonstrates a superior capability to forecast
+healthcare costs and identify potential risk factors. Through experimentation
+and validation, we showcase the LMM's proficiency not only in cost and risk
+predictions, but also in discerning intricate patterns within complex medical
+conditions and an ability to identify novel relationships in patient care. The
+LMM is able to improve both cost prediction by 14.1% over the best commercial
+models and chronic conditions prediction by 1.9% over the best transformer
+models in research predicting a broad set of conditions. The LMM is a
+substantial advancement in healthcare analytics, offering the potential to
+significantly enhance risk assessment, cost management, and personalized
+medicine.
+
+
+
+ comment: 10 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Learning in Wilson-Cowan model for metapopulation + + +
+ The Wilson-Cowan model for metapopulation, a Neural Mass Network Model, +treats different subcortical regions of the brain as connected nodes, with +connections representing various types of structural, functional, or effective +neuronal connectivity between these regions. Each region comprises interacting +populations of excitatory and inhibitory cells, consistent with the standard +Wilson-Cowan model. By incorporating stable attractors into such a +metapopulation model's dynamics, we transform it into a learning algorithm +capable of achieving high image and text classification accuracy. We test it on +MNIST and Fashion MNIST, in combination with convolutional neural networks, on +CIFAR-10 and TF-FLOWERS, and, in combination with a transformer architecture +(BERT), on IMDB, always showing high classification accuracy. These numerical +evaluations illustrate that minimal modifications to the Wilson-Cowan model for +metapopulation can reveal unique and previously unobserved dynamics. + +
+
+ comment: Paper Accepted in Neural Computation (in press) +
+
+
+
+
+ + ♻ ☆ Enhancing Novel Object Detection via Cooperative Foundational Models WACV 2025 + + +
+ In this work, we address the challenging and emergent problem of novel object +detection (NOD), focusing on the accurate detection of both known and novel +object categories during inference. Traditional object detection algorithms are +inherently closed-set, limiting their capability to handle NOD. We present a +novel approach to transform existing closed-set detectors into open-set +detectors. This transformation is achieved by leveraging the complementary +strengths of pre-trained foundational models, specifically CLIP and SAM, +through our cooperative mechanism. Furthermore, by integrating this mechanism +with state-of-the-art open-set detectors such as GDINO, we establish new +benchmarks in object detection performance. Our method achieves 17.42 mAP in +novel object detection and 42.08 mAP for known objects on the challenging LVIS +dataset. Adapting our approach to the COCO OVD split, we surpass the current +state-of-the-art by a margin of 7.2 $ \text{AP}_{50} $ for novel classes. Our +code is available at https://rohit901.github.io/coop-foundation-models/ . + +
+
+ comment: Accepted at WACV 2025 +
+
+
+
+
+ + ♻ ☆ HydraViT: Stacking Heads for a Scalable ViT NeurIPS'24 + + +
+ The architecture of Vision Transformers (ViTs), particularly the Multi-head +Attention (MHA) mechanism, imposes substantial hardware demands. Deploying ViTs +on devices with varying constraints, such as mobile phones, requires multiple +models of different sizes. However, this approach has limitations, such as +training and storing each required model separately. This paper introduces +HydraViT, a novel approach that addresses these limitations by stacking +attention heads to achieve a scalable ViT. By repeatedly changing the size of +the embedded dimensions throughout each layer and their corresponding number of +attention heads in MHA during training, HydraViT induces multiple subnetworks. +Thereby, HydraViT achieves adaptability across a wide spectrum of hardware +environments while maintaining performance. Our experimental results +demonstrate the efficacy of HydraViT in achieving a scalable ViT with up to 10 +subnetworks, covering a wide range of resource constraints. HydraViT achieves +up to 5 p.p. more accuracy with the same GMACs and up to 7 p.p. more accuracy +with the same throughput on ImageNet-1K compared to the baselines, making it an +effective solution for scenarios where hardware availability is diverse or +varies over time. Source code available at https://github.com/ds-kiel/HydraViT. + +
+
+ comment: Accepted at NeurIPS'24, please cite the conference version +
+
+
+
+
+ + ♻ ☆ Facility Location Games with Scaling Effects AAMAS-24 + + +
+ We take the classic facility location problem and consider a variation, in +which each agent's individual cost function is equal to their distance from the +facility multiplied by a scaling factor which is determined by the facility +placement. In addition to the general class of continuous scaling functions, we +also provide results for piecewise linear scaling functions which can +effectively approximate or model the scaling of many real world scenarios. We +focus on the objectives of total and maximum cost, describing the computation +of the optimal solution. We then move to the approximate mechanism design +setting, observing that the agents' preferences may no longer be single-peaked. +Consequently, we characterize the conditions on scaling functions which ensure +that agents have single-peaked preferences. Under these conditions, we find a +characterization of continuous, strategyproof, and anonymous mechanisms, and +compute the total and maximum cost approximation ratios achievable by these +mechanisms. + +
+
+ comment: This is an updated version of the paper which appeared at the 23rd + International Conference on Autonomous Agents and Multi-Agent Systems + (AAMAS-24) +
+
+
+
+
+ + ♻ ☆ CoSy: Evaluating Textual Explanations of Neurons + + +
+ A crucial aspect of understanding the complex nature of Deep Neural Networks +(DNNs) is the ability to explain learned concepts within their latent +representations. While methods exist to connect neurons to human-understandable +textual descriptions, evaluating the quality of these explanations is +challenging due to the lack of a unified quantitative approach. We introduce +CoSy (Concept Synthesis), a novel, architecture-agnostic framework for +evaluating textual explanations of latent neurons. Given textual explanations, +our proposed framework uses a generative model conditioned on textual input to +create data points representing the explanations. By comparing the neuron's +response to these generated data points and control data points, we can +estimate the quality of the explanation. We validate our framework through +sanity checks and benchmark various neuron description methods for Computer +Vision tasks, revealing significant differences in quality. + +
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ In-context learning and Occam's razor + + +
+ A central goal of machine learning is generalization. While the No Free Lunch +Theorem states that we cannot obtain theoretical guarantees for generalization +without further assumptions, in practice we observe that simple models which +explain the training data generalize best: a principle called Occam's razor. +Despite the need for simple models, most current approaches in machine learning +only minimize the training error, and at best indirectly promote simplicity +through regularization or architecture design. Here, we draw a connection +between Occam's razor and in-context learning: an emergent ability of certain +sequence models like Transformers to learn at inference time from past +observations in a sequence. In particular, we show that the next-token +prediction loss used to train in-context learners is directly equivalent to a +data compression technique called prequential coding, and that minimizing this +loss amounts to jointly minimizing both the training error and the complexity +of the model that was implicitly learned from context. Our theory and the +empirical experiments we use to support it not only provide a normative account +of in-context learning, but also elucidate the shortcomings of current +in-context learning methods, suggesting ways in which they can be improved. We +make our code available at https://github.com/3rdCore/PrequentialCode. + +
+
+
+
+
+ + ♻ ☆ Reachable Polyhedral Marching (RPM): An Exact Analysis Tool for + Deep-Learned Control Systems + + +
+ Neural networks are increasingly used in robotics as policies, state +transition models, state estimation models, or all of the above. With these +components being learned from data, it is important to be able to analyze what +behaviors were learned and how this affects closed-loop performance. In this +paper we take steps toward this goal by developing methods for computing +control invariant sets and regions of attraction (ROAs) of dynamical systems +represented as neural networks. We focus our attention on feedforward neural +networks with the rectified linear unit (ReLU) activation, which are known to +implement continuous piecewise-affine (PWA) functions. We describe the +Reachable Polyhedral Marching (RPM) algorithm for enumerating the affine pieces +of a neural network through an incremental connected walk. We then use this +algorithm to compute exact forward and backward reachable sets, from which we +provide methods for computing control invariant sets and ROAs. Our approach is +unique in that we find these sets incrementally, without Lyapunov-based tools. +In our examples we demonstrate the ability of our approach to find non-convex +control invariant sets and ROAs on tasks with learned van der Pol oscillator +and pendulum models. Further, we provide an accelerated algorithm for computing +ROAs that leverages the incremental and connected enumeration of affine regions +that RPM provides. We show this acceleration to lead to a 15x speedup in our +examples. Finally, we apply our methods to find a set of states that are +stabilized by an image-based controller for an aircraft runway control problem. + +
+
+ comment: Submitted to IEEE Transactions on Neural Networks and Learning + Systems. arXiv admin note: text overlap with arXiv:2011.11609 +
+
+
+
+
+ + ♻ ☆ A Complexity-Based Theory of Compositionality + + +
+ Compositionality is believed to be fundamental to intelligence. In humans, it +underlies the structure of thought, language, and higher-level reasoning. In +AI, compositional representations can enable a powerful form of +out-of-distribution generalization, in which a model systematically adapts to +novel combinations of known concepts. However, while we have strong intuitions +about what compositionality is, there currently exists no formal definition for +it that is measurable and mathematical. Here, we propose such a definition, +which we call representational compositionality, that accounts for and extends +our intuitions about compositionality. The definition is conceptually simple, +quantitative, grounded in algorithmic information theory, and applicable to any +representation. Intuitively, representational compositionality states that a +compositional representation satisfies three properties. First, it must be +expressive. Second, it must be possible to re-describe the representation as a +function of discrete symbolic sequences with re-combinable parts, analogous to +sentences in natural language. Third, the function that relates these symbolic +sequences to the representation, analogous to semantics in natural language, +must be simple. Through experiments on both synthetic and real world data, we +validate our definition of compositionality and show how it unifies disparate +intuitions from across the literature in both AI and cognitive science. We also +show that representational compositionality, while theoretically intractable, +can be readily estimated using standard deep learning tools. Our definition has +the potential to inspire the design of novel, theoretically-driven models that +better capture the mechanisms of compositional thought. + +
+
+
+
+
+ + ♻ ☆ Cross-domain and Cross-dimension Learning for Image-to-Graph + Transformers + + +
+ Direct image-to-graph transformation is a challenging task that involves +solving object detection and relationship prediction in a single model. Due to +this task's complexity, large training datasets are rare in many domains, +making the training of deep-learning methods challenging. This data sparsity +necessitates transfer learning strategies akin to the state-of-the-art in +general computer vision. In this work, we introduce a set of methods enabling +cross-domain and cross-dimension learning for image-to-graph transformers. We +propose (1) a regularized edge sampling loss to effectively learn object +relations in multiple domains with different numbers of edges, (2) a domain +adaptation framework for image-to-graph transformers aligning image- and +graph-level features from different domains, and (3) a projection function that +allows using 2D data for training 3D transformers. We demonstrate our method's +utility in cross-domain and cross-dimension experiments, where we utilize +labeled data from 2D road networks for simultaneous learning in vastly +different target domains. Our method consistently outperforms standard transfer +learning and self-supervised pretraining on challenging benchmarks, such as +retinal or whole-brain vessel graph extraction. + +
+
+
+
+
+ + ♻ ☆ Model-GLUE: Democratized LLM Scaling for A Large Model Zoo in the Wild NeurIPS 2024 + + +
+ As Large Language Models (LLMs) excel across tasks and specialized domains, +scaling LLMs based on existing models has garnered significant attention, which +faces the challenge of decreasing performance when combining disparate models. +Various techniques have been proposed for the aggregation of pre-trained LLMs, +including model merging, Mixture-of-Experts, and stacking. Despite their +merits, a comprehensive comparison and synergistic application of them to a +diverse model zoo is yet to be adequately addressed. In light of this research +gap, this paper introduces Model-GLUE, a holistic LLM scaling guideline. First, +our work starts with a benchmarking of existing LLM scaling techniques, +especially selective merging, and variants of mixture. Utilizing the insights +from the benchmark results, we formulate an optimal strategy for the selection +and aggregation of a heterogeneous model zoo characterizing different +architectures and initialization.Our methodology involves the clustering of +mergeable models and optimal merging strategy selection, and the integration of +clusters through a model mixture. Finally, evidenced by our experiments on a +diverse Llama-2-based model zoo, Model-GLUE shows an average performance +enhancement of 5.61%, achieved without additional training. Codes are available +at: https://github.com/Model-GLUE/Model-GLUE. + +
+
+ comment: 24 pages, 4 figures, accepted to NeurIPS 2024 Datasets and Benchmarks + Track +
+
+
+
+
+ + ♻ ☆ PBP: Post-training Backdoor Purification for Malware Classifiers NDSS 2025 + + +
+ In recent years, the rise of machine learning (ML) in cybersecurity has +brought new challenges, including the increasing threat of backdoor poisoning +attacks on ML malware classifiers. For instance, adversaries could inject +malicious samples into public malware repositories, contaminating the training +data and potentially misclassifying malware by the ML model. Current +countermeasures predominantly focus on detecting poisoned samples by leveraging +disagreements within the outputs of a diverse set of ensemble models on +training data points. However, these methods are not suitable for scenarios +where Machine Learning-as-a-Service (MLaaS) is used or when users aim to remove +backdoors from a model after it has been trained. Addressing this scenario, we +introduce PBP, a post-training defense for malware classifiers that mitigates +various types of backdoor embeddings without assuming any specific backdoor +embedding mechanism. Our method exploits the influence of backdoor attacks on +the activation distribution of neural networks, independent of the +trigger-embedding method. In the presence of a backdoor attack, the activation +distribution of each layer is distorted into a mixture of distributions. By +regulating the statistics of the batch normalization layers, we can guide a +backdoored model to perform similarly to a clean one. Our method demonstrates +substantial advantages over several state-of-the-art methods, as evidenced by +experiments on two datasets, two types of backdoor methods, and various attack +configurations. Notably, our approach requires only a small portion of the +training data -- only 1\% -- to purify the backdoor and reduce the attack +success rate from 100\% to almost 0\%, a 100-fold improvement over the baseline +methods. Our code is available at +\url{https://github.com/judydnguyen/pbp-backdoor-purification-official}. + +
+
+ comment: Accepted at NDSS 2025 +
+
+
+
+
+ + ♻ ☆ SwiftKV: Fast Prefill-Optimized Inference with Knowledge-Preserving + Model Transformation + + +
+ LLM inference for popular enterprise use cases, such as summarization, RAG, +and code-generation, typically observes orders of magnitude longer prompt +lengths than generation lengths. This characteristic leads to high cost of +prefill and increased response latency. In this paper, we present SwiftKV, a +novel model transformation and distillation procedure specifically designed to +reduce the time and cost of processing prompt tokens while preserving high +quality of generated tokens. SwiftKV combines three key mechanisms: i) +SingleInputKV, which prefills later layers' KV cache using a much earlier +layer's output, allowing prompt tokens to skip much of the model computation, +ii) AcrossKV, which merges the KV caches of neighboring layers to reduce the +memory footprint and support larger batch size for higher throughput, and iii) +a knowledge-preserving distillation procedure that can adapt existing LLMs for +SwiftKV with minimal accuracy impact and low compute and data requirement. For +Llama-3.1-8B and 70B, SwiftKV reduces the compute requirement of prefill by 50% +and the memory requirement of the KV cache by 62.5% while incurring minimum +quality degradation across a wide range of tasks. In the end-to-end inference +serving using an optimized vLLM implementation, SwiftKV realizes up to 2x +higher aggregate throughput and 60% lower time per output token. It can achieve +a staggering 560 TFlops/GPU of normalized inference throughput, which +translates to 16K tokens/s for Llama-3.1-70B in 16-bit precision on 4x H100 +GPUs. Our training, inference, and model implementations are open-sourced and +can be found through +https://huggingface.co/collections/Snowflake/swiftkv-models-674f7d7474eb789e185d31cb. + +
+
+
+
+
+ + ♻ ☆ Dockformer: A transformer-based molecular docking paradigm for + large-scale virtual screening + + +
+ Molecular docking is a crucial step in drug development, which enables the +virtual screening of compound libraries to identify potential ligands that +target proteins of interest. However, the computational complexity of +traditional docking models increases as the size of the compound library +increases. Recently, deep learning algorithms can provide data-driven research +and development models to increase the speed of the docking process. +Unfortunately, few models can achieve superior screening performance compared +to that of traditional models. Therefore, a novel deep learning-based docking +approach named Dockformer is introduced in this study. Dockformer leverages +multimodal information to capture the geometric topology and structural +knowledge of molecules and can directly generate binding conformations with the +corresponding confidence measures in an end-to-end manner. The experimental +results show that Dockformer achieves success rates of 90.53% and 82.71% on the +PDBbind core set and PoseBusters benchmarks, respectively, and more than a +100-fold increase in the inference process speed, outperforming almost all +state-of-the-art docking methods. In addition, the ability of Dockformer to +identify the main protease inhibitors of coronaviruses is demonstrated in a +real-world virtual screening scenario. Considering its high docking accuracy +and screening efficiency, Dockformer can be regarded as a powerful and robust +tool in the field of drug design. + +
+
+ comment: 15 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ MetricGold: Leveraging Text-To-Image Latent Diffusion Models for Metric + Depth Estimation + + +
+ Recovering metric depth from a single image remains a fundamental challenge +in computer vision, requiring both scene understanding and accurate scaling. +While deep learning has advanced monocular depth estimation, current models +often struggle with unfamiliar scenes and layouts, particularly in zero-shot +scenarios and when predicting scale-ergodic metric depth. We present +MetricGold, a novel approach that harnesses generative diffusion model's rich +priors to improve metric depth estimation. Building upon recent advances in +MariGold, DDVM and Depth Anything V2 respectively, our method combines latent +diffusion, log-scaled metric depth representation, and synthetic data training. +MetricGold achieves efficient training on a single RTX 3090 within two days +using photo-realistic synthetic data from HyperSIM, VirtualKitti, and +TartanAir. Our experiments demonstrate robust generalization across diverse +datasets, producing sharper and higher quality metric depth estimates compared +to existing approaches. + +
+
+
+
+
+ + ♻ ☆ Agent-OM: Leveraging LLM Agents for Ontology Matching + + +
+ Ontology matching (OM) enables semantic interoperability between different +ontologies and resolves their conceptual heterogeneity by aligning related +entities. OM systems currently have two prevailing design paradigms: +conventional knowledge-based expert systems and newer machine learning-based +predictive systems. While large language models (LLMs) and LLM agents have +revolutionised data engineering and have been applied creatively in many +domains, their potential for OM remains underexplored. This study introduces a +novel agent-powered LLM-based design paradigm for OM systems. With +consideration of several specific challenges in leveraging LLM agents for OM, +we propose a generic framework, namely Agent-OM (Agent for Ontology Matching), +consisting of two Siamese agents for retrieval and matching, with a set of +simple OM tools. Our framework is implemented in a proof-of-concept system. +Evaluations of three Ontology Alignment Evaluation Initiative (OAEI) tracks +over state-of-the-art OM systems show that our system can achieve results very +close to the long-standing best performance on simple OM tasks and can +significantly improve the performance on complex and few-shot OM tasks. + +
+
+ comment: 14 pages, 13 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ Spatio-Temporal Context Prompting for Zero-Shot Action Detection WACV2025 + + +
+ Spatio-temporal action detection encompasses the tasks of localizing and +classifying individual actions within a video. Recent works aim to enhance this +process by incorporating interaction modeling, which captures the relationship +between people and their surrounding context. However, these approaches have +primarily focused on fully-supervised learning, and the current limitation lies +in the lack of generalization capability to recognize unseen action categories. +In this paper, we aim to adapt the pretrained image-language models to detect +unseen actions. To this end, we propose a method which can effectively leverage +the rich knowledge of visual-language models to perform Person-Context +Interaction. Meanwhile, our Context Prompting module will utilize contextual +information to prompt labels, thereby enhancing the generation of more +representative text features. Moreover, to address the challenge of recognizing +distinct actions by multiple people at the same timestamp, we design the +Interest Token Spotting mechanism which employs pretrained visual knowledge to +find each person's interest context tokens, and then these tokens will be used +for prompting to generate text features tailored to each individual. To +evaluate the ability to detect unseen actions, we propose a comprehensive +benchmark on J-HMDB, UCF101-24, and AVA datasets. The experiments show that our +method achieves superior results compared to previous approaches and can be +further extended to multi-action videos, bringing it closer to real-world +applications. The code and data can be found in +https://webber2933.github.io/ST-CLIP-project-page. + +
+
+ comment: Accepted by WACV2025. Project page: + https://webber2933.github.io/ST-CLIP-project-page +
+
+
+
+
+ + ♻ ☆ Adaptive Circuit Behavior and Generalization in Mechanistic + Interpretability + + +
+ Mechanistic interpretability aims to understand the inner workings of large
+neural networks by identifying circuits, or minimal subgraphs within the model
+that implement algorithms responsible for performing specific tasks. These
+circuits are typically discovered and analyzed using a narrowly defined prompt
+format. However, given the abilities of large language models (LLMs) to
+generalize across various prompt formats for the same task, it remains unclear
+how well these circuits generalize. For instance, it is unclear whether the
+model's generalization results from reusing the same circuit components, the
+components behaving differently, or the use of entirely different components.
+In this paper, we investigate the generality of the indirect object
+identification (IOI) circuit in GPT-2 small, which is well-studied and believed
+to implement a simple, interpretable algorithm. We evaluate its performance on
+prompt variants that challenge the assumptions of this algorithm. Our findings
+reveal that the circuit generalizes surprisingly well, reusing all of its
+components and mechanisms while only adding additional input edges. Notably,
+the circuit generalizes even to prompt variants where the original algorithm
+should fail; we discover a mechanism that explains this which we term S2
+Hacking. Our findings indicate that circuits within LLMs may be more flexible
+and general than previously recognized, underscoring the importance of studying
+circuit generalization to better understand the broader capabilities of these
+models.
+
+
+ comment: 10 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ When Stability meets Sufficiency: Informative Explanations that do not + Overwhelm + + +
+ Recent studies evaluating various criteria for explainable artificial +intelligence (XAI) suggest that fidelity, stability, and comprehensibility are +among the most important metrics considered by users of AI across a diverse +collection of usage contexts. We consider these criteria as applied to +feature-based attribution methods, which are amongst the most prevalent in XAI +literature. Going beyond standard correlation, methods have been proposed that +highlight what should be minimally sufficient to justify the classification of +an input (viz. pertinent positives). While minimal sufficiency is an attractive +property akin to comprehensibility, the resulting explanations are often too +sparse for a human to understand and evaluate the local behavior of the model. +To overcome these limitations, we incorporate the criteria of stability and +fidelity and propose a novel method called Path-Sufficient Explanations Method +(PSEM) that outputs a sequence of stable and sufficient explanations for a +given input of strictly decreasing size (or value) -- from original input to a +minimally sufficient explanation -- which can be thought to trace the local +boundary of the model in a stable manner, thus providing better intuition about +the local model behavior for the specific input. We validate these claims, both +qualitatively and quantitatively, with experiments that show the benefit of +PSEM across three modalities (image, tabular and text) as well as versus other +path explanations. A user study depicts the strength of the method in +communicating the local behavior, where (many) users are able to correctly +determine the prediction made by a model. + +
+
+ comment: Published at TMLR +
+
+
+
+
+ + ♻ ☆ MC-LLaVA: Multi-Concept Personalized Vision-Language Model + + +
+ Current vision-language models (VLMs) show exceptional abilities across +diverse tasks including visual question answering. To enhance user experience +in practical applications, recent studies investigate VLM personalization to +understand user-provided concepts. However, existing studies mainly focus on +single-concept personalization, neglecting the existence and interplay of +multiple concepts, which limits the real-world applicability of personalized +VLMs. In this paper, we propose the first multi-concept personalization method +named MC-LLaVA along with a high-quality multi-concept personalization dataset. +Specifically, MC-LLaVA uses a joint training strategy incorporating multiple +concepts in a single training step, allowing VLMs to perform accurately in +multi-concept personalization. To reduce the cost of joint training, MC-LLaVA +leverages visual token information for concept token initialization, yielding +improved concept representation and accelerating joint training. To advance +multi-concept personalization research, we further contribute a high-quality +dataset. We carefully collect images from various movies that contain multiple +characters and manually generate the multi-concept question-answer samples. Our +dataset features diverse movie types and question-answer types. We conduct +comprehensive qualitative and quantitative experiments to demonstrate that +MC-LLaVA can achieve impressive multi-concept personalized responses, paving +the way for VLMs to become better user-specific assistants. The code and +dataset will be publicly available at https://github.com/arctanxarc/MC-LLaVA. + +
+
+
+
+
+ + ♻ ☆ Learning Semantic Association Rules from Internet of Things Data + + +
+ Association Rule Mining (ARM) is the task of discovering commonalities in +data in the form of logical implications. ARM is used in the Internet of Things +(IoT) for different tasks including monitoring and decision-making. However, +existing methods give limited consideration to IoT-specific requirements such +as heterogeneity and volume. Furthermore, they do not utilize important static +domain-specific description data about IoT systems, which is increasingly +represented as knowledge graphs. In this paper, we propose a novel ARM pipeline +for IoT data that utilizes both dynamic sensor data and static IoT system +metadata. Furthermore, we propose an Autoencoder-based Neurosymbolic ARM method +(Aerial) as part of the pipeline to address the high volume of IoT data and +reduce the total number of rules that are resource-intensive to process. Aerial +learns a neural representation of a given data and extracts association rules +from this representation by exploiting the reconstruction (decoding) mechanism +of an autoencoder. Extensive evaluations on 3 IoT datasets from 2 domains show +that ARM on both static and dynamic IoT data results in more generically +applicable rules while Aerial can learn a more concise set of high-quality +association rules than the state-of-the-art with full coverage over the +datasets. + +
+
+
+
+
+ + ♻ ☆ DeiSAM: Segment Anything with Deictic Prompting NeurIPS 2024 + + +
+ Large-scale, pre-trained neural networks have demonstrated strong +capabilities in various tasks, including zero-shot image segmentation. To +identify concrete objects in complex scenes, humans instinctively rely on +deictic descriptions in natural language, i.e., referring to something +depending on the context such as "The object that is on the desk and behind the +cup.". However, deep learning approaches cannot reliably interpret such deictic +representations due to their lack of reasoning capabilities in complex +scenarios. To remedy this issue, we propose DeiSAM -- a combination of large +pre-trained neural networks with differentiable logic reasoners -- for deictic +promptable segmentation. Given a complex, textual segmentation description, +DeiSAM leverages Large Language Models (LLMs) to generate first-order logic +rules and performs differentiable forward reasoning on generated scene graphs. +Subsequently, DeiSAM segments objects by matching them to the logically +inferred image regions. As part of our evaluation, we propose the Deictic +Visual Genome (DeiVG) dataset, containing paired visual input and complex, +deictic textual prompts. Our empirical results demonstrate that DeiSAM is a +substantial improvement over purely data-driven baselines for deictic +promptable segmentation. + +
+
+ comment: Published as a conference paper at NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Stochastic Monkeys at Play: Random Augmentations Cheaply Break LLM + Safety Alignment + + +
+ Safety alignment of Large Language Models (LLMs) has recently become a +critical objective of model developers. In response, a growing body of work has +been investigating how safety alignment can be bypassed through various +jailbreaking methods, such as adversarial attacks. However, these jailbreak +methods can be rather costly or involve a non-trivial amount of creativity and +effort, introducing the assumption that malicious users are high-resource or +sophisticated. In this paper, we study how simple random augmentations to the +input prompt affect safety alignment effectiveness in state-of-the-art LLMs, +such as Llama 3 and Qwen 2. We perform an in-depth evaluation of 17 different +models and investigate the intersection of safety under random augmentations +with multiple dimensions: augmentation type, model size, quantization, +fine-tuning-based defenses, and decoding strategies (e.g., sampling +temperature). We show that low-resource and unsophisticated attackers, i.e. +$\textit{stochastic monkeys}$, can significantly improve their chances of +bypassing alignment with just 25 random augmentations per prompt. Source code +and data: https://github.com/uiuc-focal-lab/stochastic-monkeys/ + +
+
+ comment: v2: Updated with changes from peer review rebuttal. v1: Version under + peer review +
+
+
+
+
+ + ♻ ☆ Lexicalization Is All You Need: Examining the Impact of Lexical + Knowledge in a Compositional QALD System + + +
+ In this paper, we examine the impact of lexicalization on Question Answering +over Linked Data (QALD). It is well known that one of the key challenges in +interpreting natural language questions with respect to SPARQL lies in bridging +the lexical gap, that is mapping the words in the query to the correct +vocabulary elements. We argue in this paper that lexicalization, that is +explicit knowledge about the potential interpretations of a word with respect +to the given vocabulary, significantly eases the task and increases the +performance of QA systems. Towards this goal, we present a compositional QA +system that can leverage explicit lexical knowledge in a compositional manner +to infer the meaning of a question in terms of a SPARQL query. We show that +such a system, given lexical knowledge, has a performance well beyond current +QA systems, achieving up to a $35.8\%$ increase in the micro $F_1$ score +compared to the best QA system on QALD-9. This shows the importance and +potential of including explicit lexical knowledge. In contrast, we show that +LLMs have limited abilities to exploit lexical knowledge, with only marginal +improvements compared to a version without lexical knowledge. This shows that +LLMs have no ability to compositionally interpret a question on the basis of +the meaning of its parts, a key feature of compositional approaches. Taken +together, our work shows new avenues for QALD research, emphasizing the +importance of lexicalization and compositionality. + +
+
+ comment: 24th International Conference on Knowledge Engineering and Knowledge + Management (EKAW 2024), November 26-28, 2024, Amsterdam, The Netherlands +
+
+
+
+
+ + ♻ ☆ Efficiently Learning at Test-Time: Active Fine-Tuning of LLMs + + +
+ Recent efforts in fine-tuning language models often rely on automatic data +selection, commonly using Nearest Neighbors retrieval from large datasets. +However, we theoretically show that this approach tends to select redundant +data, limiting its effectiveness or even hurting performance. To address this, +we introduce SIFT, a data selection algorithm designed to reduce uncertainty +about the model's response given a prompt, which unifies ideas from retrieval +and active learning. Whereas Nearest Neighbor retrieval typically fails in the +presence of information duplication, SIFT accounts for information duplication +and optimizes the overall information gain of the selected examples. We focus +our evaluations on fine-tuning at test-time for prompt-specific language +modeling on the Pile dataset, and show that SIFT consistently outperforms +Nearest Neighbor retrieval, with minimal computational overhead. Moreover, we +show that our uncertainty estimates can predict the performance gain of +test-time fine-tuning, and use this to develop an adaptive algorithm that +invests test-time compute proportional to realized performance gains. We +provide the $\texttt{activeft}$ (Active Fine-Tuning) library which can be used +as a drop-in replacement for Nearest Neighbor retrieval. + +
+
+
+
+
+ + ♻ ☆ Memory-efficient Continual Learning with Neural Collapse Contrastive WACV 2025 + + +
+ Contrastive learning has significantly improved representation quality, +enhancing knowledge transfer across tasks in continual learning (CL). However, +catastrophic forgetting remains a key challenge, as contrastive based methods +primarily focus on "soft relationships" or "softness" between samples, which +shift with changing data distributions and lead to representation overlap +across tasks. Recently, the newly identified Neural Collapse phenomenon has +shown promise in CL by focusing on "hard relationships" or "hardness" between +samples and fixed prototypes. However, this approach overlooks "softness", +crucial for capturing intra-class variability, and this rigid focus can also +pull old class representations toward current ones, increasing forgetting. +Building on these insights, we propose Focal Neural Collapse Contrastive +(FNC2), a novel representation learning loss that effectively balances both +soft and hard relationships. Additionally, we introduce the Hardness-Softness +Distillation (HSD) loss to progressively preserve the knowledge gained from +these relationships across tasks. Our method outperforms state-of-the-art +approaches, particularly in minimizing memory reliance. Remarkably, even +without the use of memory, our approach rivals rehearsal-based methods, +offering a compelling solution for data privacy concerns. + +
+
+ comment: Accepted at WACV 2025 +
+
+
+
+
+ + ♻ ☆ Benchmarking Foundation Models on Exceptional Cases: Dataset Creation + and Validation EMNLP 2024 + + +
+ Foundation models (FMs) have achieved significant success across various +tasks, leading to research on benchmarks for reasoning abilities. However, +there is a lack of studies on FMs performance in exceptional scenarios, which +we define as out-of-distribution (OOD) reasoning tasks. This paper is the first +to address these cases, developing a novel dataset for evaluation of FMs across +multiple modalities, including graphic novels, calligraphy, news articles, and +lyrics. It includes tasks for instance classification, character recognition, +token prediction, and text generation. The paper also proposes prompt +engineering techniques like Chain-of-Thought (CoT) and CoT+Few-Shot to enhance +performance. Validation of FMs using various methods revealed improvements. The +code repository is accessible at: +https://github.com/MLAI-Yonsei/ExceptionalBenchmark + +
+
+ comment: EMNLP 2024 Workshop + Genbench(https://genbench.org/workshop_programme/) +
+
+
+
+
+ + ♻ ☆ PePR: Performance Per Resource Unit as a Metric to Promote Small-Scale + Deep Learning in Medical Image Analysis + + +
+ The recent advances in deep learning (DL) have been accelerated by access to +large-scale data and compute. These large-scale resources have been used to +train progressively larger models which are resource intensive in terms of +compute, data, energy, and carbon emissions. These costs are becoming a new +type of entry barrier to researchers and practitioners with limited access to +resources at such scale, particularly in the Global South. In this work, we +take a comprehensive look at the landscape of existing DL models for medical +image analysis tasks and demonstrate their usefulness in settings where +resources are limited. To account for the resource consumption of DL models, we +introduce a novel measure to estimate the performance per resource unit, which +we call the PePR score. Using a diverse family of 131 unique DL architectures +(spanning 1M to 130M trainable parameters) and three medical image datasets, we +capture trends about the performance-resource trade-offs. In applications like +medical image analysis, we argue that small-scale, specialized models are +better than striving for large-scale models. Furthermore, we show that using +existing pretrained models that are fine-tuned on new data can significantly +reduce the computational resources and data required compared to training +models from scratch. We hope this work will encourage the community to focus on +improving AI equity by developing methods and models with smaller resource +footprints. + +
+
+ comment: Accepted to be published at the Northern Lights Deep Learning + Conference (NLDL), 2025. Source code available at + https://github.com/saintslab/PePR +
+
+
+
+
+ + ♻ ☆ A Deep RL Approach on Task Placement and Scaling of Edge Resources for + Cellular Vehicle-to-Network Service Provisioning + + +
+ Cellular-Vehicle-to-Everything (C-V2X) is currently at the forefront of the +digital transformation of our society. By enabling vehicles to communicate with +each other and with the traffic environment using cellular networks, we +redefine transportation, improving road safety and transportation services, +increasing efficiency of vehicular traffic flows, and reducing environmental +impact. To effectively facilitate the provisioning of Cellular +Vehicular-to-Network (C-V2N) services, we tackle the interdependent problems of +service task placement and scaling of edge resources. Specifically, we +formulate the joint problem and prove that it is not computationally tractable. +To address its complexity we propose Deep Hybrid Policy Gradient (DHPG), a new +Deep Reinforcement Learning (DRL) approach that operates in hybrid action +spaces, enabling holistic decision-making and enhancing overall performance. We +evaluated the performance of DHPG using simulations with a real-world C-V2N +traffic dataset, comparing it to several state-of-the-art (SoA) solutions. DHPG +outperforms these solutions, guaranteeing the $99^{th}$ percentile of C-V2N +service delay target, while simultaneously optimizing the utilization of +computing resources. Finally, time complexity analysis is conducted to verify +that the proposed approach can support real-time C-V2N services. + +
+
+ comment: This paper has been submitted to IEEE Transactions on Network and + Service Management +
+
+
+
+
+ + ♻ ☆ Transferring disentangled representations: bridging the gap between + synthetic and real images + + +
+ Developing meaningful and efficient representations that separate the +fundamental structure of the data generation mechanism is crucial in +representation learning. However, Disentangled Representation Learning has not +fully shown its potential on real images, because of correlated generative +factors, their resolution and limited access to ground truth labels. +Specifically on the latter, we investigate the possibility of leveraging +synthetic data to learn general-purpose disentangled representations applicable +to real data, discussing the effect of fine-tuning and what properties of +disentanglement are preserved after the transfer. We provide an extensive +empirical study to address these issues. In addition, we propose a new +interpretable intervention-based metric, to measure the quality of factors +encoding in the representation. Our results indicate that some level of +disentanglement, transferring a representation from synthetic to real data, is +possible and effective. + +
+
+
+
+
+ + ♻ ☆ FPANet: Frequency-based Video Demoireing using Frame-level Post + Alignment + + +
+ Moire patterns, created by the interference between overlapping grid patterns
+in the pixel space, degrade the visual quality of images and videos. Therefore,
+removing such patterns (demoireing) is crucial, yet remains a challenge due to
+their complexities in sizes and distortions. Conventional methods mainly tackle
+this task by only exploiting the spatial domain of the input images, limiting
+their capabilities in removing large-scale moire patterns. Therefore, this work
+proposes FPANet, an image-video demoireing network that learns filters in both
+frequency and spatial domains, improving the restoration quality by removing
+various sizes of moire patterns. To further enhance, our model takes multiple
+consecutive frames, learning to extract frame-invariant content features and
+outputting better quality temporally consistent images. We demonstrate the
+effectiveness of our proposed method with a publicly available large-scale
+dataset, observing that ours outperforms the state-of-the-art approaches in
+terms of image and video quality metrics and visual experience.
+
+
+ comment: Accepted version, to appear in Neural Networks +
+
+
+
+
+ + ♻ ☆ Online SLA Decomposition: Enabling Real-Time Adaptation to Evolving + Systems ICML + + +
+ When a network slice spans multiple technology domains, it is crucial for +each domain to uphold the End-to-End (E2E) Service Level Agreement (SLA) +associated with the slice. Consequently, the E2E SLA must be properly +decomposed into partial SLAs that are assigned to each domain involved. In a +network slice management system with a two-level architecture, comprising an +E2E service orchestrator and local domain controllers, we consider that the +orchestrator has access solely to historical data regarding the responses of +local controllers to previous requests, and this information is used to +construct a risk model for each domain. In this study, we extend our previous +work by investigating the dynamic nature of real-world systems and introducing +an online learning-decomposition framework to tackle the dynamicity. We propose +a framework that periodically updates the risk models based on the most recent +feedback. This approach leverages key components such as online gradient +descent and FIFO memory buffers, which enhance the stability and robustness of +the overall process. Our empirical study on an analytic model-based simulator +demonstrates that the proposed framework outperforms the state-of-the-art +static approach, providing more accurate and resilient SLA decomposition even +under varying conditions and limited data scenarios. + +
+
+ comment: The paper has been submitted to IEEE ICMLCN 2025 +
+
+
+
+
+ + ♻ ☆ Deep learning empowered sensor fusion boosts infant movement + classification + + +
+ To assess the integrity of the developing nervous system, the Prechtl general +movement assessment (GMA) is recognized for its clinical value in diagnosing +neurological impairments in early infancy. GMA has been increasingly augmented +through machine learning approaches intending to scale-up its application, +circumvent costs in the training of human assessors and further standardize +classification of spontaneous motor patterns. Available deep learning tools, +all of which are based on single sensor modalities, are however still +considerably inferior to that of well-trained human assessors. These approaches +are hardly comparable as all models are designed, trained and evaluated on +proprietary/silo-data sets. With this study we propose a sensor fusion approach +for assessing fidgety movements (FMs). FMs were recorded from 51 typically +developing participants. We compared three different sensor modalities +(pressure, inertial, and visual sensors). Various combinations and two sensor +fusion approaches (late and early fusion) for infant movement classification +were tested to evaluate whether a multi-sensor system outperforms single +modality assessments. Convolutional neural network (CNN) architectures were +used to classify movement patterns. The performance of the three-sensor fusion +(classification accuracy of 94.5%) was significantly higher than that of any +single modality evaluated. We show that the sensor fusion approach is a +promising avenue for automated classification of infant motor patterns. The +development of a robust sensor fusion system may significantly enhance AI-based +early recognition of neurofunctions, ultimately facilitating automated early +detection of neurodevelopmental conditions. + +
+
+
+
+
+ + ♻ ☆ Hybrid-SQuAD: Hybrid Scholarly Question Answering Dataset + + +
+ Existing Scholarly Question Answering (QA) methods typically target +homogeneous data sources, relying solely on either text or Knowledge Graphs +(KGs). However, scholarly information often spans heterogeneous sources, +necessitating the development of QA systems that integrate information from +multiple heterogeneous data sources. To address this challenge, we introduce +Hybrid-SQuAD (Hybrid Scholarly Question Answering Dataset), a novel large-scale +QA dataset designed to facilitate answering questions incorporating both text +and KG facts. The dataset consists of 10.5K question-answer pairs generated by +a large language model, leveraging the KGs DBLP and SemOpenAlex alongside +corresponding text from Wikipedia. In addition, we propose a RAG-based baseline +hybrid QA model, achieving an exact match score of 69.65 on the Hybrid-SQuAD +test set. + +
+
+
+
+
+ + ♻ ☆ Bayesian Networks for Causal Analysis in Socioecological Systems + + +
+ Causal and counterfactual reasoning are emerging directions in data science +that allow us to reason about hypothetical scenarios. This is particularly +useful in fields like environmental and ecological sciences, where +interventional data are usually not available. Structural causal models are +probabilistic models for causal analysis that simplify this kind of reasoning +due to their graphical representation. They can be regarded as extensions of +the so-called Bayesian networks, a well known modeling tool commonly used in +environmental and ecological problems. The main contribution of this paper is +to analyze the relations of necessity and sufficiency between the variables of +a socioecological system using counterfactual reasoning with Bayesian networks. +In particular, we consider a case study involving socioeconomic factors and +land-uses in southern Spain. In addition, this paper aims to be a coherent +overview of the fundamental concepts for applying counterfactual reasoning, so +that environmental researchers with a background in Bayesian networks can +easily take advantage of the structural causal model formalism. + +
+
+
+
+
+ + ♻ ☆ Quest: Query-centric Data Synthesis Approach for Long-context Scaling of + Large Language Model + + +
+ Recent advancements in large language models (LLMs) have highlighted the +importance of extending context lengths for handling complex tasks. While +traditional methods for training on long contexts often use filtered long +documents, these approaches lead to domain imbalances, limiting model +performance. To address this, techniques like random document concatenation +(Standard) and similarity-based methods (KNN, ICLM) have been developed. +However, they either sacrifice semantic coherence or diversity. To balance both +aspects, we introduce Quest, a query-centric data synthesis method aggregating +semantically relevant yet diverse documents. Quest uses a generative model to +predict potential queries for each document, grouping documents with similar +queries and keywords. Extensive experiments demonstrate Quest's superior +performance on long-context tasks, achieving remarkable results with context +lengths of up to 1M tokens and confirming its scalability across various model +sizes. + +
+
+
+
+
+ + ♻ ☆ PDNNet: PDN-Aware GNN-CNN Heterogeneous Network for Dynamic IR Drop + Prediction + + +
+ IR drop on the power delivery network (PDN) is closely related to PDN's +configuration and cell current consumption. As the integrated circuit (IC) +design is growing larger, dynamic IR drop simulation becomes computationally +unaffordable and machine learning based IR drop prediction has been explored as +a promising solution. Although CNN-based methods have been adapted to IR drop +prediction task in several works, the shortcomings of overlooking PDN +configuration is non-negligible. In this paper, we consider not only how to +properly represent cell-PDN relation, but also how to model IR drop following +its physical nature in the feature aggregation procedure. Thus, we propose a +novel graph structure, PDNGraph, to unify the representations of the PDN +structure and the fine-grained cell-PDN relation. We further propose a +dual-branch heterogeneous network, PDNNet, incorporating two parallel GNN-CNN +branches to favorably capture the above features during the learning process. +Several key designs are presented to make the dynamic IR drop prediction highly +effective and interpretable. We are the first work to apply graph structure to +deep-learning based dynamic IR drop prediction method. Experiments show that +PDNNet outperforms the state-of-the-art CNN-based methods and achieves 545x +speedup compared to the commercial tool, which demonstrates the superiority of +our method. + +
+
+
+
+
+ + ♻ ☆ R-MTLLMF: Resilient Multi-Task Large Language Model Fusion at the + Wireless Edge + + +
+ Multi-task large language models (MTLLMs) are important for many applications
+at the wireless edge, where users demand specialized models to handle multiple
+tasks efficiently. However, training MTLLMs is complex and exhaustive,
+particularly when tasks are subject to change. Recently, the concept of model
+fusion via task vectors has emerged as an efficient approach for combining
+fine-tuning parameters to produce an MTLLM. In this paper, the problem of
+enabling edge users to collaboratively craft such MTLLMs via task vectors is
+studied, under the assumption of worst-case adversarial attacks. To this end,
+first the influence of adversarial noise to multi-task model fusion is
+investigated and a relationship between the so-called weight disentanglement
+error and the mean squared error (MSE) is derived. Using hypothesis testing, it
+is directly shown that the MSE increases interference between task vectors,
+thereby rendering model fusion ineffective. Then, a novel resilient MTLLM
+fusion (R-MTLLMF) is proposed, which leverages insights about the LLM
+architecture and fine-tuning process to safeguard task vector aggregation under
+adversarial noise by realigning the MTLLM. The proposed R-MTLLMF is then
+compared for both worst-case and ideal transmission scenarios to study the
+impact of the wireless channel. Extensive model fusion experiments with vision
+LLMs demonstrate R-MTLLMF's effectiveness, achieving close-to-baseline
+performance across eight different tasks in ideal noise scenarios and
+significantly outperforming unprotected model fusion in worst-case scenarios.
+The results further advocate for additional physical layer protection for a
+holistic approach to resilience, from both a wireless and LLM perspective.
+
+
+
+
+
+ + ♻ ☆ Tencent Hunyuan3D-1.0: A Unified Framework for Text-to-3D and + Image-to-3D Generation + + +
+ While 3D generative models have greatly improved artists' workflows, the
+existing diffusion models for 3D generation suffer from slow generation and
+poor generalization. To address this issue, we propose a two-stage approach
+named Hunyuan3D-1.0 including a lite version and a standard version, that both
+support text- and image-conditioned generation. In the first stage, we employ a
+multi-view diffusion model that efficiently generates multi-view RGB in
+approximately 4 seconds. These multi-view images capture rich details of the 3D
+asset from different viewpoints, relaxing the tasks from single-view to
+multi-view reconstruction. In the second stage, we introduce a feed-forward
+reconstruction model that rapidly and faithfully reconstructs the 3D asset
+given the generated multi-view images in approximately 7 seconds. The
+reconstruction network learns to handle noise and inconsistency introduced by
+the multi-view diffusion and leverages the available information from the
+condition image to efficiently recover the 3D structure. Our framework involves
+the text-to-image model, i.e., Hunyuan-DiT, making it a unified framework to
+support both text- and image-conditioned 3D generation. Our standard version
+has 3x more parameters than our lite and other existing models. Our
+Hunyuan3D-1.0 achieves an impressive balance between speed and quality,
+significantly reducing generation time while maintaining the quality and
+diversity of the produced assets.
+
+
+ comment: Technical Report; 3D Generation +
+
+
+
+
+ + ♻ ☆ Context Matters: Leveraging Contextual Features for Time Series + Forecasting + + +
+ Time series forecasts are often influenced by exogenous contextual features +in addition to their corresponding history. For example, in financial settings, +it is hard to accurately predict a stock price without considering public +sentiments and policy decisions in the form of news articles, tweets, etc. +Though this is common knowledge, the current state-of-the-art (SOTA) +forecasting models fail to incorporate such contextual information, owing to +its heterogeneity and multimodal nature. To address this, we introduce +ContextFormer, a novel plug-and-play method to surgically integrate multimodal +contextual information into existing pre-trained forecasting models. +ContextFormer effectively distills forecast-specific information from rich +multimodal contexts, including categorical, continuous, time-varying, and even +textual information, to significantly enhance the performance of existing base +forecasters. ContextFormer outperforms SOTA forecasting models by up to 30% on +a range of real-world datasets spanning energy, traffic, environmental, and +financial domains. + +
+
+
+
+
+ + ♻ ☆ Developing a Thailand solar irradiance map using Himawari-8 satellite + imageries and deep learning models + + +
+ This paper presents an online platform showing Thailand solar irradiance map +every 30 minutes, available at https://www.cusolarforecast.com. The methodology +for estimating global horizontal irradiance (GHI) across Thailand relies on +cloud index extracted from Himawari-8 satellite imagery, Ineichen clear-sky +model with locally-tuned Linke turbidity, and machine learning models. The +methods take clear-sky irradiance, cloud index, re-analyzed GHI and temperature +data from the MERRA-2 database, and date-time as inputs for GHI estimation +models, including LightGBM, LSTM, Informer, and Transformer. These are +benchmarked with the estimate from a commercial service X by evaluation of +15-minute ground GHI data from 53 ground stations over 1.5 years during +2022-2023. The results show that the four models exhibit comparable overall MAE +performance to the service X. The best model is LightGBM with an overall MAE of +78.58 W/sqm and RMSE of 118.97 W/sqm, while the service X achieves the lowest +MAE, RMSE, and MBE in cloudy condition. Obtaining re-analyzed MERRA-2 data for +the whole Thailand region is not economically feasible for deployment. When +removing these features, the Informer model has a winning performance in MAE of +78.67 W/sqm. The obtained performance aligns with existing literature by taking +the climate zone and time granularity of data into consideration. As the map +shows an estimate of GHI over 93,000 grids with a frequent update, the paper +also describes a computational framework for displaying the entire map. It +tests the runtime performance of deep learning models in the GHI estimation +process. + +
+
+ comment: 23 pages, 14 figures +
+
+
+
+
+ + ♻ ☆ HoPE: A Novel Positional Encoding Without Long-Term Decay for Enhanced + Context Awareness and Extrapolation + + +
+ Many positional encodings (PEs) are designed to exhibit long-term decay,
+based on an entrenched and long-standing inductive opinion: tokens farther away
+from the current position carry less relevant information. We argue that
+long-term decay is outdated in the era of LLMs, as LLMs are now applied to
+tasks demanding precise retrieval of in-context information from arbitrary
+positions. Firstly, we present empirical analyses on various PEs, demonstrating
+that models inherently learn attention with only a local-decay pattern while
+forming a U-shape pattern globally, contradicting the principle of long-term
+decay. Furthermore, we conduct a detailed analysis of rotary position encoding
+(RoPE, a prevalent relative positional encoding in LLMs), and found that the
+U-shape attention is caused by some learned components, which are also the key
+factor limiting RoPE's expressiveness and extrapolation. Inspired by these
+insights, we propose High-frequency rotary Position Encoding (HoPE). HoPE
+replaces the specific components in RoPE with position-independent ones,
+retaining only high-frequency signals, which also breaks the principle of
+long-term decay in theory. HoPE achieves two major advantages: (1) Without
+constraints imposed by long-term decay, contradictory factors that limit
+spontaneous attention optimization and model extrapolation performance are
+removed. (2) Components representing positions and semantics are optimized.
+These enhance the model's context awareness and extrapolation, as validated by
+extensive experiments.
+
+
+
+
+
+ + ♻ ☆ ToolEyes: Fine-Grained Evaluation for Tool Learning Capabilities of + Large Language Models in Real-world Scenarios COLING 2025 + + +
+ Existing evaluations of tool learning primarily focus on validating the +alignment of selected tools for large language models (LLMs) with expected +outcomes. However, these approaches rely on a limited set of scenarios where +answers can be pre-determined, diverging from genuine needs. Furthermore, a +sole emphasis on outcomes disregards the complex capabilities required for LLMs +to effectively use tools. To tackle this issue, we propose ToolEyes, a +fine-grained system tailored for the evaluation of the LLMs' tool learning +capabilities in authentic scenarios. The system meticulously examines seven +real-world scenarios, analyzing five dimensions crucial to LLMs in tool +learning: format alignment, intent comprehension, behavior planning, tool +selection, and answer organization. Additionally, ToolEyes incorporates a tool +library boasting approximately 600 tools, serving as an intermediary between +LLMs and the physical world. Evaluations involving ten LLMs across three +categories reveal a preference for specific scenarios and limited cognitive +abilities in tool learning. Intriguingly, expanding the model size even +exacerbates the hindrance to tool learning. The code and data are available at +https://github.com/Junjie-Ye/ToolEyes. + +
+
+ comment: Accepted by COLING 2025 conference +
+
+
+
+
+ + ♻ ☆ LuxEmbedder: A Cross-Lingual Approach to Enhanced Luxembourgish Sentence + Embeddings COLING 2025 + + +
+ Sentence embedding models play a key role in various Natural Language +Processing tasks, such as in Topic Modeling, Document Clustering and +Recommendation Systems. However, these models rely heavily on parallel data, +which can be scarce for many low-resource languages, including Luxembourgish. +This scarcity results in suboptimal performance of monolingual and +cross-lingual sentence embedding models for these languages. To address this +issue, we compile a relatively small but high-quality human-generated +cross-lingual parallel dataset to train LuxEmbedder, an enhanced sentence +embedding model for Luxembourgish with strong cross-lingual capabilities. +Additionally, we present evidence suggesting that including low-resource +languages in parallel training datasets can be more advantageous for other +low-resource languages than relying solely on high-resource language pairs. +Furthermore, recognizing the lack of sentence embedding benchmarks for +low-resource languages, we create a paraphrase detection benchmark specifically +for Luxembourgish, aiming to partially fill this gap and promote further +research. + +
+
+ comment: Accepted at COLING 2025 +
+
+
+
+
+ + ♻ ☆ OMEGA: Efficient Occlusion-Aware Navigation for Air-Ground Robot in + Dynamic Environments via State Space Model + + +
+ Air-ground robots (AGRs) are widely used in surveillance and disaster +response due to their exceptional mobility and versatility (i.e., flying and +driving). Current AGR navigation systems perform well in static occlusion-prone +environments (e.g., indoors) by using 3D semantic occupancy networks to predict +occlusions for complete local mapping and then computing Euclidean Signed +Distance Field (ESDF) for path planning. However, these systems face challenges +in dynamic, severe occlusion scenes (e.g., crowds) due to limitations in +perception networks' low prediction accuracy and path planners' high +computation overhead. In this paper, we propose OMEGA, which contains OccMamba +with an Efficient AGR-Planner to address the above-mentioned problems. OccMamba +adopts a novel architecture that separates semantic and occupancy prediction +into independent branches, incorporating two mamba blocks within these +branches. These blocks efficiently extract semantic and geometric features in +3D environments with linear complexity, ensuring that the network can learn +long-distance dependencies to improve prediction accuracy. Semantic and +geometric features are combined within the Bird's Eye View (BEV) space to +minimise computational overhead during feature fusion. The resulting semantic +occupancy map is then seamlessly integrated into the local map, providing +occlusion awareness of the dynamic environment. Our AGR-Planner utilizes this +local map and employs kinodynamic A* search and gradient-based trajectory +optimization to guarantee planning is ESDF-free and energy-efficient. Extensive +experiments demonstrate that OccMamba outperforms the state-of-the-art 3D +semantic occupancy network with 25.0% mIoU. End-to-end navigation experiments +in dynamic scenes verify OMEGA's efficiency, achieving a 96% average planning +success rate. Code and video are available at +https://jmwang0117.github.io/OMEGA/. + +
+
+ comment: Accepted to IEEE RA-L | OccMamba is here! +
+
+
+
+
+ + ♻ ☆ Concept Based Continuous Prompts for Interpretable Text Classification + + +
+ Continuous prompts have become widely adopted for augmenting performance +across a wide range of natural language tasks. However, the underlying +mechanism of this enhancement remains obscure. Previous studies rely on +individual words for interpreting continuous prompts, which lacks comprehensive +semantic understanding. Drawing inspiration from Concept Bottleneck Models, we +propose a framework for interpreting continuous prompts by decomposing them +into human-readable concepts. Specifically, to ensure the feasibility of the +decomposition, we demonstrate that a corresponding concept embedding matrix and +a coefficient matrix can always be found to replace the prompt embedding +matrix. Then, we employ GPT-4o to generate a concept pool and choose potential +candidate concepts that are discriminative and representative using a novel +submodular optimization algorithm. Experiments demonstrate that our framework +can achieve similar results as the original P-tuning and word-based approaches +using only a few concepts while providing more plausible results. Our code is +available at https://github.com/qq31415926/CD. + +
+
+
+
+
+ + ♻ ☆ Diffusion of Thoughts: Chain-of-Thought Reasoning in Diffusion Language + Models NeurIPS 2024 + + +
+ Recently, diffusion models have garnered significant interest in the field of +text processing due to their many potential advantages compared to conventional +autoregressive models. In this work, we propose Diffusion-of-Thought (DoT), a +novel approach that integrates diffusion models with Chain-of-Thought, a +well-established technique for improving the reasoning ability of +autoregressive language models. In contrast to autoregressive language models +that make decisions in a left-to-right, token-by-token manner, DoT allows +reasoning steps to diffuse over time through a diffusion language model and +offers greater flexibility in trading-off computation for reasoning +performance. Our experimental results demonstrate the effectiveness of DoT in +multi-digit multiplication, boolean logic, and grade school math problems, with +a small diffusion model outperforming a much larger autoregressive model in +both efficiency and accuracy. In addition to that, DoT showcases promising +self-correction abilities and benefits from existing reasoning-enhancing +techniques like self-consistency decoding. Our findings contribute to the +understanding and development of reasoning with diffusion language models. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Techniques for Measuring the Inferential Strength of Forgetting Policies + + +
+ The technique of forgetting in knowledge representation has been shown to be +a powerful and useful knowledge engineering tool with widespread application. +Yet, very little research has been done on how different policies of +forgetting, or use of different forgetting operators, affects the inferential +strength of the original theory. The goal of this paper is to define loss +functions for measuring changes in inferential strength based on intuitions +from model counting and probability theory. Properties of such loss measures +are studied and a pragmatic knowledge engineering tool is proposed for +computing loss measures using ProbLog. The paper includes a working methodology +for studying and determining the strength of different forgetting policies, in +addition to concrete examples showing how to apply the theoretical results +using ProbLog. Although the focus is on forgetting, the results are much more +general and should have wider application to other areas. + +
+
+
+
+
+ + ♻ ☆ Objective Features Extracted from Motor Activity Time Series for Food + Addiction Analysis Using Machine Learning + + +
+ This study investigates machine learning algorithms to identify objective +features for diagnosing food addiction (FA) and assessing confirmed symptoms +(SC). Data were collected from 81 participants (mean age: 21.5 years, range: +18-61 years, women: 77.8%) whose FA and SC were measured using the Yale Food +Addiction Scale (YFAS). Participants provided demographic and anthropometric +data, completed the YFAS, the Zung Self-Rating Depression Scale, and the Dutch +Eating Behavior Questionnaire, and wore an actimeter on the non-dominant wrist +for a week to record motor activity. Analysis of the actimetric data identified +significant statistical and entropy-based features that accurately predicted FA +and SC using ML. The Matthews correlation coefficient (MCC) was the primary +metric. Activity-related features were more effective for FA prediction +(MCC=0.88) than rest-related features (MCC=0.68). For SC, activity segments +yielded MCC=0.47, rest segments MCC=0.38, and their combination MCC=0.51. +Significant correlations were also found between actimetric features related to +FA, emotional, and restrained eating behaviors, supporting the model's +validity. Our results support the concept of a human bionic suite composed of +IoT devices and ML sensors, which implements health digital assistance with +real-time monitoring and analysis of physiological indicators related to FA and +SC. + +
+
+ comment: 16 pages, 3 figures, 14 tables +
+
+
+
+
+ + ♻ ☆ Embed-Search-Align: DNA Sequence Alignment using Transformer Models + + +
+ DNA sequence alignment involves assigning short DNA reads to the most +probable locations on an extensive reference genome. This process is crucial +for various genomic analyses, including variant calling, transcriptomics, and +epigenomics. Conventional methods, refined over decades, tackle this challenge +in 2 steps: genome indexing followed by efficient search to locate likely +positions for given reads. Building on the success of Large Language Models in +encoding text into embeddings, where the distance metric captures semantic +similarity, recent efforts have explored whether the same Transformer +architecture can produce embeddings for DNA sequences. Such models have shown +early promise in classifying short DNA sequences, such as detecting +coding/non-coding regions, and enhancer, promoter sequences. However, +performance at sequence classification tasks does not translate to sequence +alignment, where it is necessary to search across the genome to align each +read, a significantly longer-range task. We bridge this gap by framing the +Sequence Alignment task for Transformer models as an "Embed-Search-Align" task. +In this framework, a novel Reference-Free DNA Embedding model generates +embeddings of reads and reference fragments, which are projected into a shared +vector space where the read-fragment distance is used as a surrogate for +alignment. Technical contributions include: (1) Contrastive loss for +self-supervised training of DNA sequence representations, facilitating rich +reference-free, sequence-level embeddings, and (2) a DNA vector store to enable +search across fragments on a global scale. DNA-ESA is 99% accurate when +aligning 250-length reads onto a human genome (3gb), rivaling conventional +methods such as Bowtie and BWA-Mem. DNA-ESA exceeds the performance of 6 +Transformer model baselines such as Nucleotide Transformer, Hyena-DNA, and +shows task transfer across chromosomes and species. + +
+
+ comment: 12 pages, Tables 7, Figures 6 +
+
+
+
+
+ + ♻ ☆ Train a Real-world Local Path Planner in One Hour via Partially + Decoupled Reinforcement Learning and Vectorized Diversity + + +
+ Deep Reinforcement Learning (DRL) has exhibited efficacy in resolving the +Local Path Planning (LPP) problem. However, such application in the real world +is immensely limited due to the deficient training efficiency and +generalization capability of DRL. To alleviate these two issues, a solution +named Color is proposed, which consists of an Actor-Sharer-Learner (ASL) +training framework and a mobile robot-oriented simulator Sparrow. Specifically, +the ASL intends to improve the training efficiency of DRL algorithms. It +employs a Vectorized Data Collection (VDC) mode to expedite data acquisition, +decouples the data collection from model optimization by multithreading, and +partially connects the two procedures by harnessing a Time Feedback Mechanism +(TFM) to evade data underuse or overuse. Meanwhile, the Sparrow simulator +utilizes a 2D grid-based world, simplified kinematics, and conversion-free data +flow to achieve a lightweight design. The lightness facilitates vectorized +diversity, allowing diversified simulation setups across extensive copies of +the vectorized environments, resulting in a notable enhancement in the +generalization capability of the DRL algorithm being trained. Comprehensive +experiments, comprising 57 DRL benchmark environments, 32 simulated and 36 +real-world LPP scenarios, have been conducted to corroborate the superiority of +our method in terms of efficiency and generalization. The code and the video of +this paper are accessible at https://github.com/XinJingHao/Color. + +
+
+ comment: 36 pages +
+
+
+
+
+ + ♻ ☆ HERO: Hint-Based Efficient and Reliable Query Optimizer VLDB 2025 + + +
+ We propose a novel model for learned query optimization which provides query +hints leading to better execution plans. The model addresses the three key +challenges in learned hint-based query optimization: reliable hint +recommendation (ensuring non-degradation of query latency), efficient hint +exploration, and fast inference. We provide an in-depth analysis of existing +NN-based approaches to hint-based optimization and experimentally confirm the +named challenges for them. Our alternative solution consists of a new inference +schema based on an ensemble of context-aware models and a graph storage for +reliable hint suggestion and fast inference, and a budget-controlled training +procedure with a local search algorithm that solves the issue of exponential +search space exploration. In experiments on standard benchmarks, our model +demonstrates optimization capability close to the best achievable with +coarse-grained hints. Controlling the degree of parallelism (query dop) in +addition to operator-related hints enables our model to achieve 3x latency +improvement on JOB benchmark which sets a new standard for optimization. Our +model is interpretable and easy to debug, which is particularly important for +deployment in production. + +
+
+ comment: Submitted to VLDB 2025; 13 pages; 13 figures +
+
+
+
+
+ + ♻ ☆ Knowledge Transfer based Evolutionary Deep Neural Network for + Intelligent Fault Diagnosis + + +
+ A fault diagnosis with commendable accuracy is essential for the reliability +of industrial machines. Two main challenges affect the design of +high-performing intelligent systems: (i) the selection of a suitable model and +(ii) domain adaptation if there is a continuous change in operating conditions. +Therefore, we propose an evolutionary Net2Net transformation (EvoN2N) that +finds the best suitable DNN architecture with limited availability of labeled +data samples. Net2Net transformation-based quick learning algorithm has been +used in the evolutionary framework of Non-dominated sorting genetic algorithm +II to obtain the best DNN architecture. Net2Net transformation-based quick +learning algorithm uses the concept of knowledge transfer from one generation +to the next for faster fitness evaluation. The proposed framework can obtain +the best model for intelligent fault diagnosis without a long and +time-consuming search process. The proposed framework has been validated on the +Case Western Reserve University dataset, the Paderborn University dataset, and +the gearbox fault detection dataset under different operating conditions. The +best models obtained are capable of demonstrating an excellent diagnostic +performance and classification accuracy of almost up to 100\% for most of the +operating conditions. + +
+
+
+
+
+ + ♻ ☆ Nl2Hltl2Plan: Scaling Up Natural Language Understanding for Multi-Robots + Through Hierarchical Temporal Logic Task Representation + + +
+ To enable non-experts to specify long-horizon, multi-robot collaborative +tasks, language models are increasingly used to translate natural language +commands into formal specifications. However, because translation can occur in +multiple ways, such translations may lack accuracy or lead to inefficient +multi-robot planning. Our key insight is that concise hierarchical +specifications can simplify planning while remaining straightforward to derive +from human instructions. We propose Nl2Hltl2Plan, a framework that translates +natural language commands into hierarchical Linear Temporal Logic (LTL) and +solves the corresponding planning problem. The translation involves two steps +leveraging Large Language Models (LLMs). First, an LLM transforms instructions +into a Hierarchical Task Tree, capturing logical and temporal relations. Next, +a fine-tuned LLM converts sub-tasks into flat LTL formulas, which are +aggregated into hierarchical specifications, with the lowest level +corresponding to ordered robot actions. These specifications are then used with +off-the-shelf planners. Our Nl2Hltl2Plan demonstrates the potential of LLMs in +hierarchical reasoning for multi-robot task planning. Evaluations in simulation +and real-world experiments with human participants show that Nl2Hltl2Plan +outperforms existing methods, handling more complex instructions while +achieving higher success rates and lower costs in task allocation and planning. +Additional details are available at https://nl2hltl2plan.github.io . + +
+
+
+
+
+ + ♻ ☆ Fourier Boundary Features Network with Wider Catchers for Glass + Segmentation + + +
+ Glass largely blurs the boundary between the real world and the reflection. +The special transmittance and reflectance quality have confused the semantic +tasks related to machine vision. Therefore, how to clear the boundary built by +glass, and avoid over-capturing features as false positive information in deep +structure, matters for constraining the segmentation of reflection surface and +penetrating glass. We proposed the Fourier Boundary Features Network with Wider +Catchers (FBWC), which might be the first attempt to utilize sufficiently wide +horizontal shallow branches without vertical deepening for guiding the fine +granularity segmentation boundary through primary glass semantic information. +Specifically, we designed the Wider Coarse-Catchers (WCC) for anchoring large +area segmentation and reducing excessive extraction from a structural +perspective. We embed fine-grained features by Cross Transpose Attention (CTA), +which is introduced to avoid the incomplete area within the boundary caused by +reflection noise. For excavating glass features and balancing high-low layers +context, a learnable Fourier Convolution Controller (FCC) is proposed to +regulate information integration robustly. The proposed method has been +validated on three different public glass segmentation datasets. Experimental +results reveal that the proposed method yields better segmentation performance +compared with the state-of-the-art (SOTA) methods in glass image segmentation. + +
+
+
+
+
+ + ♻ ☆ RILQ: Rank-Insensitive LoRA-based Quantization Error Compensation for + Boosting 2-bit Large Language Model Accuracy + + +
+ Low-rank adaptation (LoRA) has become the dominant method for +parameter-efficient LLM fine-tuning, with LoRA-based quantization error +compensation (LQEC) emerging as a powerful tool for recovering accuracy in +compressed LLMs. However, LQEC has underperformed in sub-4-bit scenarios, with +no prior investigation into understanding this limitation. We propose RILQ +(Rank-Insensitive LoRA-based Quantization Error Compensation) to understand +fundamental limitation and boost 2-bit LLM accuracy. Based on rank analysis +revealing model-wise activation discrepancy loss's rank-insensitive nature, +RILQ employs this loss to adjust adapters cooperatively across layers, enabling +robust error compensation with low-rank adapters. Evaluations on LLaMA-2 and +LLaMA-3 demonstrate RILQ's consistent improvements in 2-bit quantized inference +across various state-of-the-art quantizers and enhanced accuracy in +task-specific fine-tuning. RILQ maintains computational efficiency comparable +to existing LoRA methods, enabling adapter-merged weight-quantized LLM +inference with significantly enhanced accuracy, making it a promising approach +for boosting 2-bit LLM performance. + +
+
+ comment: The typo in Table 4 has been corrected +
+
+
+
+
+ + ♻ ☆ Combining Stochastic Defenses to Resist Gradient Inversion: An Ablation + Study + + +
+ Gradient Inversion (GI) attacks are a ubiquitous threat in Federated Learning +(FL) as they exploit gradient leakage to reconstruct supposedly private +training data. Common defense mechanisms such as Differential Privacy (DP) or +stochastic Privacy Modules (PMs) introduce randomness during gradient +computation to prevent such attacks. However, we pose that if an attacker +effectively mimics a client's stochastic gradient computation, the attacker can +circumvent the defense and reconstruct clients' private training data. This +paper introduces several targeted GI attacks that leverage this principle to +bypass common defense mechanisms. As a result, we demonstrate that no +individual defense provides sufficient privacy protection. To address this +issue, we propose to combine multiple defenses. We conduct an extensive +ablation study to evaluate the influence of various combinations of defenses on +privacy protection and model utility. We observe that only the combination of +DP and a stochastic PM was sufficient to decrease the Attack Success Rate (ASR) +from 100% to 0%, thus preserving privacy. Moreover, we found that this +combination of defenses consistently achieves the best trade-off between +privacy and model utility. + +
+
+ comment: This version represents a comprehensive rework of the initial study, + including substantial updates to the methodology, analysis, and conclusions. + 26 pages, 2 figures, 5 tables +
+
+
+
+
+
+
+
+ + Genomics 3 + +
+
+
+ + ♻ ☆ GV-Rep: A Large-Scale Dataset for Genetic Variant Representation + Learning + + +
+ Genetic variants (GVs) are defined as differences in the DNA sequences among
+individuals and play a crucial role in diagnosing and treating genetic
+diseases. The rapid decrease in next generation sequencing cost has led to an
+exponential increase in patient-level GV data. This growth poses a challenge
+for clinicians who must efficiently prioritize patient-specific GVs and
+integrate them with existing genomic databases to inform patient management. To
+address the interpretation of GVs, genomic foundation models (GFMs) have
+emerged. However, these models lack standardized performance assessments,
+leading to considerable variability in model evaluations. This poses the
+question: How effectively do deep learning methods classify unknown GVs and
+align them with clinically-verified GVs? We argue that representation learning,
+which transforms raw data into meaningful feature spaces, is an effective
+approach for addressing both indexing and classification challenges. We
+introduce a large-scale Genetic Variant dataset, named GV-Rep, featuring
+variable-length contexts and detailed annotations, designed for deep learning
+models to learn GV representations across various traits, diseases, tissue
+types, and experimental contexts. Our contributions are three-fold: (i)
+Construction of a comprehensive dataset with 7 million records, each labeled
+with characteristics of the corresponding variants, alongside additional data
+from 17,548 gene knockout tests across 1,107 cell types, 1,808 variant
+combinations, and 156 unique clinically verified GVs from real-world patients.
+(ii) Analysis of the structure and properties of the dataset. (iii)
+Experimentation of the dataset with pre-trained GFMs. The results show a
+significant gap between GFMs' current capabilities and accurate GV
+representation. We hope this dataset will help advance genomic deep learning to
+bridge this gap.
+
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ iSEEtree: interactive explorer for hierarchical data + + +
+ $\textbf{Motivation:}$ Hierarchical data structures are prevalent across +several fields of research, as they represent an organised and efficient +approach to study complex interconnected systems. Their significance is +particularly evident in microbiome analysis, where microbial communities are +classified at various taxonomic levels along the phylogenetic tree. In light of +this trend, the R/Bioconductor community has established a reproducible +analytical framework for hierarchical data, which relies on the highly generic +and optimised TreeSummarizedExperiment data container. However, using this +framework requires basic proficiency in programming. + $\textbf{Results:}$ To reduce the entry requirements, we developed iSEEtree, +an R shiny app which provides a visual interface for the analysis and +exploration of TreeSummarizedExperiment objects, thereby expanding the +interactive graphics capabilities of related work to hierarchical structures. +This way, users can interactively explore several aspects of their data without +the need for extensive knowledge of R programming. We describe how iSEEtree +enables the exploration of hierarchical multi-table data and demonstrate its +functionality with applications to microbiome analysis. + $\textbf{Availability and Implementation:}$ iSEEtree was implemented in the R +programming language and is available on Bioconductor at +https://bioconductor.org/packages/iSEEtree under an Artistic 2.0 license. + $\textbf{Contact:}$ giulio.benedetti@utu.fi or leo.lahti@utu.fi. + +
+
+ comment: 4 pages, 1 figure +
+
+
+
+
+ + ♻ ☆ Embed-Search-Align: DNA Sequence Alignment using Transformer Models + + +
+ DNA sequence alignment involves assigning short DNA reads to the most +probable locations on an extensive reference genome. This process is crucial +for various genomic analyses, including variant calling, transcriptomics, and +epigenomics. Conventional methods, refined over decades, tackle this challenge +in 2 steps: genome indexing followed by efficient search to locate likely +positions for given reads. Building on the success of Large Language Models in +encoding text into embeddings, where the distance metric captures semantic +similarity, recent efforts have explored whether the same Transformer +architecture can produce embeddings for DNA sequences. Such models have shown +early promise in classifying short DNA sequences, such as detecting +coding/non-coding regions, and enhancer, promoter sequences. However, +performance at sequence classification tasks does not translate to sequence +alignment, where it is necessary to search across the genome to align each +read, a significantly longer-range task. We bridge this gap by framing the +Sequence Alignment task for Transformer models as an "Embed-Search-Align" task. +In this framework, a novel Reference-Free DNA Embedding model generates +embeddings of reads and reference fragments, which are projected into a shared +vector space where the read-fragment distance is used as a surrogate for +alignment. Technical contributions include: (1) Contrastive loss for +self-supervised training of DNA sequence representations, facilitating rich +reference-free, sequence-level embeddings, and (2) a DNA vector store to enable +search across fragments on a global scale. DNA-ESA is 99% accurate when +aligning 250-length reads onto a human genome (3gb), rivaling conventional +methods such as Bowtie and BWA-Mem. DNA-ESA exceeds the performance of 6 +Transformer model baselines such as Nucleotide Transformer, Hyena-DNA, and +shows task transfer across chromosomes and species. + +
+
+ comment: 12 pages, Tables 7, Figures 6 +
+
+
+
+
+
+
+
+ + Machine Learning 154 + +
+
+
+ + ☆ VisionZip: Longer is Better but Not Necessary in Vision Language Models + + +
+ Recent advancements in vision-language models have enhanced performance by +increasing the length of visual tokens, making them much longer than text +tokens and significantly raising computational costs. However, we observe that +the visual tokens generated by popular vision encoders, such as CLIP and +SigLIP, contain significant redundancy. To address this, we introduce +VisionZip, a simple yet effective method that selects a set of informative +tokens for input to the language model, reducing visual token redundancy and +improving efficiency while maintaining model performance. The proposed +VisionZip can be widely applied to image and video understanding tasks and is +well-suited for multi-turn dialogues in real-world scenarios, where previous +methods tend to underperform. Experimental results show that VisionZip +outperforms the previous state-of-the-art method by at least 5% performance +gains across nearly all settings. Moreover, our method significantly enhances +model inference speed, improving the prefilling time by 8x and enabling the +LLaVA-Next 13B model to infer faster than the LLaVA-Next 7B model while +achieving better results. Furthermore, we analyze the causes of this redundancy +and encourage the community to focus on extracting better visual features +rather than merely increasing token length. Our code is available at +https://github.com/dvlab-research/VisionZip . + +
+
+ comment: 2 columns, 28 pages, 15 figures, 18 tables +
+
+
+
+
+ + ☆ Code-as-Monitor: Constraint-aware Visual Programming for Reactive and + Proactive Robotic Failure Detection + + +
+ Automatic detection and prevention of open-set failures are crucial in +closed-loop robotic systems. Recent studies often struggle to simultaneously +identify unexpected failures reactively after they occur and prevent +foreseeable ones proactively. To this end, we propose Code-as-Monitor (CaM), a +novel paradigm leveraging the vision-language model (VLM) for both open-set +reactive and proactive failure detection. The core of our method is to +formulate both tasks as a unified set of spatio-temporal constraint +satisfaction problems and use VLM-generated code to evaluate them for real-time +monitoring. To enhance the accuracy and efficiency of monitoring, we further +introduce constraint elements that abstract constraint-related entities or +their parts into compact geometric elements. This approach offers greater +generality, simplifies tracking, and facilitates constraint-aware visual +programming by leveraging these elements as visual prompts. Experiments show +that CaM achieves a 28.7% higher success rate and reduces execution time by +31.8% under severe disturbances compared to baselines across three simulators +and a real-world setting. Moreover, CaM can be integrated with open-loop +control policies to form closed-loop systems, enabling long-horizon tasks in +cluttered scenes with dynamic environments. + +
+
+ comment: Project page: https://zhoues.github.io/Code-as-Monitor/ +
+
+
+
+
+ + ☆ Moto: Latent Motion Token as the Bridging Language for Robot + Manipulation + + +
+ Recent developments in Large Language Models pre-trained on extensive corpora +have shown significant success in various natural language processing tasks +with minimal fine-tuning. This success offers new promise for robotics, which +has long been constrained by the high cost of action-labeled data. We ask: +given the abundant video data containing interaction-related knowledge +available as a rich "corpus", can a similar generative pre-training approach be +effectively applied to enhance robot learning? The key challenge is to identify +an effective representation for autoregressive pre-training that benefits robot +manipulation tasks. Inspired by the way humans learn new skills through +observing dynamic environments, we propose that effective robotic learning +should emphasize motion-related knowledge, which is closely tied to low-level +actions and is hardware-agnostic, facilitating the transfer of learned motions +to actual robot actions. To this end, we introduce Moto, which converts video +content into latent Motion Token sequences by a Latent Motion Tokenizer, +learning a bridging "language" of motion from videos in an unsupervised manner. +We pre-train Moto-GPT through motion token autoregression, enabling it to +capture diverse visual motion knowledge. After pre-training, Moto-GPT +demonstrates the promising ability to produce semantically interpretable motion +tokens, predict plausible motion trajectories, and assess trajectory +rationality through output likelihood. To transfer learned motion priors to +real robot actions, we implement a co-fine-tuning strategy that seamlessly +bridges latent motion token prediction and real robot control. Extensive +experiments show that the fine-tuned Moto-GPT exhibits superior robustness and +efficiency on robot manipulation benchmarks, underscoring its effectiveness in +transferring knowledge from video data to downstream visual manipulation tasks. + +
+
+ comment: Project released at: https://chenyi99.github.io/moto/ +
+
+
+
+
+ + ☆ Grounding Descriptions in Images informs Zero-Shot Visual Recognition + + +
+ Vision-language models (VLMs) like CLIP have been cherished for their ability +to perform zero-shot visual recognition on open-vocabulary concepts. This is +achieved by selecting the object category whose textual representation bears +the highest similarity with the query image. While successful in some domains, +this method struggles with identifying fine-grained entities as well as +generalizing to unseen concepts that are not captured by the training +distribution. Recent works attempt to mitigate these challenges by integrating +category descriptions at test time, albeit yielding modest improvements. We +attribute these limited gains to a fundamental misalignment between image and +description representations, which is rooted in the pretraining structure of +CLIP. In this paper, we propose GRAIN, a new pretraining strategy aimed at +aligning representations at both fine and coarse levels simultaneously. Our +approach learns to jointly ground textual descriptions in image regions along +with aligning overarching captions with global image representations. To drive +this pre-training, we leverage frozen Multimodal Large Language Models (MLLMs) +to derive large-scale synthetic annotations. We demonstrate the enhanced +zero-shot performance of our model compared to current state-of-the art methods +across 11 diverse image classification datasets. Additionally, we introduce +Products-2023, a newly curated, manually labeled dataset featuring novel +concepts, and showcase our model's ability to recognize these concepts by +benchmarking on it. Significant improvements achieved by our model on other +downstream tasks like retrieval further highlight the superior quality of +representations learned by our approach. Code available at +https://github.com/shaunak27/grain-clip . + +
+
+
+
+
+ + ☆ Marvel: Accelerating Safe Online Reinforcement Learning with Finetuned + Offline Policy + + +
+ The high costs and risks involved in extensive environment interactions
+hinder the practical application of current online safe reinforcement learning
+(RL) methods. While offline safe RL addresses this by learning policies from
+static datasets, the performance therein is usually limited due to reliance on
+data quality and challenges with out-of-distribution (OOD) actions. Inspired by
+recent successes in offline-to-online (O2O) RL, it is crucial to explore
+whether offline safe RL can be leveraged to facilitate faster and safer online
+policy learning, a direction that has yet to be fully investigated. To fill
+this gap, we first demonstrate that naively applying existing O2O algorithms
+from standard RL would not work well in the safe RL setting due to two unique
+challenges: \emph{erroneous Q-estimations}, resulting from offline-online
+objective mismatch and offline cost sparsity, and \emph{Lagrangian mismatch},
+resulting from difficulties in aligning Lagrange multipliers between offline and
+online policies. To address these challenges, we introduce \textbf{Marvel}, a
+novel framework for O2O safe RL, comprising two key components that work in
+concert: \emph{Value Pre-Alignment} to align the Q-functions with the
+underlying truth before online learning, and \emph{Adaptive PID Control} to
+effectively adjust the Lagrange multipliers during online finetuning. Extensive
+experiments demonstrate that Marvel significantly outperforms existing
+baselines in both reward maximization and safety constraint satisfaction. By
+introducing the first policy-finetuning based framework for O2O safe RL, which
+is compatible with many offline and online safe RL methods, our work has the
+great potential to advance the field towards more efficient and practical safe
+RL solutions.
+
+
+
+
+
+
+ + ☆ CA-SSLR: Condition-Aware Self-Supervised Learning Representation for + Generalized Speech Processing NeurIPS + 2024 + + +
+ We introduce Condition-Aware Self-Supervised Learning Representation +(CA-SSLR), a generalist conditioning model broadly applicable to various +speech-processing tasks. Compared to standard fine-tuning methods that optimize +for downstream models, CA-SSLR integrates language and speaker embeddings from +earlier layers, making the SSL model aware of the current language and speaker +context. This approach reduces the reliance on input audio features while +preserving the integrity of the base SSLR. CA-SSLR improves the model's +capabilities and demonstrates its generality on unseen tasks with minimal +task-specific tuning. Our method employs linear modulation to dynamically +adjust internal representations, enabling fine-grained adaptability without +significantly altering the original model behavior. Experiments show that +CA-SSLR reduces the number of trainable parameters, mitigates overfitting, and +excels in under-resourced and unseen tasks. Specifically, CA-SSLR achieves a +10% relative reduction in LID errors, a 37% improvement in ASR CER on the +ML-SUPERB benchmark, and a 27% decrease in SV EER on VoxCeleb-1, demonstrating +its effectiveness. + +
+
+ comment: 38th Conference on Neural Information Processing Systems (NeurIPS + 2024) +
+
+
+
+
+ + ☆ FedDUAL: A Dual-Strategy with Adaptive Loss and Dynamic Aggregation for + Mitigating Data Heterogeneity in Federated Learning + + +
+ Federated Learning (FL) marks a transformative approach to distributed model +training by combining locally optimized models from various clients into a +unified global model. While FL preserves data privacy by eliminating +centralized storage, it encounters significant challenges such as performance +degradation, slower convergence, and reduced robustness of the global model due +to the heterogeneity in client data distributions. Among the various forms of +data heterogeneity, label skew emerges as a particularly formidable and +prevalent issue, especially in domains such as image classification. To address +these challenges, we begin with comprehensive experiments to pinpoint the +underlying issues in the FL training process. Based on our findings, we then +introduce an innovative dual-strategy approach designed to effectively resolve +these issues. First, we introduce an adaptive loss function for client-side +training, meticulously crafted to preserve previously acquired knowledge while +maintaining an optimal equilibrium between local optimization and global model +coherence. Secondly, we develop a dynamic aggregation strategy for aggregating +client models at the server. This approach adapts to each client's unique +learning patterns, effectively addressing the challenges of diverse data across +the network. Our comprehensive evaluation, conducted across three diverse +real-world datasets, coupled with theoretical convergence guarantees, +demonstrates the superior efficacy of our method compared to several +established state-of-the-art approaches. + +
+
+
+
+
+ + ☆ Efficient Task Grouping Through Samplewise Optimisation Landscape + Analysis + + +
+ Shared training approaches, such as multi-task learning (MTL) and +gradient-based meta-learning, are widely used in various machine learning +applications, but they often suffer from negative transfer, leading to +performance degradation in specific tasks. While several optimisation +techniques have been developed to mitigate this issue for pre-selected task +cohorts, identifying optimal task combinations for joint learning - known as +task grouping - remains underexplored and computationally challenging due to +the exponential growth in task combinations and the need for extensive training +and evaluation cycles. This paper introduces an efficient task grouping +framework designed to reduce these overwhelming computational demands of the +existing methods. The proposed framework infers pairwise task similarities +through a sample-wise optimisation landscape analysis, eliminating the need for +the shared model training required to infer task similarities in existing +methods. With task similarities acquired, a graph-based clustering algorithm is +employed to pinpoint near-optimal task groups, providing an approximate yet +efficient and effective solution to the originally NP-hard problem. Empirical +assessments conducted on 8 different datasets highlight the effectiveness of +the proposed framework, revealing a five-fold speed enhancement compared to +previous state-of-the-art methods. Moreover, the framework consistently +demonstrates comparable performance, confirming its remarkable efficiency and +effectiveness in task grouping. + +
+
+ comment: Under review at IEEE Transactions on Pattern Analysis and Machine + Intelligence +
+
+
+
+
+ + ☆ Stabilizing and Solving Inverse Problems using Data and Machine Learning + + +
+ We consider an inverse problem involving the reconstruction of the solution +to a nonlinear partial differential equation (PDE) with unknown boundary +conditions. Instead of direct boundary data, we are provided with a large +dataset of boundary observations for typical solutions (collective data) and a +bulk measurement of a specific realization. To leverage this collective data, +we first compress the boundary data using proper orthogonal decomposition (POD) +in a linear expansion. Next, we identify a possible nonlinear low-dimensional +structure in the expansion coefficients using an auto-encoder, which provides a +parametrization of the dataset in a lower-dimensional latent space. We then +train a neural network to map the latent variables representing the boundary +data to the solution of the PDE. Finally, we solve the inverse problem by +optimizing a data-fitting term over the latent space. + We analyze the underlying stabilized finite element method in the linear +setting and establish optimal error estimates in the $H^1$ and $L^2$-norms. The +nonlinear problem is then studied numerically, demonstrating the effectiveness +of our approach. + +
+
+
+
+
+ + ☆ Providing Differential Privacy for Federated Learning Over Wireless: A + Cross-layer Framework + + +
+ Federated Learning (FL) is a distributed machine learning framework that +inherently allows edge devices to maintain their local training data, thus +providing some level of privacy. However, FL's model updates still pose a risk +of privacy leakage, which must be mitigated. Over-the-air FL (OTA-FL) is an +adapted FL design for wireless edge networks that leverages the natural +superposition property of the wireless medium. We propose a wireless physical +layer (PHY) design for OTA-FL which improves differential privacy (DP) through +a decentralized, dynamic power control that utilizes both inherent Gaussian +noise in the wireless channel and a cooperative jammer (CJ) for additional +artificial noise generation when higher privacy levels are required. Although +primarily implemented within the Upcycled-FL framework, where a +resource-efficient method with first-order approximations is used at every even +iteration to decrease the required information from clients, our power control +strategy is applicable to any FL framework, including FedAvg and FedProx as +shown in the paper. This adaptation showcases the flexibility and effectiveness +of our design across different learning algorithms while maintaining a strong +emphasis on privacy. Our design removes the need for client-side artificial +noise injection for DP, utilizing a cooperative jammer to enhance privacy +without affecting transmission efficiency for higher privacy demands. Privacy +analysis is provided using the Moments Accountant method. We perform a +convergence analysis for non-convex objectives to tackle heterogeneous data +distributions, highlighting the inherent trade-offs between privacy and +accuracy. Numerical results show that our approach with various FL algorithms +outperforms the state-of-the-art under the same DP conditions on the non-i.i.d. +FEMNIST dataset, and highlight the cooperative jammer's effectiveness in +ensuring strict privacy. + +
+
+ comment: submitted for an IEEE publication +
+
+
+
+
+ + ☆ Federated Automated Feature Engineering + + +
+ Automated feature engineering (AutoFE) is used to automatically create new +features from original features to improve predictive performance without +needing significant human intervention and expertise. Many algorithms exist for +AutoFE, but very few approaches exist for the federated learning (FL) setting +where data is gathered across many clients and is not shared between clients or +a central server. We introduce AutoFE algorithms for the horizontal, vertical, +and hybrid FL settings, which differ in how the data is gathered across +clients. To the best of our knowledge, we are the first to develop AutoFE +algorithms for the horizontal and hybrid FL cases, and we show that the +downstream model performance of federated AutoFE is similar to the case where +data is held centrally and AutoFE is performed centrally. + +
+
+ comment: Preliminary Work +
+
+
+
+
+ + ☆ Asynchronous Batch Bayesian Optimization with Pipelining Evaluations for + Experimental Resource$\unicode{x2013}$constrained Conditions + + +
+ Bayesian optimization is efficient even with a small amount of data and is +used in engineering and in science, including biology and chemistry. In +Bayesian optimization, a parameterized model with an uncertainty is fitted to +explain the experimental data, and then the model suggests parameters that +would most likely improve the results. Batch Bayesian optimization reduces the +processing time of optimization by parallelizing experiments. However, batch +Bayesian optimization cannot be applied if the number of parallelized +experiments is limited by the cost or scarcity of equipment; in such cases, +sequential methods require an unrealistic amount of time. In this study, we +developed pipelining Bayesian optimization (PipeBO) to reduce the processing +time of optimization even with a limited number of parallel experiments. PipeBO +was inspired by the pipelining of central processing unit architecture, which +divides computational tasks into multiple processes. PipeBO was designed to +achieve experiment parallelization by overlapping various processes of the +experiments. PipeBO uses the results of completed experiments to update the +parameters of running parallelized experiments. Using the Black-Box +Optimization Benchmarking, which consists of 24 benchmark functions, we +compared PipeBO with the sequential Bayesian optimization methods. PipeBO +reduced the average processing time of optimization to about 56% for the +experiments that consisted of two processes or even less for those with more +processes for 20 out of the 24 functions. Overall, PipeBO parallelizes Bayesian +optimization in the resource-constrained settings so that efficient +optimization can be achieved. + +
+
+
+
+
+ + ☆ Probabilistic Gaussian Superposition for Efficient 3D Occupancy + Prediction + + +
+ 3D semantic occupancy prediction is an important task for robust +vision-centric autonomous driving, which predicts fine-grained geometry and +semantics of the surrounding scene. Most existing methods leverage dense +grid-based scene representations, overlooking the spatial sparsity of the +driving scenes. Although 3D semantic Gaussian serves as an object-centric +sparse alternative, most of the Gaussians still describe the empty region with +low efficiency. To address this, we propose a probabilistic Gaussian +superposition model which interprets each Gaussian as a probability +distribution of its neighborhood being occupied and conforms to probabilistic +multiplication to derive the overall geometry. Furthermore, we adopt the exact +Gaussian mixture model for semantics calculation to avoid unnecessary +overlapping of Gaussians. To effectively initialize Gaussians in non-empty +region, we design a distribution-based initialization module which learns the +pixel-aligned occupancy distribution instead of the depth of surfaces. We +conduct extensive experiments on nuScenes and KITTI-360 datasets and our +GaussianFormer-2 achieves state-of-the-art performance with high efficiency. +Code: https://github.com/huang-yh/GaussianFormer. + +
+
+ comment: Code is available at: https://github.com/huang-yh/GaussianFormer +
+
+
+
+
+ + ☆ EmbodiedOcc: Embodied 3D Occupancy Prediction for Vision-based Online + Scene Understanding + + +
+ 3D occupancy prediction provides a comprehensive description of the +surrounding scenes and has become an essential task for 3D perception. Most +existing methods focus on offline perception from one or a few views and cannot +be applied to embodied agents which demands to gradually perceive the scene +through progressive embodied exploration. In this paper, we formulate an +embodied 3D occupancy prediction task to target this practical scenario and +propose a Gaussian-based EmbodiedOcc framework to accomplish it. We initialize +the global scene with uniform 3D semantic Gaussians and progressively update +local regions observed by the embodied agent. For each update, we extract +semantic and structural features from the observed image and efficiently +incorporate them via deformable cross-attention to refine the regional +Gaussians. Finally, we employ Gaussian-to-voxel splatting to obtain the global +3D occupancy from the updated 3D Gaussians. Our EmbodiedOcc assumes an unknown +(i.e., uniformly distributed) environment and maintains an explicit global +memory of it with 3D Gaussians. It gradually gains knowledge through local +refinement of regional Gaussians, which is consistent with how humans +understand new scenes through embodied exploration. We reorganize an +EmbodiedOcc-ScanNet benchmark based on local annotations to facilitate the +evaluation of the embodied 3D occupancy prediction task. Experiments +demonstrate that our EmbodiedOcc outperforms existing local prediction methods +and accomplishes the embodied occupancy prediction with high accuracy and +strong expandability. Our code is available at: +https://github.com/YkiWu/EmbodiedOcc. + +
+
+ comment: Code: https://github.com/YkiWu/EmbodiedOcc +
+
+
+
+
+ + ☆ A Hitchhiker's Guide to Understanding Performances of Two-Class + Classifiers + + +
+ Properly understanding the performances of classifiers is essential in +various scenarios. However, the literature often relies only on one or two +standard scores to compare classifiers, which fails to capture the nuances of +application-specific requirements, potentially leading to suboptimal classifier +selection. Recently, a paper on the foundations of the theory of +performance-based ranking introduced a tool, called the Tile, that organizes an +infinity of ranking scores into a 2D map. Thanks to the Tile, it is now +possible to evaluate and compare classifiers efficiently, displaying all +possible application-specific preferences instead of having to rely on a pair +of scores. In this paper, we provide a first hitchhiker's guide for +understanding the performances of two-class classifiers by presenting four +scenarios, each showcasing a different user profile: a theoretical analyst, a +method designer, a benchmarker, and an application developer. Particularly, we +show that we can provide different interpretative flavors that are adapted to +the user's needs by mapping different values on the Tile. As an illustration, +we leverage the newly introduced Tile tool and the different flavors to rank +and analyze the performances of 74 state-of-the-art semantic segmentation +models in two-class classification through the eyes of the four user profiles. +Through these user profiles, we demonstrate that the Tile effectively captures +the behavior of classifiers in a single visualization, while accommodating an +infinite number of ranking scores. + +
+
+
+
+
+ + ☆ Finer Behavioral Foundation Models via Auto-Regressive Features and + Advantage Weighting + + +
+ The forward-backward representation (FB) is a recently proposed framework
+(Touati et al., 2023; Touati & Ollivier, 2021) to train behavior foundation
+models (BFMs) that aim at providing zero-shot efficient policies for any new
+task specified in a given reinforcement learning (RL) environment, without
+training for each new task. Here we address two core limitations of FB model
+training. First, FB, like all successor-feature-based methods, relies on a
+linear encoding of tasks: at test time, each new reward function is linearly
+projected onto a fixed set of pre-trained features. This limits expressivity as
+well as precision of the task representation. We break the linearity limitation
+by introducing auto-regressive features for FB, which let fine-grained task
+features depend on coarser-grained task information. This can represent
+arbitrary nonlinear task encodings, thus significantly increasing expressivity
+of the FB framework. Second, it is well-known that training RL agents from
+offline datasets often requires specific techniques. We show that FB works well
+together with such offline RL techniques, by adapting techniques from (Nair et
+al., 2020b; Cetin et al., 2024) for FB. This is necessary to get non-flatlining
+performance in some datasets, such as DMC Humanoid. As a result, we produce
+efficient FB BFMs for a number of new environments. Notably, in the D4RL
+locomotion benchmark, the generic FB agent matches the performance of standard
+single-task offline agents (IQL, XQL). In many setups, the offline techniques
+are needed to get any decent performance at all. The auto-regressive features
+have a positive but moderate impact, concentrated on tasks requiring spatial
+precision and task generalization beyond the behaviors represented in the
+trainset.
+
+
+
+
+
+ + ☆ Machine Theory of Mind for Autonomous Cyber-Defence + + +
+ Intelligent autonomous agents hold much potential for the domain of +cyber-security. However, due to many state-of-the-art approaches relying on +uninterpretable black-box models, there is growing demand for methods that +offer stakeholders clear and actionable insights into their latent beliefs and +motivations. To address this, we evaluate Theory of Mind (ToM) approaches for +Autonomous Cyber Operations. Upon learning a robust prior, ToM models can +predict an agent's goals, behaviours, and contextual beliefs given only a +handful of past behaviour observations. In this paper, we introduce a novel +Graph Neural Network (GNN)-based ToM architecture tailored for cyber-defence, +Graph-In, Graph-Out (GIGO)-ToM, which can accurately predict both the targets +and attack trajectories of adversarial cyber agents over arbitrary computer +network topologies. To evaluate the latter, we propose a novel extension of the +Wasserstein distance for measuring the similarity of graph-based probability +distributions. Whereas the standard Wasserstein distance lacks a fixed +reference scale, we introduce a graph-theoretic normalization factor that +enables a standardized comparison between networks of different sizes. We +furnish this metric, which we term the Network Transport Distance (NTD), with a +weighting function that emphasizes predictions according to custom node +features, allowing network operators to explore arbitrary strategic +considerations. Benchmarked against a Graph-In, Dense-Out (GIDO)-ToM +architecture in an abstract cyber-defence environment, our empirical +evaluations show that GIGO-ToM can accurately predict the goals and behaviours +of various unseen cyber-attacking agents across a range of network topologies, +as well as learn embeddings that can effectively characterize their policies. + +
+
+ comment: 29 pages, 17 figures, 12 tables +
+
+
+
+
+ + ☆ Approximate Top-$k$ for Increased Parallelism + + +
+ We present an evaluation of bucketed approximate top-$k$ algorithms. +Computing top-$k$ exactly suffers from limited parallelism, because the $k$ +largest values must be aggregated along the vector, thus is not well suited to +computation on highly-parallel machine learning accelerators. By relaxing the +requirement that the top-$k$ is exact, bucketed algorithms can dramatically +increase the parallelism available by independently computing many smaller +top-$k$ operations. We explore the design choices of this class of algorithms +using both theoretical analysis and empirical evaluation on downstream tasks. +Our motivating examples are sparsity algorithms for language models, which +often use top-$k$ to select the most important parameters or activations. We +also release a fast bucketed top-$k$ implementation for PyTorch. + +
+
+
+
+
+ + ☆ Multi-Scale Node Embeddings for Graph Modeling and Generation + + +
+ Lying at the interface between Network Science and Machine Learning, node +embedding algorithms take a graph as input and encode its structure onto output +vectors that represent nodes in an abstract geometric space, enabling various +vector-based downstream tasks such as network modelling, data compression, link +prediction, and community detection. Two apparently unrelated limitations +affect these algorithms. On one hand, it is not clear what the basic operation +defining vector spaces, i.e. the vector sum, corresponds to in terms of the +original nodes in the network. On the other hand, while the same input network +can be represented at multiple levels of resolution by coarse-graining the +constituent nodes into arbitrary block-nodes, the relationship between node +embeddings obtained at different hierarchical levels is not understood. Here, +building on recent results in network renormalization theory, we address these +two limitations at once and define a multiscale node embedding method that, +upon arbitrary coarse-grainings, ensures statistical consistency of the +embedding vector of a block-node with the sum of the embedding vectors of its +constituent nodes. We illustrate the power of this approach on two economic +networks that can be naturally represented at multiple resolution levels: +namely, the international trade between (sets of) countries and the +input-output flows among (sets of) industries in the Netherlands. We confirm +the statistical consistency between networks retrieved from coarse-grained node +vectors and networks retrieved from sums of fine-grained node vectors, a result +that cannot be achieved by alternative methods. Several key network properties, +including a large number of triangles, are successfully replicated already from +embeddings of very low dimensionality, allowing for the generation of faithful +replicas of the original networks at arbitrary resolution levels. + +
+
+
+
+
+ + ☆ ActFusion: a Unified Diffusion Model for Action Segmentation and + Anticipation NeurIPS 2024 + + +
+ Temporal action segmentation and long-term action anticipation are two +popular vision tasks for the temporal analysis of actions in videos. Despite +apparent relevance and potential complementarity, these two problems have been +investigated as separate and distinct tasks. In this work, we tackle these two +problems, action segmentation and action anticipation, jointly using a unified +diffusion model dubbed ActFusion. The key idea to unification is to train the +model to effectively handle both visible and invisible parts of the sequence in +an integrated manner; the visible part is for temporal segmentation, and the +invisible part is for future anticipation. To this end, we introduce a new +anticipative masking strategy during training in which a late part of the video +frames is masked as invisible, and learnable tokens replace these frames to +learn to predict the invisible future. Experimental results demonstrate the +bi-directional benefits between action segmentation and anticipation. ActFusion +achieves the state-of-the-art performance across the standard benchmarks of 50 +Salads, Breakfast, and GTEA, outperforming task-specific models in both of the +two tasks with a single unified model through joint learning. + +
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ☆ Distributionally Robust Performative Prediction NeurIPS + + +
+ Performative prediction aims to model scenarios where predictive outcomes +subsequently influence the very systems they target. The pursuit of a +performative optimum (PO) -- minimizing performative risk -- is generally +reliant on modeling of the distribution map, which characterizes how a deployed +ML model alters the data distribution. Unfortunately, inevitable +misspecification of the distribution map can lead to a poor approximation of +the true PO. To address this issue, we introduce a novel framework of +distributionally robust performative prediction and study a new solution +concept termed as distributionally robust performative optimum (DRPO). We show +provable guarantees for DRPO as a robust approximation to the true PO when the +nominal distribution map is different from the actual one. Moreover, +distributionally robust performative prediction can be reformulated as an +augmented performative prediction problem, enabling efficient optimization. The +experimental results demonstrate that DRPO offers potential advantages over +traditional PO approach when the distribution map is misspecified at either +micro- or macro-level. + +
+
+ comment: In Proceedings of the 38th Conference on Neural Information + Processing Systems (NeurIPS) 2024 +
+
+
+
+
+ + ☆ Likelihood-Scheduled Score-Based Generative Modeling for Fully 3D PET + Image Reconstruction + + +
+ Medical image reconstruction with pre-trained score-based generative models +(SGMs) has advantages over other existing state-of-the-art deep-learned +reconstruction methods, including improved resilience to different scanner +setups and advanced image distribution modeling. SGM-based reconstruction has +recently been applied to simulated positron emission tomography (PET) datasets, +showing improved contrast recovery for out-of-distribution lesions relative to +the state-of-the-art. However, existing methods for SGM-based reconstruction +from PET data suffer from slow reconstruction, burdensome hyperparameter tuning +and slice inconsistency effects (in 3D). In this work, we propose a practical +methodology for fully 3D reconstruction that accelerates reconstruction and +reduces the number of critical hyperparameters by matching the likelihood of an +SGM's reverse diffusion process to a current iterate of the maximum-likelihood +expectation maximization algorithm. Using the example of low-count +reconstruction from simulated $[^{18}$F]DPA-714 datasets, we show our +methodology can match or improve on the NRMSE and SSIM of existing +state-of-the-art SGM-based PET reconstruction while reducing reconstruction +time and the need for hyperparameter tuning. We evaluate our methodology +against state-of-the-art supervised and conventional reconstruction algorithms. +Finally, we demonstrate a first-ever implementation of SGM-based reconstruction +for real 3D PET data, specifically $[^{18}$F]DPA-714 data, where we integrate +perpendicular pre-trained SGMs to eliminate slice inconsistency issues. + +
+
+ comment: 11 pages, 12 figures. Submitted to Transactions on Medical Imaging +
+
+
+
+
+ + ☆ Action Mapping for Reinforcement Learning in Continuous Environments + with Constraints + + +
+ Deep reinforcement learning (DRL) has had success across various domains, but +applying it to environments with constraints remains challenging due to poor +sample efficiency and slow convergence. Recent literature explored +incorporating model knowledge to mitigate these problems, particularly through +the use of models that assess the feasibility of proposed actions. However, +integrating feasibility models efficiently into DRL pipelines in environments +with continuous action spaces is non-trivial. We propose a novel DRL training +strategy utilizing action mapping that leverages feasibility models to +streamline the learning process. By decoupling the learning of feasible actions +from policy optimization, action mapping allows DRL agents to focus on +selecting the optimal action from a reduced feasible action set. We demonstrate +through experiments that action mapping significantly improves training +performance in constrained environments with continuous action spaces, +especially with imperfect feasibility models. + +
+
+
+
+
+ + ☆ GRAM: Generalization in Deep RL with a Robust Adaptation Module + + +
+ The reliable deployment of deep reinforcement learning in real-world settings +requires the ability to generalize across a variety of conditions, including +both in-distribution scenarios seen during training as well as novel +out-of-distribution scenarios. In this work, we present a framework for +dynamics generalization in deep reinforcement learning that unifies these two +distinct types of generalization within a single architecture. We introduce a +robust adaptation module that provides a mechanism for identifying and reacting +to both in-distribution and out-of-distribution environment dynamics, along +with a joint training pipeline that combines the goals of in-distribution +adaptation and out-of-distribution robustness. Our algorithm GRAM achieves +strong generalization performance across in-distribution and +out-of-distribution scenarios upon deployment, which we demonstrate on a +variety of realistic simulated locomotion tasks with a quadruped robot. + +
+
+
+
+
+ + ☆ Generative-Model-Based Fully 3D PET Image Reconstruction by Conditional + Diffusion Sampling + + +
+ Score-based generative models (SGMs) have recently shown promising results +for image reconstruction on simulated positron emission tomography (PET) +datasets. In this work we have developed and implemented practical methodology +for 3D image reconstruction with SGMs, and perform (to our knowledge) the first +SGM-based reconstruction of real fully 3D PET data. We train an SGM on +full-count reference brain images, and extend methodology to allow SGM-based +reconstructions at very low counts (1% of original, to simulate low-dose or +short-duration scanning). We then perform reconstructions for multiple +independent realisations of 1% count data, allowing us to analyse the bias and +variance characteristics of the method. We sample from the learned posterior +distribution of the generative algorithm to calculate uncertainty images for +our reconstructions. We evaluate the method's performance on real full- and +low-count PET data and compare with conventional OSEM and MAP-EM baselines, +showing that our SGM-based low-count reconstructions match full-dose +reconstructions more closely and in a bias-variance trade-off comparison, our +SGM-reconstructed images have lower variance than existing baselines. Future +work will compare to supervised deep-learned methods, with other avenues for +investigation including how data conditioning affects the SGM's posterior +distribution and the algorithm's performance with different tracers. + +
+
+ comment: 2 pages, 2 figures. Accepted for oral presentation at IEEE NSS MIC + RTSD 2024 (submitted May 2024; accepted July 2024; presented Nov 2024) +
+
+
+
+
+ + ☆ The Tile: A 2D Map of Ranking Scores for Two-Class Classification + + +
+ In the computer vision and machine learning communities, as well as in many +other research domains, rigorous evaluation of any new method, including +classifiers, is essential. One key component of the evaluation process is the +ability to compare and rank methods. However, ranking classifiers and +accurately comparing their performances, especially when taking +application-specific preferences into account, remains challenging. For +instance, commonly used evaluation tools like Receiver Operating Characteristic +(ROC) and Precision/Recall (PR) spaces display performances based on two +scores. Hence, they are inherently limited in their ability to compare +classifiers across a broader range of scores and lack the capability to +establish a clear ranking among classifiers. In this paper, we present a novel +versatile tool, named the Tile, that organizes an infinity of ranking scores in +a single 2D map for two-class classifiers, including common evaluation scores +such as the accuracy, the true positive rate, the positive predictive value, +Jaccard's coefficient, and all F-beta scores. Furthermore, we study the +properties of the underlying ranking scores, such as the influence of the +priors or the correspondences with the ROC space, and depict how to +characterize any other score by comparing them to the Tile. Overall, we +demonstrate that the Tile is a powerful tool that effectively captures all the +rankings in a single visualization and allows interpreting them. + +
+
+
+
+
+ + ☆ ALMA: Alignment with Minimal Annotation + + +
+ Recent approaches to large language model (LLM) alignment typically require +millions of human annotations or rely on external aligned models for synthetic +data generation. This paper introduces ALMA: Alignment with Minimal Annotation, +demonstrating that effective alignment can be achieved using only 9,000 labeled +examples -- less than 1% of conventional approaches. ALMA generates large +amounts of high-quality synthetic alignment data through new techniques: +diverse prompt synthesis via few-shot learning, diverse response generation +with multiple model checkpoints, and judge (reward model) enhancement through +score aggregation and self-distillation. Using only a pretrained Llama3 base +model, 5,000 SFT examples, and 4,000 judge annotations, ALMA achieves +performance close to Llama3-Instruct across diverse alignment benchmarks (e.g., +0.1% difference on AlpacaEval 2.0 score). These results are achieved with a +multi-round, self-bootstrapped data synthesis and training recipe that +continues to improve for 10 rounds, surpassing the typical 3-round ceiling of +previous methods. These results suggest that base models already possess +sufficient knowledge for effective alignment, and that synthetic data +generation methods can expose it. + +
+
+
+
+
+ + ☆ Structure-Aware Stylized Image Synthesis for Robust Medical Image + Segmentation + + +
+ Accurate medical image segmentation is essential for effective diagnosis and +treatment planning but is often challenged by domain shifts caused by +variations in imaging devices, acquisition conditions, and patient-specific +attributes. Traditional domain generalization methods typically require +inclusion of parts of the test domain within the training set, which is not +always feasible in clinical settings with limited diverse data. Additionally, +although diffusion models have demonstrated strong capabilities in image +generation and style transfer, they often fail to preserve the critical +structural information necessary for precise medical analysis. To address these +issues, we propose a novel medical image segmentation method that combines +diffusion models and Structure-Preserving Network for structure-aware one-shot +image stylization. Our approach effectively mitigates domain shifts by +transforming images from various sources into a consistent style while +maintaining the location, size, and shape of lesions. This ensures robust and +accurate segmentation even when the target domain is absent from the training +data. Experimental evaluations on colonoscopy polyp segmentation and skin +lesion segmentation datasets show that our method enhances the robustness and +accuracy of segmentation models, achieving superior performance metrics +compared to baseline models without style transfer. This structure-aware +stylization framework offers a practical solution for improving medical image +segmentation across diverse domains, facilitating more reliable clinical +diagnoses. + +
+
+
+
+
+ + ☆ Deep Causal Inference for Point-referenced Spatial Data with Continuous + Treatments + + +
+ Causal reasoning is often challenging with spatial data, particularly when +handling high-dimensional inputs. To address this, we propose a neural network +(NN) based framework integrated with an approximate Gaussian process to manage +spatial interference and unobserved confounding. Additionally, we adopt a +generalized propensity-score-based approach to address partially observed +outcomes when estimating causal effects with continuous treatments. We evaluate +our framework using synthetic, semi-synthetic, and real-world data inferred +from satellite imagery. Our results demonstrate that NN-based models +significantly outperform linear spatial regression models in estimating causal +effects. Furthermore, in real-world case studies, NN-based models offer more +reasonable predictions of causal effects, facilitating decision-making in +relevant applications. + +
+
+ comment: 16 pages, 4 figures, 5 tables +
+
+
+
+
+ + ☆ Complexity of Vector-valued Prediction: From Linear Models to Stochastic + Convex Optimization + + +
+ We study the problem of learning vector-valued linear predictors: these are +prediction rules parameterized by a matrix that maps an $m$-dimensional feature +vector to a $k$-dimensional target. We focus on the fundamental case with a +convex and Lipschitz loss function, and show several new theoretical results +that shed light on the complexity of this problem and its connection to related +learning models. First, we give a tight characterization of the sample +complexity of Empirical Risk Minimization (ERM) in this setting, establishing +that $\smash{\widetilde{\Omega}}(k/\epsilon^2)$ examples are necessary for ERM +to reach $\epsilon$ excess (population) risk; this provides for an exponential +improvement over recent results by Magen and Shamir (2023) in terms of the +dependence on the target dimension $k$, and matches a classical upper bound due +to Maurer (2016). Second, we present a black-box conversion from general +$d$-dimensional Stochastic Convex Optimization (SCO) to vector-valued linear +prediction, showing that any SCO problem can be embedded as a prediction +problem with $k=\Theta(d)$ outputs. These results portray the setting of +vector-valued linear prediction as bridging between two extensively studied yet +disparate learning models: linear models (corresponds to $k=1$) and general +$d$-dimensional SCO (with $k=\Theta(d)$). + +
+
+
+
+
+ + ☆ Reinforcement Learning from Wild Animal Videos + + +
+ We propose to learn legged robot locomotion skills by watching thousands of +wild animal videos from the internet, such as those featured in nature +documentaries. Indeed, such videos offer a rich and diverse collection of +plausible motion examples, which could inform how robots should move. To +achieve this, we introduce Reinforcement Learning from Wild Animal Videos +(RLWAV), a method to ground these motions into physical robots. We first train +a video classifier on a large-scale animal video dataset to recognize actions +from RGB clips of animals in their natural habitats. We then train a +multi-skill policy to control a robot in a physics simulator, using the +classification score of a third-person camera capturing videos of the robot's +movements as a reward for reinforcement learning. Finally, we directly transfer +the learned policy to a real quadruped Solo. Remarkably, despite the extreme +gap in both domain and embodiment between animals in the wild and robots, our +approach enables the policy to learn diverse skills such as walking, jumping, +and keeping still, without relying on reference trajectories nor skill-specific +rewards. + +
+
+ comment: Project website: https://elliotchanesane31.github.io/RLWAV/ +
+
+
+
+
+ + ☆ SynFinTabs: A Dataset of Synthetic Financial Tables for Information and + Table Extraction + + +
+ Table extraction from document images is a challenging AI problem, and +labelled data for many content domains is difficult to come by. Existing table +extraction datasets often focus on scientific tables due to the vast amount of +academic articles that are readily available, along with their source code. +However, there are significant layout and typographical differences between +tables found across scientific, financial, and other domains. Current datasets +often lack the words, and their positions, contained within the tables, instead +relying on unreliable OCR to extract these features for training modern machine +learning models on natural language processing tasks. Therefore, there is a +need for a more general method of obtaining labelled data. We present +SynFinTabs, a large-scale, labelled dataset of synthetic financial tables. Our +hope is that our method of generating these synthetic tables is transferable to +other domains. To demonstrate the effectiveness of our dataset in training +models to extract information from table images, we create FinTabQA, a layout +large language model trained on an extractive question-answering task. We test +our model using real-world financial tables and compare it to a +state-of-the-art generative model and discuss the results. We make the dataset, +model, and dataset generation code publicly available. + +
+
+ comment: 12 pages, 8 figures +
+
+
+
+
+ + ☆ SCADE: Scalable Command-line Anomaly Detection Engine + + +
+ As command-line interfaces remain an integral part of high-computation +environments, the risk of exploitation through stealthy, complex command-line +abuse continues to grow. Conventional security solutions often struggle with +these command-line-based anomalies due to their context-specific nature and +lack of labeled data, especially in detecting rare, malicious patterns amidst +legitimate, high-volume activity. This gap has left organizations vulnerable to +sophisticated threats like Living-off-the-Land (LOL) attacks, where standard +detection tools frequently miss or misclassify anomalous command-line behavior. +We introduce Scalable Command-Line Anomaly Detection Engine (SCADE), which +addresses these challenges by introducing a dual-layered detection framework +that combines a global statistical analysis with local context-specific anomaly +detection, innovatively using a novel ensemble of statistical models such as +BM25 and Log Entropy, adapted for command-line data. The framework also +features a dynamic thresholding mechanism for adaptive anomaly detection, +ensuring high precision and recall even in environments with extremely high +Signal-to-Noise Ratios (SNRs). Initial experimental results demonstrate the +effectiveness of the framework, achieving above 98% SNR in identifying unusual +command-line behavior while minimizing false positives. In this paper, we +present SCADE's core architecture, including its metadata-enriched approach to +anomaly detection and the design choices behind its scalability for +enterprise-level deployment. We argue that SCADE represents a significant +advancement in command-line anomaly detection, offering a robust, adaptive +framework for security analysts and researchers seeking to enhance detection +accuracy in high-computation environments. + +
+
+
+
+
+ + ☆ Quantifying the Limits of Segment Anything Model: Analyzing Challenges + in Segmenting Tree-Like and Low-Contrast Structures + + +
+ Segment Anything Model (SAM) has shown impressive performance in interactive +and zero-shot segmentation across diverse domains, suggesting that they have +learned a general concept of "objects" from their large-scale training. +However, we observed that SAM struggles with certain types of objects, +particularly those featuring dense, tree-like structures and low textural +contrast from their surroundings. These failure modes are critical for +understanding its limitations in real-world use. In order to systematically +examine this issue, we propose metrics to quantify two key object +characteristics: tree-likeness and textural separability. Through extensive +controlled synthetic experiments and testing on real datasets, we demonstrate +that SAM's performance is noticeably correlated with these factors. We link +these behaviors under the concept of "textural confusion", where SAM +misinterprets local structure as global texture, leading to over-segmentation, +or struggles to differentiate objects from similarly textured backgrounds. +These findings offer the first quantitative framework to model SAM's +challenges, providing valuable insights into its limitations and guiding future +improvements for vision foundation models. + +
+
+ comment: Code: https://github.com/mazurowski-lab/SAM-TexturalConfusion-Metrics +
+
+
+
+
+ + ☆ LMDM:Latent Molecular Diffusion Model For 3D Molecule Generation + + +
+ In this work, we propose a latent molecular diffusion model that can make the +generated 3D molecules rich in diversity and maintain rich geometric features. +The model captures the information of the forces and local constraints between +atoms so that the generated molecules can maintain Euclidean transformation and +high level of effectiveness and diversity. We also use the lower-rank manifold +advantage of the latent variables of the latent model to fuse the information +of the forces between atoms to better maintain the geometric equivariant +properties of the molecules. Because there is no need to perform information +fusion encoding in stages like traditional encoders and decoders, this reduces +the amount of calculation in the back-propagation process. The model keeps the +forces and local constraints of particle bonds in the latent variable space, +reducing the impact of underfitting on the surface of the network on the large +position drift of the particle geometry, so that our model can converge +earlier. We introduce a distribution control variable in each backward step to +strengthen exploration and improve the diversity of generation. In the +experiment, the quality of the samples we generated and the convergence speed +of the model have been significantly improved. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2209.05710 by other authors +
+
+
+
+
+ + ☆ A History of Philosophy in Colombia through Topic Modelling + + +
+ Data-driven approaches to philosophy have emerged as a valuable tool for +studying the history of the discipline. However, most studies in this area have +focused on a limited number of journals from specific regions and subfields. We +expand the scope of this research by applying dynamic topic modelling +techniques to explore the history of philosophy in Colombia and Latin America. +Our study examines the Colombian philosophy journal Ideas y Valores, founded in +1951 and currently one of the most influential academic philosophy journals in +the region. By analyzing the evolution of topics across the journal's history, +we identify various trends and specific dynamics in philosophical discourse +within the Colombian and Latin American context. Our findings reveal that the +most prominent topics are value theory (including ethics, political philosophy, +and aesthetics), epistemology, and the philosophy of science. We also trace the +evolution of articles focusing on the historical and interpretive aspects of +philosophical texts, and we note a notable emphasis on German philosophers such +as Kant, Husserl, and Hegel on various topics throughout the journal's +lifetime. Additionally, we investigate whether articles with a historical focus +have decreased over time due to editorial pressures. Our analysis suggests no +significant decline in such articles. Finally, we propose ideas for extending +this research to other Latin American journals and suggest improvements for +natural language processing workflows in non-English languages. + +
+
+
+
+
+ + ☆ HyperMARL: Adaptive Hypernetworks for Multi-Agent RL + + +
+ Balancing individual specialisation and shared behaviours is a critical +challenge in multi-agent reinforcement learning (MARL). Existing methods +typically focus on encouraging diversity or leveraging shared representations. +Full parameter sharing (FuPS) improves sample efficiency but struggles to learn +diverse behaviours when required, while no parameter sharing (NoPS) enables +diversity but is computationally expensive and sample inefficient. To address +these challenges, we introduce HyperMARL, a novel approach using hypernetworks +to balance efficiency and specialisation. HyperMARL generates agent-specific +actor and critic parameters, enabling agents to adaptively exhibit diverse or +homogeneous behaviours as needed, without modifying the learning objective or +requiring prior knowledge of the optimal diversity. Furthermore, HyperMARL +decouples agent-specific and state-based gradients, which empirically +correlates with reduced policy gradient variance, potentially offering insights +into its ability to capture diverse behaviours. Across MARL benchmarks +requiring homogeneous, heterogeneous, or mixed behaviours, HyperMARL +consistently matches or outperforms FuPS, NoPS, and diversity-focused methods, +achieving NoPS-level diversity with a shared architecture. These results +highlight the potential of hypernetworks as a versatile approach to the +trade-off between specialisation and shared behaviours in MARL. + +
+
+
+
+
+ + ☆ Foundations of the Theory of Performance-Based Ranking + + +
+ Ranking entities such as algorithms, devices, methods, or models based on +their performances, while accounting for application-specific preferences, is a +challenge. To address this challenge, we establish the foundations of a +universal theory for performance-based ranking. First, we introduce a rigorous +framework built on top of both the probability and order theories. Our new +framework encompasses the elements necessary to (1) manipulate performances as +mathematical objects, (2) express which performances are worse than or +equivalent to others, (3) model tasks through a variable called satisfaction, +(4) consider properties of the evaluation, (5) define scores, and (6) specify +application-specific preferences through a variable called importance. On top +of this framework, we propose the first axiomatic definition of performance +orderings and performance-based rankings. Then, we introduce a universal +parametric family of scores, called ranking scores, that can be used to +establish rankings satisfying our axioms, while considering +application-specific preferences. Finally, we show, in the case of two-class +classification, that the family of ranking scores encompasses well-known +performance scores, including the accuracy, the true positive rate (recall, +sensitivity), the true negative rate (specificity), the positive predictive +value (precision), and F1. However, we also show that some other scores +commonly used to compare classifiers are unsuitable to derive performance +orderings satisfying the axioms. Therefore, this paper provides the computer +vision and machine learning communities with a rigorous framework for +evaluating and ranking entities. + +
+
+
+
+
+ + ☆ Physics-informed Deep Learning for Muscle Force Prediction with + Unlabeled sEMG Signals + + +
+ Computational biomechanical analysis plays a pivotal role in understanding +and improving human movements and physical functions. Although physics-based +modeling methods can interpret the dynamic interaction between the neural drive +to muscle dynamics and joint kinematics, they suffer from high computational +latency. In recent years, data-driven methods have emerged as a promising +alternative due to their fast execution speed, but label information is still +required during training, which is not easy to acquire in practice. To tackle +these issues, this paper presents a novel physics-informed deep learning method +to predict muscle forces without any label information during model training. +In addition, the proposed method could also identify personalized muscle-tendon +parameters. To achieve this, the Hill muscle model-based forward dynamics is +embedded into the deep neural network as the additional loss to further +regulate the behavior of the deep neural network. Experimental validations on +the wrist joint from six healthy subjects are performed, and a fully connected +neural network (FNN) is selected to implement the proposed method. The +predicted results of muscle forces show comparable or even lower root mean +square error (RMSE) and higher coefficient of determination compared with +baseline methods, which have to use the labeled surface electromyography (sEMG) +signals, and it can also identify muscle-tendon parameters accurately, +demonstrating the effectiveness of the proposed physics-informed deep learning +method. + +
+
+ comment: 11 pages, 8 figures, journal +
+
+
+
+
+ + ☆ Directed Structural Adaptation to Overcome Statistical Conflicts and + Enable Continual Learning AAAI-2024 + + +
+ Adaptive networks today rely on overparameterized fixed topologies that
+cannot break through the statistical conflicts they encounter in the data they
+are exposed to, and are prone to "catastrophic forgetting" as the network
+attempts to reuse the existing structures to learn new tasks. We propose a
+structural adaptation method, DIRAD, that can complexify as needed and in a
+directed manner without being limited by statistical conflicts within a
+dataset. We then extend this method and present the PREVAL framework, designed
+to prevent "catastrophic forgetting" in continual learning by detection of new
+data and assigning encountered data to suitable models adapted to process them,
+without needing task labels anywhere in the workflow. We show the reliability
+of the DIRAD in growing a network with high performance and orders-of-magnitude
+simpler than fixed topology networks; and demonstrate the proof-of-concept
+operation of PREVAL, in which continual adaptation to new tasks is observed
+while being able to detect and discern previously-encountered tasks.
+
+
+
+ comment: Presented in Deployable AI (DAI) workshop at AAAI-2024 +
+
+
+
+
+ + ☆ Linear Discriminant Analysis in Credit Scoring: A Transparent Hybrid + Model Approach + + +
+ The development of computing has made credit scoring approaches possible,
+with various machine learning (ML) and deep learning (DL) techniques becoming
+more and more valuable. While complex models yield more accurate predictions,
+their interpretability is often weakened, which is a concern for credit scoring
+that places importance on decision fairness. As features of the dataset are a
+crucial factor for the credit scoring system, we implement Linear Discriminant
+Analysis (LDA) as a feature reduction technique, which reduces the burden of
+the model's complexity. We compared 6 different machine learning models, 1 deep
+learning model, and a hybrid model with and without using LDA. From the result,
+we have found our hybrid model, XG-DNN, outperformed other models with the
+highest accuracy of 99.45% and a 99% F1 score with LDA. Lastly, to interpret
+model decisions, we have applied 2 different explainable AI techniques named
+LIME (local) and Morris Sensitivity Analysis (global). Through this research,
+we showed how feature reduction techniques can be used without affecting the
+performance and explainability of the model, which can be very useful in
+resource-constrained settings to optimize the computational workload.
+
+
+
+ comment: Accepted on International Conference on Computer and Information + Technology (ICCIT) 2024 +
+
+
+
+
+ + ☆ SKIM: Any-bit Quantization Pushing The Limits of Post-Training + Quantization + + +
+ Large Language Models (LLMs) exhibit impressive performance across various +tasks, but deploying them for inference poses challenges. Their high resource +demands often necessitate complex, costly multi-GPU pipelines, or the use of +smaller, less capable models. While quantization offers a promising solution +utilizing lower precision for model storage, existing methods frequently +experience significant performance drops at lower precision levels. +Additionally, they typically provide only a limited set of solutions at +specific bit levels, many of which are extensively manually tuned. To address +these challenges, we propose a new method called SKIM: Scaled K-means +clustering wIth Mixed precision. Our approach introduces two novel techniques: +1. A greedy algorithm to solve approximately optimal bit allocation across +weight channels, and 2. A trainable scaling vector for non-differentiable +K-means clustering. These techniques substantially improve performance and can +be adapted to any given bit. Notably, in terms of model perplexity, our method +narrows the gap between 3-bit quantized LLaMA models and their full precision +counterparts by 16.3% on average. + +
+
+
+
+
+ + ☆ Multi-Layer Privacy-Preserving Record Linkage with Clerical Review based + on gradual information disclosure + + +
+ Privacy-Preserving Record linkage (PPRL) is an essential component in data +integration tasks of sensitive information. The linkage quality determines the +usability of combined datasets and (machine learning) applications based on +them. We present a novel privacy-preserving protocol that integrates clerical +review in PPRL using a multi-layer active learning process. Uncertain match +candidates are reviewed on several layers by human and non-human oracles to +reduce the amount of disclosed information per record and in total. Predictions +are propagated back to update previous layers, resulting in an improved linkage +performance for non-reviewed candidates as well. The data owners remain in +control of the amount of information they share for each record. Therefore, our +approach follows need-to-know and data sovereignty principles. The experimental +evaluation on real-world datasets shows considerable linkage quality +improvements with limited labeling effort and privacy risks. + +
+
+ comment: Accepted at 21st Conference on Database Systems for Business, + Technology and Web (BTW) +
+
+
+
+
+ + ☆ Fixed-Mean Gaussian Processes for Post-hoc Bayesian Deep Learning + + +
+ Recently, there has been an increasing interest in performing post-hoc +uncertainty estimation about the predictions of pre-trained deep neural +networks (DNNs). Given a pre-trained DNN via back-propagation, these methods +enhance the original network by adding output confidence measures, such as +error bars, without compromising its initial accuracy. In this context, we +introduce a novel family of sparse variational Gaussian processes (GPs), where +the posterior mean is fixed to any continuous function when using a universal +kernel. Specifically, we fix the mean of this GP to the output of the +pre-trained DNN, allowing our approach to effectively fit the GP's predictive +variances to estimate the DNN prediction uncertainty. Our approach leverages +variational inference (VI) for efficient stochastic optimization, with training +costs that remain independent of the number of training points, scaling +efficiently to large datasets such as ImageNet. The proposed method, called +fixed mean GP (FMGP), is architecture-agnostic, relying solely on the +pre-trained model's outputs to adjust the predictive variances. Experimental +results demonstrate that FMGP improves both uncertainty estimation and +computational efficiency when compared to state-of-the-art methods. + +
+
+ comment: 12 pages, 6 figures and 2 tables. Submitted to IEEE TRANSACTIONS ON + PATTERN ANALYSIS AND MACHINE INTELLIGENCE +
+
+
+
+
+ + ☆ An In-Depth Examination of Risk Assessment in Multi-Class Classification + Algorithms + + +
+ Advanced classification algorithms are being increasingly used in
+safety-critical applications like health-care, engineering, etc. In such
+applications, misclassifications made by ML algorithms can result in
+substantial financial or health-related losses. To better anticipate and
+prepare for such losses, the algorithm user seeks an estimate for the
+probability that the algorithm misclassifies a sample. We refer to this task
+as the risk-assessment. For a variety of models and datasets, we numerically
+analyze the performance of different methods in solving the risk-assessment
+problem. We consider two solution strategies: a) calibration techniques that
+calibrate the output probabilities of classification models to provide accurate
+probability outputs; and b) a novel approach based upon the prediction interval
+generation technique of conformal prediction. Our conformal prediction based
+approach is model and data-distribution agnostic, simple to implement, and
+provides reasonable results for a variety of use-cases. We compare the
+different methods on a broad variety of models and datasets.
+
+
+
+
+
+
+ + ☆ On the Lack of Robustness of Binary Function Similarity Systems + + +
+ Binary function similarity, which often relies on learning-based algorithms
+to identify what functions in a pool are most similar to a given query
+function, is a sought-after topic in different communities, including machine
+learning, software engineering, and security. Its importance stems from the
+impact it has in facilitating several crucial tasks, from reverse engineering
+and malware analysis to automated vulnerability detection. Whereas recent work
+cast light around performance on this long-studied problem, the research
+landscape remains largely lackluster in understanding the resiliency of the
+state-of-the-art machine learning models against adversarial attacks. As
+security requires reasoning about adversaries, in this work we assess the
+robustness of such models through a simple yet effective black-box greedy
+attack, which modifies the topology and the content of the control flow of the
+attacked functions. We demonstrate that this attack is successful in
+compromising all the models, achieving average attack success rates of 57.06%
+and 95.81% depending on the problem settings (targeted and untargeted attacks).
+Our findings are insightful: top performance on clean data does not necessarily
+relate to top robustness properties, which explicitly highlights
+performance-robustness trade-offs one should consider when deploying such
+models, calling for further research.
+
+
+
+
+
+
+ + ☆ LossVal: Efficient Data Valuation for Neural Networks + + +
+ Assessing the importance of individual training samples is a key challenge in +machine learning. Traditional approaches retrain models with and without +specific samples, which is computationally expensive and ignores dependencies +between data points. We introduce LossVal, an efficient data valuation method +that computes importance scores during neural network training by embedding a +self-weighting mechanism into loss functions like cross-entropy and mean +squared error. LossVal reduces computational costs, making it suitable for +large datasets and practical applications. Experiments on classification and +regression tasks across multiple datasets show that LossVal effectively +identifies noisy samples and is able to distinguish helpful from harmful +samples. We examine the gradient calculation of LossVal to highlight its +advantages. The source code is available at: +https://github.com/twibiral/LossVal + +
+
+
+
+
+ + ☆ Non-Asymptotic Bounds for Closed-Loop Identification of Unstable + Nonlinear Stochastic Systems + + +
+ We consider the problem of least squares parameter estimation from +single-trajectory data for discrete-time, unstable, closed-loop nonlinear +stochastic systems, with linearly parameterised uncertainty. Assuming a region +of the state space produces informative data, and the system is +sub-exponentially unstable, we establish non-asymptotic guarantees on the +estimation error at times where the state trajectory evolves in this region. If +the whole state space is informative, high probability guarantees on the error +hold for all times. Examples are provided where our results are useful for +analysis, but existing results are not. + +
+
+ comment: 21 pages, 2 figures +
+
+
+
+
+ + ☆ MultiTASC++: A Continuously Adaptive Scheduler for Edge-Based + Multi-Device Cascade Inference + + +
+ Cascade systems, consisting of a lightweight model processing all samples and +a heavier, high-accuracy model refining challenging samples, have become a +widely-adopted distributed inference approach to achieving high accuracy and +maintaining a low computational burden for mobile and IoT devices. As +intelligent indoor environments, like smart homes, continue to expand, a new +scenario emerges, the multi-device cascade. In this setting, multiple diverse +devices simultaneously utilize a shared heavy model hosted on a server, often +situated within or close to the consumer environment. This work introduces +MultiTASC++, a continuously adaptive multi-tenancy-aware scheduler that +dynamically controls the forwarding decision functions of devices to optimize +system throughput while maintaining high accuracy and low latency. Through +extensive experimentation in diverse device environments and with varying +server-side models, we demonstrate the scheduler's efficacy in consistently +maintaining a targeted satisfaction rate while providing the highest available +accuracy across different device tiers and workloads of up to 100 devices. This +demonstrates its scalability and efficiency in addressing the unique challenges +of collaborative DNN inference in dynamic and diverse IoT environments. + +
+
+
+
+
+ + ☆ Understanding Memorization in Generative Models via Sharpness in + Probability Landscapes + + +
+ In this paper, we introduce a geometric framework to analyze memorization in
+diffusion models using the eigenvalues of the Hessian of the log probability
+density. We propose that memorization arises from isolated points in the
+learned probability distribution, characterized by sharpness in the probability
+landscape, as indicated by large negative eigenvalues of the Hessian. Through
+experiments on various datasets, we demonstrate that these eigenvalues
+effectively detect and quantify memorization. Our approach provides a clear
+understanding of memorization in diffusion models and lays the groundwork for
+developing strategies to ensure secure and reliable generative models.
+
+
+
+
+
+
+ + ☆ Text Change Detection in Multilingual Documents Using Image Comparison + + +
+ Document comparison typically relies on optical character recognition (OCR) +as its core technology. However, OCR requires the selection of appropriate +language models for each document and the performance of multilingual or hybrid +models remains limited. To overcome these challenges, we propose text change +detection (TCD) using an image comparison model tailored for multilingual +documents. Unlike OCR-based approaches, our method employs word-level text +image-to-image comparison to detect changes. Our model generates bidirectional +change segmentation maps between the source and target documents. To enhance +performance without requiring explicit text alignment or scaling preprocessing, +we employ correlations among multi-scale attention features. We also construct +a benchmark dataset comprising actual printed and scanned word pairs in various +languages to evaluate our model. We validate our approach using our benchmark +dataset and public benchmarks Distorted Document Images and the LRDE Document +Binarization Dataset. We compare our model against state-of-the-art semantic +segmentation and change detection models, as well as to conventional OCR-based +models. + +
+
+
 comment: 15 pages, 11 figures, 6 tables, WACV 2025 accepted
+
+
+
+
+
+ + ☆ Compositional Generative Multiphysics and Multi-component Simulation + + +
+ Multiphysics simulation, which models the interactions between multiple +physical processes, and multi-component simulation of complex structures are +critical in fields like nuclear and aerospace engineering. Previous studies +often rely on numerical solvers or machine learning-based surrogate models to +solve or accelerate these simulations. However, multiphysics simulations +typically require integrating multiple specialized solvers-each responsible for +evolving a specific physical process-into a coupled program, which introduces +significant development challenges. Furthermore, no universal algorithm exists +for multi-component simulations, which adds to the complexity. Here we propose +compositional Multiphysics and Multi-component Simulation with Diffusion models +(MultiSimDiff) to overcome these challenges. During diffusion-based training, +MultiSimDiff learns energy functions modeling the conditional probability of +one physical process/component conditioned on other processes/components. In +inference, MultiSimDiff generates coupled multiphysics solutions and +multi-component structures by sampling from the joint probability distribution, +achieved by composing the learned energy functions in a structured way. We test +our method in three tasks. In the reaction-diffusion and nuclear thermal +coupling problems, MultiSimDiff successfully predicts the coupling solution +using decoupled data, while the surrogate model fails in the more complex +second problem. For the thermal and mechanical analysis of the prismatic fuel +element, MultiSimDiff trained for single component prediction accurately +predicts a larger structure with 64 components, reducing the relative error by +40.3% compared to the surrogate model. + +
+
+
 comment: 30 pages, 13 figures
+
+
+
+
+
+ + ☆ DeepFEA: Deep Learning for Prediction of Transient Finite Element + Analysis Solutions + + +
+ Finite Element Analysis (FEA) is a powerful but computationally intensive +method for simulating physical phenomena. Recent advancements in machine +learning have led to surrogate models capable of accelerating FEA. Yet there +are still limitations in developing surrogates of transient FEA models that can +simultaneously predict the solutions for both nodes and elements with +applicability on both the 2D and 3D domains. Motivated by this research gap, +this study proposes DeepFEA, a deep learning-based framework that leverages a +multilayer Convolutional Long Short-Term Memory (ConvLSTM) network branching +into two parallel convolutional neural networks to predict the solutions for +both nodes and elements of FEA models. The proposed network is optimized using +a novel adaptive learning algorithm, called Node-Element Loss Optimization +(NELO). NELO minimizes the error occurring at both branches of the network +enabling the prediction of solutions for transient FEA simulations. The +experimental evaluation of DeepFEA is performed on three datasets in the +context of structural mechanics, generated to serve as publicly available +reference datasets. The results show that DeepFEA can achieve less than 3% +normalized mean and root mean squared error for 2D and 3D simulation scenarios, +and inference times that are two orders of magnitude faster than FEA. In +contrast, relevant state-of-the-art methods face challenges with +multi-dimensional output and dynamic input prediction. Furthermore, DeepFEA's +robustness was demonstrated in a real-life biomedical scenario, confirming its +suitability for accurate and efficient predictions of FEA simulations. + +
+
+ comment: This work has been submitted to a journal for possible publication +
+
+
+
+
+ + ☆ Missing Melodies: AI Music Generation and its "Nearly" Complete Omission + of the Global South + + +
+ Recent advances in generative AI have sparked renewed interest and expanded
+possibilities for music generation. However, the performance and versatility of
+these systems across musical genres are heavily influenced by the availability
+of training data. We conducted an extensive analysis of over one million hours
+of audio datasets used in AI music generation research and manually reviewed
+more than 200 papers from eleven prominent AI and music conferences and
+organizations (AAAI, ACM, EUSIPCO, EURASIP, ICASSP, ICML, IJCAI, ISMIR,
+NeurIPS, NIME, SMC) to identify a critical gap in the fair representation and
+inclusion of the musical genres of the Global South in AI research. Our
+findings reveal a stark imbalance: approximately 86% of the total dataset hours
+and over 93% of researchers focus primarily on music from the Global North.
+However, while around 40% of these datasets include some form of non-Western
+music, genres from the Global South account for only 14.6% of the data.
+Furthermore, approximately 51% of the papers surveyed concentrate on symbolic
+music generation, a method that often fails to capture the cultural nuances
+inherent in music from regions such as South Asia, the Middle East, and Africa.
+As AI increasingly shapes the creation and dissemination of music, the
+significant underrepresentation of music genres in datasets and research
+presents a serious threat to global musical diversity. We also propose some
+important steps to mitigate these risks and foster a more inclusive future for
+AI-driven music generation.
+
+
+
+ comment: Submitted to CACM, 12 pages, 2 figures +
+
+
+
+
+ + ☆ HyperFLINT: Hypernetwork-based Flow Estimation and Temporal + Interpolation for Scientific Ensemble Visualization + + +
+ We present HyperFLINT (Hypernetwork-based FLow estimation and temporal +INTerpolation), a novel deep learning-based approach for estimating flow +fields, temporally interpolating scalar fields, and facilitating parameter +space exploration in spatio-temporal scientific ensemble data. This work +addresses the critical need to explicitly incorporate ensemble parameters into +the learning process, as traditional methods often neglect these, limiting +their ability to adapt to diverse simulation settings and provide meaningful +insights into the data dynamics. HyperFLINT introduces a hypernetwork to +account for simulation parameters, enabling it to generate accurate +interpolations and flow fields for each timestep by dynamically adapting to +varying conditions, thereby outperforming existing parameter-agnostic +approaches. The architecture features modular neural blocks with convolutional +and deconvolutional layers, supported by a hypernetwork that generates weights +for the main network, allowing the model to better capture intricate simulation +dynamics. A series of experiments demonstrates HyperFLINT's significantly +improved performance in flow field estimation and temporal interpolation, as +well as its potential in enabling parameter space exploration, offering +valuable insights into complex scientific ensembles. + +
+
+
+
+
+ + ☆ Learnable Similarity and Dissimilarity Guided Symmetric Non-Negative + Matrix Factorization + + +
+ Symmetric nonnegative matrix factorization (SymNMF) is a powerful tool for
+clustering, which typically uses the $k$-nearest neighbor ($k$-NN) method to
+construct similarity matrix. However, $k$-NN may mislead clustering since the
+neighbors may belong to different clusters, and its reliability generally
+decreases as $k$ grows. In this paper, we construct the similarity matrix as a
+weighted $k$-NN graph with learnable weight that reflects the reliability of
+each $k$-th NN. This approach reduces the search space of the similarity matrix
+learning to $n - 1$ dimension, as opposed to the $\mathcal{O}(n^2)$ dimension
+of existing methods, where $n$ represents the number of samples. Moreover, to
+obtain a discriminative similarity matrix, we introduce a dissimilarity matrix
+with a dual structure of the similarity matrix, and propose a new form of
+orthogonality regularization with discussions on its geometric interpretation
+and numerical stability. An efficient alternative optimization algorithm is
+designed to solve the proposed model, with theoretical guarantee that the
+variables converge to a stationary point that satisfies the KKT conditions. The
+advantage of the proposed model is demonstrated by the comparison with nine
+state-of-the-art clustering methods on eight datasets. The code is available at
+https://github.com/lwl-learning/LSDGSymNMF.
+
+
+
+ comment: 12 pages, 14 figures +
+
+
+
+
+ + ☆ Federated Learning in Mobile Networks: A Comprehensive Case Study on + Traffic Forecasting + + +
+ The increasing demand for efficient resource allocation in mobile networks +has catalyzed the exploration of innovative solutions that could enhance the +task of real-time cellular traffic prediction. Under these circumstances, +federated learning (FL) stands out as a distributed and privacy-preserving +solution to foster collaboration among different sites, thus enabling +responsive near-the-edge solutions. In this paper, we comprehensively study the +potential benefits of FL in telecommunications through a case study on +federated traffic forecasting using real-world data from base stations (BSs) in +Barcelona (Spain). Our study encompasses relevant aspects within the federated +experience, including model aggregation techniques, outlier management, the +impact of individual clients, personalized learning, and the integration of +exogenous sources of data. The performed evaluation is based on both prediction +accuracy and sustainability, thus showcasing the environmental impact of +employed FL algorithms in various settings. The findings from our study +highlight FL as a promising and robust solution for mobile traffic prediction, +emphasizing its twin merits as a privacy-conscious and environmentally +sustainable approach, while also demonstrating its capability to overcome data +heterogeneity and ensure high-quality predictions, marking a significant stride +towards its integration in mobile traffic management systems. + +
+
+
+
+
+ + ☆ Towards Generalizable Autonomous Penetration Testing via Domain + Randomization and Meta-Reinforcement Learning + + +
+ With increasing numbers of vulnerabilities exposed on the internet, +autonomous penetration testing (pentesting) has emerged as an emerging research +area, while reinforcement learning (RL) is a natural fit for studying +autonomous pentesting. Previous research in RL-based autonomous pentesting +mainly focused on enhancing agents' learning efficacy within abstract simulated +training environments. They overlooked the applicability and generalization +requirements of deploying agents' policies in real-world environments that +differ substantially from their training settings. In contrast, for the first +time, we shift focus to the pentesting agents' ability to generalize across +unseen real environments. For this purpose, we propose a Generalizable +Autonomous Pentesting framework (namely GAP) for training agents capable of +drawing inferences from one to another -- a key requirement for the broad +application of autonomous pentesting and a hallmark of human intelligence. GAP +introduces a Real-to-Sim-to-Real pipeline with two key methods: domain +randomization and meta-RL learning. Specifically, we are among the first to +apply domain randomization in autonomous pentesting and propose a large +language model-powered domain randomization method for synthetic environment +generation. We further apply meta-RL to improve the agents' generalization +ability in unseen environments by leveraging the synthetic environments. The +combination of these two methods can effectively bridge the generalization gap +and improve policy adaptation performance. Experiments are conducted on various +vulnerable virtual machines, with results showing that GAP can (a) enable +policy learning in unknown real environments, (b) achieve zero-shot policy +transfer in similar environments, and (c) realize rapid policy adaptation in +dissimilar environments. + +
+
+ comment: This work has been submitted to the IEEE for possible publication +
+
+
+
+
+ + ☆ Distance-Adaptive Quaternion Knowledge Graph Embedding with + Bidirectional Rotation COLING 2025 + + +
+ Quaternion contains one real part and three imaginary parts, which provided a +more expressive hypercomplex space for learning knowledge graph. Existing +quaternion embedding models measure the plausibility of a triplet either +through semantic matching or geometric distance scoring functions. However, it +appears that semantic matching diminishes the separability of entities, while +the distance scoring function weakens the semantics of entities. To address +this issue, we propose a novel quaternion knowledge graph embedding model. Our +model combines semantic matching with entity's geometric distance to better +measure the plausibility of triplets. Specifically, in the quaternion space, we +perform a right rotation on head entity and a reverse rotation on tail entity +to learn rich semantic features. Then, we utilize distance adaptive +translations to learn geometric distance between entities. Furthermore, we +provide mathematical proofs to demonstrate our model can handle complex logical +relationships. Extensive experimental results and analyses show our model +significantly outperforms previous models on well-known knowledge graph +completion benchmark datasets. Our code is available at +https://github.com/llqy123/DaBR. + +
+
+ comment: Accepted by COLING 2025 +
+
+
+
+
+ + ☆ Integrated Sensing and Communications for Low-Altitude Economy: A Deep + Reinforcement Learning Approach + + +
+ This paper studies an integrated sensing and communications (ISAC) system for +low-altitude economy (LAE), where a ground base station (GBS) provides +communication and navigation services for authorized unmanned aerial vehicles +(UAVs), while sensing the low-altitude airspace to monitor the unauthorized +mobile target. The expected communication sum-rate over a given flight period +is maximized by jointly optimizing the beamforming at the GBS and UAVs' +trajectories, subject to the constraints on the average signal-to-noise ratio +requirement for sensing, the flight mission and collision avoidance of UAVs, as +well as the maximum transmit power at the GBS. Typically, this is a sequential +decision-making problem with the given flight mission. Thus, we transform it to +a specific Markov decision process (MDP) model called episode task. Based on +this modeling, we propose a novel LAE-oriented ISAC scheme, referred to as Deep +LAE-ISAC (DeepLSC), by leveraging the deep reinforcement learning (DRL) +technique. In DeepLSC, a reward function and a new action selection policy +termed constrained noise-exploration policy are judiciously designed to fulfill +various constraints. To enable efficient learning in episode tasks, we develop +a hierarchical experience replay mechanism, where the gist is to employ all +experiences generated within each episode to jointly train the neural network. +Besides, to enhance the convergence speed of DeepLSC, a symmetric experience +augmentation mechanism, which simultaneously permutes the indexes of all +variables to enrich available experience sets, is proposed. Simulation results +demonstrate that compared with benchmarks, DeepLSC yields a higher sum-rate +while meeting the preset constraints, achieves faster convergence, and is more +robust against different settings. + +
+
+ comment: submitted for an IEEE publication +
+
+
+
+
+ + ☆ Boundary-Guided Learning for Gene Expression Prediction in Spatial + Transcriptomics + + +
+ Spatial transcriptomics (ST) has emerged as an advanced technology that +provides spatial context to gene expression. Recently, deep learning-based +methods have shown the capability to predict gene expression from WSI data +using ST data. Existing approaches typically extract features from images and +the neighboring regions using pretrained models, and then develop methods to +fuse this information to generate the final output. However, these methods +often fail to account for the cellular structure similarity, cellular density +and the interactions within the microenvironment. In this paper, we propose a +framework named BG-TRIPLEX, which leverages boundary information extracted from +pathological images as guiding features to enhance gene expression prediction +from WSIs. Specifically, our model consists of three branches: the spot, +in-context and global branches. In the spot and in-context branches, boundary +information, including edge and nuclei characteristics, is extracted using +pretrained models. These boundary features guide the learning of cellular +morphology and the characteristics of microenvironment through Multi-Head +Cross-Attention. Finally, these features are integrated with global features to +predict the final output. Extensive experiments were conducted on three public +ST datasets. The results demonstrate that our BG-TRIPLEX consistently +outperforms existing methods in terms of Pearson Correlation Coefficient (PCC). +This method highlights the crucial role of boundary features in understanding +the complex interactions between WSI and gene expression, offering a promising +direction for future research. + +
+
+ comment: 8 pages, 5 figures +
+
+
+
+
+ + ☆ Space to Policy: Scalable Brick Kiln Detection and Automatic Compliance + Monitoring with Geospatial Data + + +
+ Air pollution kills 7 million people annually. The brick kiln sector +significantly contributes to economic development but also accounts for 8-14\% +of air pollution in India. Policymakers have implemented compliance measures to +regulate brick kilns. Emission inventories are critical for air quality +modeling and source apportionment studies. However, the largely unorganized +nature of the brick kiln sector necessitates labor-intensive survey efforts for +monitoring. Recent efforts by air quality researchers have relied on manual +annotation of brick kilns using satellite imagery to build emission +inventories, but this approach lacks scalability. Machine-learning-based object +detection methods have shown promise for detecting brick kilns; however, +previous studies often rely on costly high-resolution imagery and fail to +integrate with governmental policies. In this work, we developed a scalable +machine-learning pipeline that detected and classified 30638 brick kilns across +five states in the Indo-Gangetic Plain using free, moderate-resolution +satellite imagery from Planet Labs. Our detections have a high correlation with +on-ground surveys. We performed automated compliance analysis based on +government policies. In the Delhi airshed, stricter policy enforcement has led +to the adoption of efficient brick kiln technologies. This study highlights the +need for inclusive policies that balance environmental sustainability with the +livelihoods of workers. + +
+
+
+
+
+ + ☆ Graph Neural Networks Need Cluster-Normalize-Activate Modules NeurIPS 2024 + + +
+ Graph Neural Networks (GNNs) are non-Euclidean deep learning models for +graph-structured data. Despite their successful and diverse applications, +oversmoothing prohibits deep architectures due to node features converging to a +single fixed point. This severely limits their potential to solve complex +tasks. To counteract this tendency, we propose a plug-and-play module +consisting of three steps: Cluster-Normalize-Activate (CNA). By applying CNA +modules, GNNs search and form super nodes in each layer, which are normalized +and activated individually. We demonstrate in node classification and property +prediction tasks that CNA significantly improves the accuracy over the +state-of-the-art. Particularly, CNA reaches 94.18% and 95.75% accuracy on Cora +and CiteSeer, respectively. It further benefits GNNs in regression tasks as +well, reducing the mean squared error compared to all baselines. At the same +time, GNNs with CNA require substantially fewer learnable parameters than +competing architectures. + +
+
+ comment: 17 pages, 6 figures, 6 tables, accepted at NeurIPS 2024 +
+
+
+
+
+ + ☆ Pathwise optimization for bridge-type estimators and its applications + + +
+ Sparse parametric models are of great interest in statistical learning and
+are often analyzed by means of regularized estimators. Pathwise methods allow
+to efficiently compute the full solution path for penalized estimators, for any
+possible value of the penalization parameter $\lambda$. In this paper we deal
+with the pathwise optimization for bridge-type problems; i.e. we are interested
+in the minimization of a loss function, such as negative log-likelihood or
+residual sum of squares, plus the sum of $\ell^q$ norms with $q\in(0,1]$
+involving adaptive coefficients. For some loss functions this regularization
+achieves asymptotically the oracle properties (such as the selection
+consistency). Nevertheless, since the objective function involves nonconvex and
+nondifferentiable terms, the minimization problem is computationally
+challenging.
+ The aim of this paper is to apply some general algorithms, arising from
+nonconvex optimization theory, to compute efficiently the path solutions for
+the adaptive bridge estimator with multiple penalties. In particular, we take
+into account two different approaches: accelerated proximal gradient descent
+and blockwise alternating optimization. The convergence and the path
+consistency of these algorithms are discussed. In order to assess our methods,
+we apply these algorithms to the penalized estimation of diffusion processes
+observed at discrete times. This latter represents a recent research topic in
+the field of statistics for time-dependent data.
+
+
+
+
+
+
+ + ☆ AI4EF: Artificial Intelligence for Energy Efficiency in the Building + Sector + + +
+ AI4EF, Artificial Intelligence for Energy Efficiency, is an advanced, +user-centric tool designed to support decision-making in building energy +retrofitting and efficiency optimization. Leveraging machine learning (ML) and +data-driven insights, AI4EF enables stakeholders such as public sector +representatives, energy consultants, and building owners to model, analyze, and +predict energy consumption, retrofit costs, and environmental impacts of +building upgrades. Featuring a modular framework, AI4EF includes customizable +building retrofitting, photovoltaic installation assessment, and predictive +modeling tools that allow users to input building parameters and receive +tailored recommendations for achieving energy savings and carbon reduction +goals. Additionally, the platform incorporates a Training Playground for data +scientists to refine ML models used by said framework. Finally, AI4EF provides +access to the Enershare Data Space to facilitate seamless data sharing and +access within the ecosystem. Its compatibility with open-source identity +management, Keycloak, enhances security and accessibility, making it adaptable +for various regulatory and organizational contexts. This paper presents an +architectural overview of AI4EF, its application in energy efficiency +scenarios, and its potential for advancing sustainable energy practices through +artificial intelligence (AI). + +
+
+
+
+
+ + ☆ Dynamic Graph Representation with Contrastive Learning for Financial + Market Prediction: Integrating Temporal Evolution and Static Relations + + +
+ Temporal Graph Learning (TGL) is crucial for capturing the evolving nature of +stock markets. Traditional methods often ignore the interplay between dynamic +temporal changes and static relational structures between stocks. To address +this issue, we propose the Dynamic Graph Representation with Contrastive +Learning (DGRCL) framework, which integrates dynamic and static graph relations +to improve the accuracy of stock trend prediction. Our framework introduces two +key components: the Embedding Enhancement (EE) module and the Contrastive +Constrained Training (CCT) module. The EE module focuses on dynamically +capturing the temporal evolution of stock data, while the CCT module enforces +static constraints based on stock relations, refined within contrastive +learning. This dual-relation approach allows for a more comprehensive +understanding of stock market dynamics. Our experiments on two major U.S. stock +market datasets, NASDAQ and NYSE, demonstrate that DGRCL significantly +outperforms state-of-the-art TGL baselines. Ablation studies indicate the +importance of both modules. Overall, DGRCL not only enhances prediction ability +but also provides a robust framework for integrating temporal and relational +data in dynamic graphs. Code and data are available for public access. + +
+
+ comment: 12 pages, 2 figures, author manuscript accepted for ICAART 2025 + (International Conference on Agents and Artificial Intelligence) +
+
+
+
+
+ + ☆ A Note on Spectral Map + + +
+ In molecular dynamics (MD) simulations, transitions between states are often +rare events due to energy barriers that exceed the thermal temperature. Because +of their infrequent occurrence and the huge number of degrees of freedom in +molecular systems, understanding the physical properties that drive rare events +is immensely difficult. A common approach to this problem is to propose a +collective variable (CV) that describes this process by a simplified +representation. However, choosing CVs is not easy, as it often relies on +physical intuition. Machine learning (ML) techniques provide a promising +approach for effectively extracting optimal CVs from MD data. Here, we provide +a note on a recent unsupervised ML method called spectral map, which constructs +CVs by maximizing the timescale separation between slow and fast variables in +the system. + +
+
+ comment: A letter prepared for the Ensemble journal of the Molecular + Simulation Society of Japan (MSSJ) +
+
+
+
+
+ + ☆ Blind Underwater Image Restoration using Co-Operational Regressor + Networks + + +
+ The exploration of underwater environments is essential for applications such
+as biological research, archaeology, and infrastructure maintenance. However,
+underwater imaging is challenging due to the water's unique properties,
+including scattering, absorption, color distortion, and reduced visibility. To
+address such visual degradations, a variety of approaches have been proposed
+covering from basic signal processing methods to deep learning models; however,
+none of them has proven to be consistently successful. In this paper, we
+propose a novel machine learning model, Co-Operational Regressor Networks
+(CoRe-Nets), designed to achieve the best possible underwater image
+restoration. A CoRe-Net consists of two co-operating networks: the Apprentice
+Regressor (AR), responsible for image transformation, and the Master Regressor
+(MR), which evaluates the Peak Signal-to-Noise Ratio (PSNR) of the images
+generated by the AR and feeds it back to AR. CoRe-Nets are built on
+Self-Organized Operational Neural Networks (Self-ONNs), which offer a superior
+learning capability by modulating nonlinearity in kernel transformations. The
+effectiveness of the proposed model is demonstrated on the benchmark Large
+Scale Underwater Image (LSUI) dataset. Leveraging the joint learning
+capabilities of the two cooperating networks, the proposed model achieves the
+state-of-the-art restoration performance with significantly reduced
+computational complexity and often presents such results that can even surpass
+the visual quality of the ground truth with a 2-pass application. Our results
+and the optimized PyTorch implementation of the proposed approach are now
+publicly shared on GitHub.
+
+
+
+ comment: 11 pages +
+
+
+
+
+ + ☆ LaserGuider: A Laser Based Physical Backdoor Attack against Deep Neural + Networks + + +
+ Backdoor attacks embed hidden associations between triggers and targets in +deep neural networks (DNNs), causing them to predict the target when a trigger +is present while maintaining normal behavior otherwise. Physical backdoor +attacks, which use physical objects as triggers, are feasible but lack remote +control, temporal stealthiness, flexibility, and mobility. To overcome these +limitations, in this work, we propose a new type of backdoor triggers utilizing +lasers that feature long-distance transmission and instant-imaging properties. +Based on the laser-based backdoor triggers, we present a physical backdoor +attack, called LaserGuider, which possesses remote control ability and achieves +high temporal stealthiness, flexibility, and mobility. We also introduce a +systematic approach to optimize laser parameters for improving attack +effectiveness. Our evaluation on traffic sign recognition DNNs, critical in +autonomous vehicles, demonstrates that LaserGuider with three different +laser-based triggers achieves over 90% attack success rate with negligible +impact on normal inputs. Additionally, we release LaserMark, the first dataset +of real world traffic signs stamped with physical laser spots, to support +further research in backdoor attacks and defenses. + +
+
+ comment: In Proceedings of the 23rd International Conference on Applied + Cryptography and Network Security (ACNS), Munich, Germany, 23-26 June, 2025 +
+
+
+
+
+ + ☆ How well behaved is finite dimensional Diffusion Maps? + + +
+ Under a set of assumptions on a family of submanifolds $\subset {\mathbb
+R}^D$, we derive a series of geometric properties that remain valid after
+finite-dimensional and almost isometric Diffusion Maps (DM), including almost
+uniform density, finite polynomial approximation and local reach. Leveraging
+these properties, we establish rigorous bounds showing that the embedding error
+introduced by the DM algorithm is $O\left((\frac{\log
+n}{n})^{\frac{1}{8d+16}}\right)$. These results offer a solid theoretical
+foundation for understanding the performance and reliability of DM in practical
+applications.
+
+
+
+ comment: 20 pages, 3 figures +
+
+
+
+
+ + ☆ Safe and Efficient Online Convex Optimization with Linear Budget + Constraints and Partial Feedback + + +
+ This paper studies online convex optimization with unknown linear budget +constraints, where only the gradient information of the objective and the +bandit feedback of constraint functions are observed. We propose a safe and +efficient Lyapunov-optimization algorithm (SELO) that can achieve an +$O(\sqrt{T})$ regret and zero cumulative constraint violation. The result also +implies SELO achieves $O(\sqrt{T})$ regret when the budget is hard and not +allowed to be violated. The proposed algorithm is computationally efficient as +it resembles a primal-dual algorithm where the primal problem is an +unconstrained, strongly convex and smooth problem, and the dual problem has a +simple gradient-type update. The algorithm and theory are further justified in +a simulated application of energy-efficient task processing in distributed data +centers. + +
+
+
+
+
+ + ☆ Exploring Fully Convolutional Networks for the Segmentation of + Hyperspectral Imaging Applied to Advanced Driver Assistance Systems + + +
+ Advanced Driver Assistance Systems (ADAS) are designed with the main purpose
+of increasing the safety and comfort of vehicle occupants. Most current
+computer vision-based ADAS perform detection and tracking tasks quite
+successfully under regular conditions, but are not completely reliable,
+particularly under adverse weather and changing lighting conditions, nor in
+complex situations with many overlapping objects. In this work we explore the
+use of hyperspectral imaging (HSI) in ADAS on the assumption that the distinct
+near infrared (NIR) spectral reflectances of different materials can help to
+better separate the objects in a driving scene. In particular, this paper
+describes some experimental results of the application of fully convolutional
+networks (FCN) to the image segmentation of HSI for ADAS applications. More
+specifically, our aim is to investigate to what extent the spatial features
+codified by convolutional filters can be helpful to improve the performance of
+HSI segmentation systems. With that aim, we use the HSI-Drive v1.1 dataset,
+which provides a set of labelled images recorded in real driving conditions
+with a small-size snapshot NIR-HSI camera. Finally, we analyze the
+implementability of such a HSI segmentation system by prototyping the developed
+FCN model together with the necessary hyperspectral cube preprocessing stage
+and characterizing its performance on an MPSoC.
+
+
+
+ comment: arXiv admin note: text overlap with arXiv:2411.19274 +
+
+
+
+
+ + ☆ Local Curvature Smoothing with Stein's Identity for Efficient Score + Matching NeurIPS 2024 + + +
+ The training of score-based diffusion models (SDMs) is based on score +matching. The challenge of score matching is that it includes a computationally +expensive Jacobian trace. While several methods have been proposed to avoid +this computation, each has drawbacks, such as instability during training and +approximating the learning as learning a denoising vector field rather than a +true score. We propose a novel score matching variant, local curvature +smoothing with Stein's identity (LCSS). The LCSS bypasses the Jacobian trace by +applying Stein's identity, enabling regularization effectiveness and efficient +computation. We show that LCSS surpasses existing methods in sample generation +performance and matches the performance of denoising score matching, widely +adopted by most SDMs, in evaluations such as FID, Inception score, and bits per +dimension. Furthermore, we show that LCSS enables realistic image generation +even at a high resolution of $1024 \times 1024$. + +
+
+ comment: Accepted at NeurIPS 2024 +
+
+
+
+
+ + ☆ Electronic Health Records-Based Data-Driven Diabetes Knowledge Unveiling + and Risk Prognosis + + +
+ In the healthcare sector, the application of deep learning technologies has +revolutionized data analysis and disease forecasting. This is particularly +evident in the field of diabetes, where the deep analysis of Electronic Health +Records (EHR) has unlocked new opportunities for early detection and effective +intervention strategies. Our research presents an innovative model that +synergizes the capabilities of Bidirectional Long Short-Term Memory +Networks-Conditional Random Field (BiLSTM-CRF) with a fusion of XGBoost and +Logistic Regression. This model is designed to enhance the accuracy of diabetes +risk prediction by conducting an in-depth analysis of electronic medical +records data. The first phase of our approach involves employing BiLSTM-CRF to +delve into the temporal characteristics and latent patterns present in EHR +data. This method effectively uncovers the progression trends of diabetes, +which are often hidden in the complex data structures of medical records. The +second phase leverages the combined strength of XGBoost and Logistic Regression +to classify these extracted features and evaluate associated risks. This dual +approach facilitates a more nuanced and precise prediction of diabetes, +outperforming traditional models, particularly in handling multifaceted and +nonlinear medical datasets. Our research demonstrates a notable advancement in +diabetes prediction over traditional methods, showcasing the effectiveness of +our combined BiLSTM-CRF, XGBoost, and Logistic Regression model. This study +highlights the value of data-driven strategies in clinical decision-making, +equipping healthcare professionals with precise tools for early detection and +intervention. By enabling personalized treatment and timely care, our approach +signifies progress in incorporating advanced analytics in healthcare, +potentially improving outcomes for diabetes and other chronic conditions. + +
+
+ comment: 16 pages +
+
+
+
+
+ + ☆ BEFL: Balancing Energy Consumption in Federated Learning for Mobile Edge + IoT + + +
+ Federated Learning (FL) is a privacy-preserving distributed learning paradigm +designed to build a highly accurate global model. In Mobile Edge IoT (MEIoT), +the training and communication processes can significantly deplete the limited +battery resources of devices. Existing research primarily focuses on reducing +overall energy consumption, but this may inadvertently create energy +consumption imbalances, leading to the premature dropout of energy-sensitive +devices.To address these challenges, we propose BEFL, a joint optimization +framework aimed at balancing three objectives: enhancing global model accuracy, +minimizing total energy consumption, and reducing energy usage disparities +among devices. First, taking into account the communication constraints of +MEIoT and the heterogeneity of devices, we employed the Sequential Least +Squares Programming (SLSQP) algorithm for the rational allocation of +communication resources. Based on this, we introduce a heuristic client +selection algorithm that combines cluster partitioning with utility-driven +approaches to alleviate both the total energy consumption of all devices and +the discrepancies in energy usage.Furthermore, we utilize the proposed +heuristic client selection algorithm as a template for offline imitation +learning during pre-training, while adopting a ranking-based reinforcement +learning approach online to further boost training efficiency. Our experiments +reveal that BEFL improves global model accuracy by 1.6\%, reduces energy +consumption variance by 72.7\%, and lowers total energy consumption by 28.2\% +compared to existing methods. The relevant code can be found at +\href{URL}{https://github.com/juzehao/BEFL}. + +
+
+
+
+
+ + ☆ Learning Speed-Adaptive Walking Agent Using Imitation Learning with + Physics-Informed Simulation + + +
+ Virtual models of human gait, or digital twins, offer a promising solution +for studying mobility without the need for labor-intensive data collection. +However, challenges such as the sim-to-real gap and limited adaptability to +diverse walking conditions persist. To address these, we developed and +validated a framework to create a skeletal humanoid agent capable of adapting +to varying walking speeds while maintaining biomechanically realistic motions. +The framework combines a synthetic data generator, which produces +biomechanically plausible gait kinematics from open-source biomechanics data, +and a training system that uses adversarial imitation learning to train the +agent's walking policy. We conducted comprehensive analyses comparing the +agent's kinematics, synthetic data, and the original biomechanics dataset. The +agent achieved a root mean square error of 5.24 +- 0.09 degrees at varying +speeds compared to ground-truth kinematics data, demonstrating its +adaptability. This work represents a significant step toward developing a +digital twin of human locomotion, with potential applications in biomechanics +research, exoskeleton design, and rehabilitation. + +
+
+ comment: Currently under review +
+
+
+
+
+ + ☆ JANUS: A Difference-Oriented Analyzer For Financial Centralization Risks + in Smart Contracts + + +
+ Some smart contracts violate decentralization principles by defining
+privileged accounts that manage other users' assets without permission,
+introducing centralization risks that have caused financial losses. Existing
+methods, however, face challenges in accurately detecting diverse
+centralization risks due to their dependence on predefined behavior patterns.
+In this paper, we propose JANUS, an automated analyzer for Solidity smart
+contracts that detects financial centralization risks independently of their
+specific behaviors. JANUS identifies differences between states reached by
+privileged and ordinary accounts, and analyzes whether these differences are
+finance-related. Focusing on the impact of risks rather than behaviors, JANUS
+achieves improved accuracy compared to existing tools and can uncover
+centralization risks with unknown patterns.
+ To evaluate JANUS's performance, we compare it with other tools using a
+dataset of 540 contracts. Our evaluation demonstrates that JANUS outperforms
+representative tools in terms of detection accuracy for financial
+centralization risks. Additionally, we evaluate JANUS on a real-world dataset
+of 33,151 contracts, successfully identifying two types of risks that other
+tools fail to detect. We also prove that the state traversal method and
+variable summaries, which are used in JANUS to reduce the number of states to
+be compared, do not introduce false alarms or omissions in detection.
+
+
+
+
+
+
+ + ☆ Deep Learning Modeling Method for RF Devices Based on Uniform Noise + Training Set + + +
+ As the scale and complexity of integrated circuits continue to increase, +traditional modeling methods are struggling to address the nonlinear challenges +in radio frequency (RF) chips. Deep learning has been increasingly applied to +RF device modeling. This paper proposes a deep learning-based modeling method +for RF devices using a uniform noise training set, aimed at modeling and +fitting the nonlinear characteristics of RF devices. We hypothesize that a +uniform noise signal can encompass the full range of characteristics across +both frequency and amplitude, and that a deep learning model can effectively +capture and learn these features. Based on this hypothesis, the paper designs a +complete integrated circuit modeling process based on measured data, including +data collection, processing, and neural network training. The proposed method +is experimentally validated using the RF amplifier PW210 as a case study. +Experimental results show that the uniform noise training set allows the model +to capture the nonlinear characteristics of RF devices, and the trained model +can predict waveform patterns it has never encountered before. The proposed +deep learning-based RF device modeling method, using a uniform noise training +set, demonstrates strong generalization capability and excellent training +performance, offering high practical application value. + +
+
+ comment: 9 pages,11 figures +
+
+
+
+
+ + ☆ Exploring AI Text Generation, Retrieval-Augmented Generation, and + Detection Technologies: a Comprehensive Overview + + +
+ The rapid development of Artificial Intelligence (AI) has led to the creation +of powerful text generation models, such as large language models (LLMs), which +are widely used for diverse applications. However, concerns surrounding +AI-generated content, including issues of originality, bias, misinformation, +and accountability, have become increasingly prominent. This paper offers a +comprehensive overview of AI text generators (AITGs), focusing on their +evolution, capabilities, and ethical implications. This paper also introduces +Retrieval-Augmented Generation (RAG), a recent approach that improves the +contextual relevance and accuracy of text generation by integrating dynamic +information retrieval. RAG addresses key limitations of traditional models, +including their reliance on static knowledge and potential inaccuracies in +handling real-world data. Additionally, the paper reviews detection tools that +help differentiate AI-generated text from human-written content and discusses +the ethical challenges these technologies pose. The paper explores future +directions for improving detection accuracy, supporting ethical AI development, +and increasing accessibility. The paper contributes to a more responsible and +reliable use of AI in content creation through these discussions. + +
+
+
+
+
+ + ☆ MT3DNet: Multi-Task learning Network for 3D Surgical Scene + Reconstruction + + +
+ In image-assisted minimally invasive surgeries (MIS), understanding surgical +scenes is vital for real-time feedback to surgeons, skill evaluation, and +improving outcomes through collaborative human-robot procedures. Within this +context, the challenge lies in accurately detecting, segmenting, and estimating +the depth of surgical scenes depicted in high-resolution images, while +simultaneously reconstructing the scene in 3D and providing segmentation of +surgical instruments along with detection labels for each instrument. To +address this challenge, a novel Multi-Task Learning (MTL) network is proposed +for performing these tasks concurrently. A key aspect of this approach involves +overcoming the optimization hurdles associated with handling multiple tasks +concurrently by integrating a Adversarial Weight Update into the MTL framework, +the proposed MTL model achieves 3D reconstruction through the integration of +segmentation, depth estimation, and object detection, thereby enhancing the +understanding of surgical scenes, which marks a significant advancement +compared to existing studies that lack 3D capabilities. Comprehensive +experiments on the EndoVis2018 benchmark dataset underscore the adeptness of +the model in efficiently addressing all three tasks, demonstrating the efficacy +of the proposed techniques. + +
+
+
+
+
+ + ☆ MegaCOIN: Enhancing Medium-Grained Color Perception for Vision-Language + Models + + +
+ In vision-language models (VLMs), the ability to perceive and interpret color +and physical environment is crucial for achieving contextually accurate +understanding and interaction. However, despite advances in multimodal +modeling, there remains a significant lack of specialized datasets that +rigorously evaluate a model's capacity to discern subtle color variations and +spatial context -- critical elements for situational comprehension and reliable +deployment across real-world applications. Toward that goal, we curate +MegaCOIN, a high-quality, human-labeled dataset based on \emph{real} images +with various contextual attributes. MegaCOIN consists of two parts: +MegaCOIN-Instruct, which serves as a supervised fine-tuning (SFT) dataset for +VLMs; and MegaCOIN-Bench, an annotated test set that can be used as a +stand-alone QA dataset. MegaCOIN~provides three annotated features for 220,000 +real images: foreground color, background color, and description of an object's +physical environment, constituting 660k human annotations. In addition, +MegaCOIN can be applied to benchmark domain generalization (DG) algorithms. We +explore benchmarking DG methods in the linear probing setup for VLM and show +some new insights. Last but not least, we show that VLMs, including GPT-4o, +have subpar color recognition capabilities, and fine-tuning with MegaCOIN can +result in improved performance on visual evaluation tasks. In certain cases, +MegaCOIN fine-tuned small-scale opensource models such as LLaVA and Bunny can +outperform closed-source GPT-4o. We hope the utilities of MegaCOIN can shed +light on the directions VLMs can improve and provide a more complex platform +for domain generalization algorithms. + +
+
+ comment: 8 pages, 13 tables, 2 figures +
+
+
+
+
+ + ♻ ☆ A method to benchmark high-dimensional process drift detection + + +
+ Process curves are multivariate finite time series data coming from +manufacturing processes. This paper studies machine learning that detect drifts +in process curve datasets. A theoretic framework to synthetically generate +process curves in a controlled way is introduced in order to benchmark machine +learning algorithms for process drift detection. An evaluation score, called +the temporal area under the curve, is introduced, which allows to quantify how +well machine learning models unveil curves belonging to drift segments. +Finally, a benchmark study comparing popular machine learning approaches on +synthetic data generated with the introduced framework is presented that shows +that existing algorithms often struggle with datasets containing multiple drift +segments. + +
+
+
+
+
+ + ♻ ☆ Masked Autoencoders are PDE Learners + + +
+ Neural solvers for partial differential equations (PDEs) have great potential +to generate fast and accurate physics solutions, yet their practicality is +currently limited by their generalizability. PDEs evolve over broad scales and +exhibit diverse behaviors; predicting these phenomena will require learning +representations across a wide variety of inputs which may encompass different +coefficients, boundary conditions, resolutions, or even equations. As a step +towards generalizable PDE modeling, we adapt masked pretraining for physics +problems. Through self-supervised learning across PDEs, masked autoencoders can +consolidate heterogeneous physics to learn rich latent representations. We show +that learned representations can generalize to a limited set of unseen +equations or parameters and are meaningful enough to regress PDE coefficients +or the classify PDE features. Furthermore, conditioning neural solvers on +learned latent representations can improve time-stepping and super-resolution +performance across a variety of coefficients, discretizations, or boundary +conditions, as well as on certain unseen PDEs. We hope that masked pretraining +can emerge as a unifying method across large, unlabeled, and heterogeneous +datasets to learn latent physics at scale. + +
+
+ comment: 29 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ SmallToLarge (S2L): Scalable Data Selection for Fine-tuning Large + Language Models by Summarizing Training Trajectories of Small Models + + +
+ Despite the effectiveness of data selection for large language models (LLMs) +during pretraining and instruction fine-tuning phases, improving data +efficiency in supervised fine-tuning (SFT) for specialized domains poses +significant challenges due to the complexity of fine-tuning data. To bridge +this gap, we introduce an effective and scalable data selection method for SFT, +SmallToLarge (S2L), which leverages training trajectories from small models to +guide the data selection for larger models. We demonstrate through extensive +experiments that S2L significantly improves data efficiency in SFT for +mathematical problem-solving, reducing the training data to just 11% of the +original MathInstruct dataset (Yue et al., 2023) to match full dataset +performance while outperforming state-of-the-art data selection algorithms by +an average of 4.7% across 6 in- and out-domain evaluation datasets. Remarkably, +selecting only 50K data for SFT, S2L achieves a 32.7% accuracy on the most +challenging MATH (Hendrycks et al., 2021) benchmark, improving Phi-2 (Li et +al., 2023b) by 16.6%. In clinical text summarization on the MIMIC-III dataset +(Johnson et al., 2016), S2L again outperforms training on the full dataset +using only 50% of the data. Notably, S2L can perform data selection using a +reference model 40x smaller than the target model, proportionally reducing the +cost of data selection. + +
+
+
+
+
+ + ♻ ☆ Negative Token Merging: Image-based Adversarial Feature Guidance + + +
+ Text-based adversarial guidance using a negative prompt has emerged as a +widely adopted approach to steer diffusion models away from producing undesired +concepts. While useful, performing adversarial guidance using text alone can be +insufficient to capture complex visual concepts or avoid specific visual +elements like copyrighted characters. In this paper, for the first time we +explore an alternate modality in this direction by performing adversarial +guidance directly using visual features from a reference image or other images +in a batch. We introduce negative token merging (NegToMe), a simple but +effective training-free approach which performs adversarial guidance through +images by selectively pushing apart matching visual features between reference +and generated images during the reverse diffusion process. By simply adjusting +the used reference, NegToMe enables a diverse range of applications. Notably, +when using other images in same batch as reference, we find that NegToMe +significantly enhances output diversity (e.g., racial, gender, visual) by +guiding features of each image away from others. Similarly, when used w.r.t. +copyrighted reference images, NegToMe reduces visual similarity to copyrighted +content by 34.57%. NegToMe is simple to implement using just few-lines of code, +uses only marginally higher (<4%) inference time and is compatible with +different diffusion architectures, including those like Flux, which don't +natively support the use of a negative prompt. Code is available at +https://negtome.github.io + +
+
+
+
+
+ + ♻ ☆ WaveletGPT: Wavelets Meet Large Language Models + + +
+ Large Language Models (LLMs) have ushered in a new wave of artificial +intelligence advancements impacting every scientific field and discipline. They +are trained on a simple objective: to predict the next token given the previous +context. We live in a world where most of the data around us, e.g., text, +audio, and music, has a multi-scale structure associated with it. This paper +infuses LLMs with traditional signal processing ideas, namely wavelets, during +pre-training to take advantage of the structure. Without adding \textbf{any +extra parameters} to a GPT-style LLM architecture, we achieve the same +pre-training performance almost twice as fast in text, raw audio, and symbolic +music. This is achieved by imposing a structure on intermediate embeddings. +When trained for the same number of training steps, we achieve significant +gains in performance, which is comparable to pre-training a larger neural +architecture. Our architecture allows every next token prediction access to +intermediate embeddings at different temporal resolutions in every Transformer +decoder block. This work will hopefully pave the way for incorporating +multi-rate signal processing ideas into traditional LLM pre-training. Further, +we showcase pushing model performance by improving internal structure instead +of just going after scale. + +
+
+ comment: 16 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Learning to Reconstruct Accelerated MRI Through K-space Cold Diffusion + without Noise + + +
+ Deep learning-based MRI reconstruction models have achieved superior +performance these days. Most recently, diffusion models have shown remarkable +performance in image generation, in-painting, super-resolution, image editing +and more. As a generalized diffusion model, cold diffusion further broadens the +scope and considers models built around arbitrary image transformations such as +blurring, down-sampling, etc. In this paper, we propose a k-space cold +diffusion model that performs image degradation and restoration in k-space +without the need for Gaussian noise. We provide comparisons with multiple deep +learning-based MRI reconstruction models and perform tests on a well-known +large open-source MRI dataset. Our results show that this novel way of +performing degradation can generate high-quality reconstruction images for +accelerated MRI. + +
+
+ comment: 21 pages, 5 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ Regularization by Neural Style Transfer for MRI Field-Transfer + Reconstruction with Limited Data + + +
+ Recent advances in MRI reconstruction have achieved remarkable success with +deep learning-based models. However, most methods depend on large-scale, +task-specific datasets, leaving reconstruction in data-limited settings as a +critical but underexplored challenge. Regularization by denoising (RED) is a +general pipeline that incorporates a denoiser as a prior for image +reconstruction, showing promising results in various image processing tasks, +including denoising, deblurring, and super-resolution. In this work, we propose +a regularization by neural style transfer (RNST) method to further leverage the +priors from the neural transfer and denoising engine. RNST effectively +reconstructs high-quality images from noisy, low-quality inputs across varying +image styles, even with limited data. We validate RNST on clinical MRI scans, +demonstrating its ability to significantly improve image quality. These +findings underline the potential of RNST for MRI field-transfer reconstruction +and its promise in addressing reconstruction tasks in data-constrained +scenarios. + +
+
+ comment: 31 pages, 9 figures, 3 tables, 1 algorithm chart +
+
+
+
+
+ + ♻ ☆ Don't Be So Positive: Negative Step Sizes in Second-Order Methods + + +
+ The value of second-order methods lies in the use of curvature information. +Yet, this information is costly to extract and once obtained, valuable negative +curvature information is often discarded so that the method is globally +convergent. This limits the effectiveness of second-order methods in modern +machine learning. In this paper, we show that second-order and +second-order-like methods are promising optimizers for neural networks provided +that we add one ingredient: negative step sizes. We show that under very +general conditions, methods that produce ascent directions are globally +convergent when combined with a Wolfe line search that allows both positive and +negative step sizes. We experimentally demonstrate that using negative step +sizes is often more effective than common Hessian modification methods. + +
+
+ comment: added affiliation and more references +
+
+
+
+
+ + ♻ ☆ GeoPos: A Minimal Positional Encoding for Enhanced Fine-Grained Details + in Image Synthesis Using Convolutional Neural Networks WACV 2025 + + +
+ The enduring inability of image generative models to recreate intricate +geometric features, such as those present in human hands and fingers has been +an ongoing problem in image generation for nearly a decade. While strides have +been made by increasing model sizes and diversifying training datasets, this +issue remains prevalent across all models, from denoising diffusion models to +Generative Adversarial Networks (GAN), pointing to a fundamental shortcoming in +the underlying architectures. In this paper, we demonstrate how this problem +can be mitigated by augmenting convolution layers geometric capabilities +through providing them with a single input channel incorporating the relative +n-dimensional Cartesian coordinate system. We show this drastically improves +quality of images generated by Diffusion Models, GANs, and Variational +AutoEncoders (VAE). + +
+
+ comment: Accepted at WACV 2025. Contains 19 pages, 15 figures, and 9 tables +
+
+
+
+
+ + ♻ ☆ Is uniform expressivity too restrictive? Towards efficient expressivity + of graph neural networks + + +
+ Uniform expressivity guarantees that a Graph Neural Network (GNN) can express +a query without the parameters depending on the size of the input graphs. This +property is desirable in applications in order to have number of trainable +parameters that is independent of the size of the input graphs. Uniform +expressivity of the two variable guarded fragment (GC2) of first order logic is +a well-celebrated result for Rectified Linear Unit (ReLU) GNNs [Barcelo & al., +2020]. In this article, we prove that uniform expressivity of GC2 queries is +not possible for GNNs with a wide class of Pfaffian activation functions +(including the sigmoid and tanh), answering a question formulated by [Grohe, +2021]. We also show that despite these limitations, many of those GNNs can +still efficiently express GC2 queries in a way that the number of parameters +remains logarithmic on the maximal degree of the input graphs. Furthermore, we +demonstrate that a log-log dependency on the degree is achievable for a certain +choice of activation function. This shows that uniform expressivity can be +successfully relaxed by covering large graphs appearing in practical +applications. Our experiments illustrates that our theoretical estimates hold +in practice. + +
+
+
+
+
+ + ♻ ☆ Introducing the Large Medical Model: State of the art healthcare cost + and risk prediction with transformers trained on patient event sequences + + +
+ With U.S. healthcare spending approaching $5T (NHE Fact Sheet 2024), and 25% +of it estimated to be wasteful (Waste in the US the health care system: +estimated costs and potential for savings, n.d.), the need to better predict +risk and optimal patient care is evermore important. This paper introduces the +Large Medical Model (LMM), a generative pre-trained transformer (GPT) designed +to guide and predict the broad facets of patient care and healthcare +administration. The model is trained on medical event sequences from over 140M +longitudinal patient claims records with a specialized vocabulary built from +medical terminology systems and demonstrates a superior capability to forecast +healthcare costs and identify potential risk factors. Through experimentation +and validation, we showcase the LMM's proficiency in not only in cost and risk +predictions, but also in discerning intricate patterns within complex medical +conditions and an ability to identify novel relationships in patient care. The +LMM is able to improve both cost prediction by 14.1% over the best commercial +models and chronic conditions prediction by 1.9% over the best transformer +models in research predicting a broad set of conditions. The LMM is a +substantial advancement in healthcare analytics, offering the potential to +significantly enhance risk assessment, cost management, and personalized +medicine. + +
+
+ comment: 10 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Limit Theorems for Stochastic Gradient Descent with Infinite Variance + + +
+ Stochastic gradient descent is a classic algorithm that has gained great +popularity especially in the last decades as the most common approach for +training models in machine learning. While the algorithm has been well-studied +when stochastic gradients are assumed to have a finite variance, there is +significantly less research addressing its theoretical properties in the case +of infinite variance gradients. In this paper, we establish the asymptotic +behavior of stochastic gradient descent in the context of infinite variance +stochastic gradients, assuming that the stochastic gradient is regular varying +with index $\alpha\in(1,2)$. The closest result in this context was established +in 1969 , in the one-dimensional case and assuming that stochastic gradients +belong to a more restrictive class of distributions. We extend it to the +multidimensional case, covering a broader class of infinite variance +distributions. As we show, the asymptotic distribution of the stochastic +gradient descent algorithm can be characterized as the stationary distribution +of a suitably defined Ornstein-Uhlenbeck process driven by an appropriate +stable L\'evy process. Additionally, we explore the applications of these +results in linear regression and logistic regression models. + +
+
+
+
+
+ + ♻ ☆ A Fisher-Rao gradient flow for entropy-regularised Markov decision + processes in Polish spaces + + +
+ We study the global convergence of a Fisher-Rao policy gradient flow for +infinite-horizon entropy-regularised Markov decision processes with Polish +state and action space. The flow is a continuous-time analogue of a policy +mirror descent method. We establish the global well-posedness of the gradient +flow and demonstrate its exponential convergence to the optimal policy. +Moreover, we prove the flow is stable with respect to gradient evaluation, +offering insights into the performance of a natural policy gradient flow with +log-linear policy parameterisation. To overcome challenges stemming from the +lack of the convexity of the objective function and the discontinuity arising +from the entropy regulariser, we leverage the performance difference lemma and +the duality relationship between the gradient and mirror descent flows. Our +analysis provides a theoretical foundation for developing various discrete +policy gradient algorithms. + +
+
+ comment: add discretizations of gradient flow and their convergence analysis +
+
+
+
+
+ + ♻ ☆ Enhancing Novel Object Detection via Cooperative Foundational Models WACV 2025 + + +
+ In this work, we address the challenging and emergent problem of novel object +detection (NOD), focusing on the accurate detection of both known and novel +object categories during inference. Traditional object detection algorithms are +inherently closed-set, limiting their capability to handle NOD. We present a +novel approach to transform existing closed-set detectors into open-set +detectors. This transformation is achieved by leveraging the complementary +strengths of pre-trained foundational models, specifically CLIP and SAM, +through our cooperative mechanism. Furthermore, by integrating this mechanism +with state-of-the-art open-set detectors such as GDINO, we establish new +benchmarks in object detection performance. Our method achieves 17.42 mAP in +novel object detection and 42.08 mAP for known objects on the challenging LVIS +dataset. Adapting our approach to the COCO OVD split, we surpass the current +state-of-the-art by a margin of 7.2 $ \text{AP}_{50} $ for novel classes. Our +code is available at https://rohit901.github.io/coop-foundation-models/ . + +
+
+ comment: Accepted at WACV 2025 +
+
+
+
+
+ + ♻ ☆ ELEMENTAL: Interactive Learning from Demonstrations and Vision-Language + Models for Reward Design in Robotics + + +
+ Reinforcement learning (RL) has demonstrated compelling performance in +robotic tasks, but its success often hinges on the design of complex, ad hoc +reward functions. Researchers have explored how Large Language Models (LLMs) +could enable non-expert users to specify reward functions more easily. However, +LLMs struggle to balance the importance of different features, generalize +poorly to out-of-distribution robotic tasks, and cannot represent the problem +properly with only text-based descriptions. To address these challenges, we +propose ELEMENTAL (intEractive LEarning froM dEmoNstraTion And Language), a +novel framework that combines natural language guidance with visual user +demonstrations to align robot behavior with user intentions better. By +incorporating visual inputs, ELEMENTAL overcomes the limitations of text-only +task specifications, while leveraging inverse reinforcement learning (IRL) to +balance feature weights and match the demonstrated behaviors optimally. +ELEMENTAL also introduces an iterative feedback-loop through self-reflection to +improve feature, reward, and policy learning. Our experiment results +demonstrate that ELEMENTAL outperforms prior work by 42.3% on task success, and +achieves 41.3% better generalization in out-of-distribution tasks, highlighting +its robustness in LfD. + +
+
+
+
+
+ + ♻ ☆ HydraViT: Stacking Heads for a Scalable ViT NeurIPS'24 + + +
+ The architecture of Vision Transformers (ViTs), particularly the Multi-head +Attention (MHA) mechanism, imposes substantial hardware demands. Deploying ViTs +on devices with varying constraints, such as mobile phones, requires multiple +models of different sizes. However, this approach has limitations, such as +training and storing each required model separately. This paper introduces +HydraViT, a novel approach that addresses these limitations by stacking +attention heads to achieve a scalable ViT. By repeatedly changing the size of +the embedded dimensions throughout each layer and their corresponding number of +attention heads in MHA during training, HydraViT induces multiple subnetworks. +Thereby, HydraViT achieves adaptability across a wide spectrum of hardware +environments while maintaining performance. Our experimental results +demonstrate the efficacy of HydraViT in achieving a scalable ViT with up to 10 +subnetworks, covering a wide range of resource constraints. HydraViT achieves +up to 5 p.p. more accuracy with the same GMACs and up to 7 p.p. more accuracy +with the same throughput on ImageNet-1K compared to the baselines, making it an +effective solution for scenarios where hardware availability is diverse or +varies over time. Source code available at https://github.com/ds-kiel/HydraViT. + +
+
+ comment: Accepted at NeurIPS'24, please cite the conference version +
+
+
+
+
+ + ♻ ☆ On Multi-Agent Inverse Reinforcement Learning + + +
+ In multi-agent systems, the agent behavior is highly influenced by its +utility function, as these utilities shape both individual goals as well as +interactions with the other agents. Inverse Reinforcement Learning (IRL) is a +well-established approach to inferring the utility function by observing an +expert behavior within a given environment. In this paper, we extend the IRL +framework to the multi-agent setting, assuming to observe agents who are +following Nash Equilibrium (NE) policies. We theoretically investigate the set +of utilities that explain the behavior of NE experts. Specifically, we provide +an explicit characterization of the feasible reward set and analyze how errors +in estimating the transition dynamics and expert behavior impact the recovered +rewards. Building on these findings, we provide the first sample complexity +analysis for the multi-agent IRL problem. Finally, we provide a numerical +evaluation of our theoretical results. + +
+
+ comment: Currently under review +
+
+
+
+
+ + ♻ ☆ CoSy: Evaluating Textual Explanations of Neurons + + +
+ A crucial aspect of understanding the complex nature of Deep Neural Networks +(DNNs) is the ability to explain learned concepts within their latent +representations. While methods exist to connect neurons to human-understandable +textual descriptions, evaluating the quality of these explanations is +challenging due to the lack of a unified quantitative approach. We introduce +CoSy (Concept Synthesis), a novel, architecture-agnostic framework for +evaluating textual explanations of latent neurons. Given textual explanations, +our proposed framework uses a generative model conditioned on textual input to +create data points representing the explanations. By comparing the neuron's +response to these generated data points and control data points, we can +estimate the quality of the explanation. We validate our framework through +sanity checks and benchmark various neuron description methods for Computer +Vision tasks, revealing significant differences in quality. + +
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Finite-sample performance of the maximum likelihood estimator in + logistic regression + + +
+ Logistic regression is a classical model for describing the probabilistic +dependence of binary responses to multivariate covariates. We consider the +predictive performance of the maximum likelihood estimator (MLE) for logistic +regression, assessed in terms of logistic risk. We consider two questions: +first, that of the existence of the MLE (which occurs when the dataset is not +linearly separated), and second that of its accuracy when it exists. These +properties depend on both the dimension of covariates and on the signal +strength. In the case of Gaussian covariates and a well-specified logistic +model, we obtain sharp non-asymptotic guarantees for the existence and excess +logistic risk of the MLE. We then generalize these results in two ways: first, +to non-Gaussian covariates satisfying a certain two-dimensional margin +condition, and second to the general case of statistical learning with a +possibly misspecified logistic model. Finally, we consider the case of a +Bernoulli design, where the behavior of the MLE is highly sensitive to the +parameter direction. + +
+
+ comment: Simplified some statements and added a proof sketch in Sec. 4 +
+
+
+
+
+ + ♻ ☆ Calib3D: Calibrating Model Preferences for Reliable 3D Scene + Understanding WACV 2025 + + +
+ Safety-critical 3D scene understanding tasks necessitate not only accurate +but also confident predictions from 3D perception models. This study introduces +Calib3D, a pioneering effort to benchmark and scrutinize the reliability of 3D +scene understanding models from an uncertainty estimation viewpoint. We +comprehensively evaluate 28 state-of-the-art models across 10 diverse 3D +datasets, uncovering insightful phenomena that cope with both the aleatoric and +epistemic uncertainties in 3D scene understanding. We discover that despite +achieving impressive levels of accuracy, existing models frequently fail to +provide reliable uncertainty estimates -- a pitfall that critically undermines +their applicability in safety-sensitive contexts. Through extensive analysis of +key factors such as network capacity, LiDAR representations, rasterization +resolutions, and 3D data augmentation techniques, we correlate these aspects +directly with the model calibration efficacy. Furthermore, we introduce DeptS, +a novel depth-aware scaling approach aimed at enhancing 3D model calibration. +Extensive experiments across a wide range of configurations validate the +superiority of our method. We hope this work could serve as a cornerstone for +fostering reliable 3D scene understanding. Code and benchmark toolkit are +publicly available. + +
+
+ comment: WACV 2025; 26 pages, 8 figures, 12 tables; Code at + https://github.com/ldkong1205/Calib3D +
+
+
+
+
+ + ♻ ☆ Bayesian evidence estimation from posterior samples with normalizing + flows + + +
+ We propose a novel method ($floZ$), based on normalizing flows, to estimate +the Bayesian evidence (and its numerical uncertainty) from a pre-existing set +of samples drawn from the unnormalized posterior distribution. We validate it +on distributions whose evidence is known analytically, up to 15 parameter space +dimensions, and compare with two state-of-the-art techniques for estimating the +evidence: nested sampling (which computes the evidence as its main target) and +a $k$-nearest-neighbors technique that produces evidence estimates from +posterior samples. Provided representative samples from the target posterior +are available, our method is more robust to posterior distributions with sharp +features, especially in higher dimensions. For a simple multivariate Gaussian, +we demonstrate its accuracy for up to 200 dimensions with $10^5$ posterior +samples. $floZ$ has wide applicability, e.g., to estimate evidence from +variational inference, Markov Chain Monte Carlo samples, or any other method +that delivers samples and their likelihood from the unnormalized posterior +density. As a physical application, we use $floZ$ to compute the Bayes factor +for the presence of the first overtone in the ringdown signal of the +gravitational wave data of GW150914, finding good agreement with nested +sampling. + +
+
+ comment: 15 pages, 8 figures, 1 table +
+
+
+
+
+ + ♻ ☆ In-context learning and Occam's razor + + +
+ A central goal of machine learning is generalization. While the No Free Lunch +Theorem states that we cannot obtain theoretical guarantees for generalization +without further assumptions, in practice we observe that simple models which +explain the training data generalize best: a principle called Occam's razor. +Despite the need for simple models, most current approaches in machine learning +only minimize the training error, and at best indirectly promote simplicity +through regularization or architecture design. Here, we draw a connection +between Occam's razor and in-context learning: an emergent ability of certain +sequence models like Transformers to learn at inference time from past +observations in a sequence. In particular, we show that the next-token +prediction loss used to train in-context learners is directly equivalent to a +data compression technique called prequential coding, and that minimizing this +loss amounts to jointly minimizing both the training error and the complexity +of the model that was implicitly learned from context. Our theory and the +empirical experiments we use to support it not only provide a normative account +of in-context learning, but also elucidate the shortcomings of current +in-context learning methods, suggesting ways in which they can be improved. We +make our code available at https://github.com/3rdCore/PrequentialCode. + +
+
+
+
+
+ + ♻ ☆ Reachable Polyhedral Marching (RPM): An Exact Analysis Tool for + Deep-Learned Control Systems + + +
+ Neural networks are increasingly used in robotics as policies, state +transition models, state estimation models, or all of the above. With these +components being learned from data, it is important to be able to analyze what +behaviors were learned and how this affects closed-loop performance. In this +paper we take steps toward this goal by developing methods for computing +control invariant sets and regions of attraction (ROAs) of dynamical systems +represented as neural networks. We focus our attention on feedforward neural +networks with the rectified linear unit (ReLU) activation, which are known to +implement continuous piecewise-affine (PWA) functions. We describe the +Reachable Polyhedral Marching (RPM) algorithm for enumerating the affine pieces +of a neural network through an incremental connected walk. We then use this +algorithm to compute exact forward and backward reachable sets, from which we +provide methods for computing control invariant sets and ROAs. Our approach is +unique in that we find these sets incrementally, without Lyapunov-based tools. +In our examples we demonstrate the ability of our approach to find non-convex +control invariant sets and ROAs on tasks with learned van der Pol oscillator +and pendulum models. Further, we provide an accelerated algorithm for computing +ROAs that leverages the incremental and connected enumeration of affine regions +that RPM provides. We show this acceleration to lead to a 15x speedup in our +examples. Finally, we apply our methods to find a set of states that are +stabilized by an image-based controller for an aircraft runway control problem. + +
+
+ comment: Submitted to IEEE Transactions on Neural Networks and Learning + Systems. arXiv admin note: text overlap with arXiv:2011.11609 +
+
+
+
+
+ + ♻ ☆ A Complexity-Based Theory of Compositionality + + +
+ Compositionality is believed to be fundamental to intelligence. In humans, it +underlies the structure of thought, language, and higher-level reasoning. In +AI, compositional representations can enable a powerful form of +out-of-distribution generalization, in which a model systematically adapts to +novel combinations of known concepts. However, while we have strong intuitions +about what compositionality is, there currently exists no formal definition for +it that is measurable and mathematical. Here, we propose such a definition, +which we call representational compositionality, that accounts for and extends +our intuitions about compositionality. The definition is conceptually simple, +quantitative, grounded in algorithmic information theory, and applicable to any +representation. Intuitively, representational compositionality states that a +compositional representation satisfies three properties. First, it must be +expressive. Second, it must be possible to re-describe the representation as a +function of discrete symbolic sequences with re-combinable parts, analogous to +sentences in natural language. Third, the function that relates these symbolic +sequences to the representation, analogous to semantics in natural language, +must be simple. Through experiments on both synthetic and real world data, we +validate our definition of compositionality and show how it unifies disparate +intuitions from across the literature in both AI and cognitive science. We also +show that representational compositionality, while theoretically intractable, +can be readily estimated using standard deep learning tools. Our definition has +the potential to inspire the design of novel, theoretically-driven models that +better capture the mechanisms of compositional thought. + +
+
+
+
+
+ + ♻ ☆ Model-GLUE: Democratized LLM Scaling for A Large Model Zoo in the Wild NeurIPS 2024 + + +
+ As Large Language Models (LLMs) excel across tasks and specialized domains, +scaling LLMs based on existing models has garnered significant attention, which +faces the challenge of decreasing performance when combining disparate models. +Various techniques have been proposed for the aggregation of pre-trained LLMs, +including model merging, Mixture-of-Experts, and stacking. Despite their +merits, a comprehensive comparison and synergistic application of them to a +diverse model zoo is yet to be adequately addressed. In light of this research +gap, this paper introduces Model-GLUE, a holistic LLM scaling guideline. First, +our work starts with a benchmarking of existing LLM scaling techniques, +especially selective merging, and variants of mixture. Utilizing the insights +from the benchmark results, we formulate an optimal strategy for the selection +and aggregation of a heterogeneous model zoo characterizing different +architectures and initialization.Our methodology involves the clustering of +mergeable models and optimal merging strategy selection, and the integration of +clusters through a model mixture. Finally, evidenced by our experiments on a +diverse Llama-2-based model zoo, Model-GLUE shows an average performance +enhancement of 5.61%, achieved without additional training. Codes are available +at: https://github.com/Model-GLUE/Model-GLUE. + +
+
+ comment: 24 pages, 4 figures, accepted to NeurIPS 2024 Datasets and Benchmarks + Track +
+
+
+
+
+ + ♻ ☆ PBP: Post-training Backdoor Purification for Malware Classifiers NDSS 2025 + + +
+ In recent years, the rise of machine learning (ML) in cybersecurity has +brought new challenges, including the increasing threat of backdoor poisoning +attacks on ML malware classifiers. For instance, adversaries could inject +malicious samples into public malware repositories, contaminating the training +data and potentially misclassifying malware by the ML model. Current +countermeasures predominantly focus on detecting poisoned samples by leveraging +disagreements within the outputs of a diverse set of ensemble models on +training data points. However, these methods are not suitable for scenarios +where Machine Learning-as-a-Service (MLaaS) is used or when users aim to remove +backdoors from a model after it has been trained. Addressing this scenario, we +introduce PBP, a post-training defense for malware classifiers that mitigates +various types of backdoor embeddings without assuming any specific backdoor +embedding mechanism. Our method exploits the influence of backdoor attacks on +the activation distribution of neural networks, independent of the +trigger-embedding method. In the presence of a backdoor attack, the activation +distribution of each layer is distorted into a mixture of distributions. By +regulating the statistics of the batch normalization layers, we can guide a +backdoored model to perform similarly to a clean one. Our method demonstrates +substantial advantages over several state-of-the-art methods, as evidenced by +experiments on two datasets, two types of backdoor methods, and various attack +configurations. Notably, our approach requires only a small portion of the +training data -- only 1\% -- to purify the backdoor and reduce the attack +success rate from 100\% to almost 0\%, a 100-fold improvement over the baseline +methods. Our code is available at +\url{https://github.com/judydnguyen/pbp-backdoor-purification-official}. + +
+
+ comment: Accepted at NDSS 2025 +
+
+
+
+
+ + ♻ ☆ SwiftKV: Fast Prefill-Optimized Inference with Knowledge-Preserving + Model Transformation + + +
+ LLM inference for popular enterprise use cases, such as summarization, RAG, +and code-generation, typically observes orders of magnitude longer prompt +lengths than generation lengths. This characteristic leads to high cost of +prefill and increased response latency. In this paper, we present SwiftKV, a +novel model transformation and distillation procedure specifically designed to +reduce the time and cost of processing prompt tokens while preserving high +quality of generated tokens. SwiftKV combines three key mechanisms: i) +SingleInputKV, which prefills later layers' KV cache using a much earlier +layer's output, allowing prompt tokens to skip much of the model computation, +ii) AcrossKV, which merges the KV caches of neighboring layers to reduce the +memory footprint and support larger batch size for higher throughput, and iii) +a knowledge-preserving distillation procedure that can adapt existing LLMs for +SwiftKV with minimal accuracy impact and low compute and data requirement. For +Llama-3.1-8B and 70B, SwiftKV reduces the compute requirement of prefill by 50% +and the memory requirement of the KV cache by 62.5% while incurring minimum +quality degradation across a wide range of tasks. In the end-to-end inference +serving using an optimized vLLM implementation, SwiftKV realizes up to 2x +higher aggregate throughput and 60% lower time per output token. It can achieve +a staggering 560 TFlops/GPU of normalized inference throughput, which +translates to 16K tokens/s for Llama-3.1-70B in 16-bit precision on 4x H100 +GPUs. Our training, inference, and model implementations are open-sourced and +can be found through +https://huggingface.co/collections/Snowflake/swiftkv-models-674f7d7474eb789e185d31cb. + +
+
+
+
+
+ + ♻ ☆ Dockformer: A transformer-based molecular docking paradigm for + large-scale virtual screening + + +
+ Molecular docking is a crucial step in drug development, which enables the +virtual screening of compound libraries to identify potential ligands that +target proteins of interest. However, the computational complexity of +traditional docking models increases as the size of the compound library +increases. Recently, deep learning algorithms can provide data-driven research +and development models to increase the speed of the docking process. +Unfortunately, few models can achieve superior screening performance compared +to that of traditional models. Therefore, a novel deep learning-based docking +approach named Dockformer is introduced in this study. Dockformer leverages +multimodal information to capture the geometric topology and structural +knowledge of molecules and can directly generate binding conformations with the +corresponding confidence measures in an end-to-end manner. The experimental +results show that Dockformer achieves success rates of 90.53% and 82.71% on the +PDBbind core set and PoseBusters benchmarks, respectively, and more than a +100-fold increase in the inference process speed, outperforming almost all +state-of-the-art docking methods. In addition, the ability of Dockformer to +identify the main protease inhibitors of coronaviruses is demonstrated in a +real-world virtual screening scenario. Considering its high docking accuracy +and screening efficiency, Dockformer can be regarded as a powerful and robust +tool in the field of drug design. + +
+
+ comment: 15 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ On the Benefits of Active Data Collection in Operator Learning + + +
+ We investigate active data collection strategies for operator learning when +the target operator is linear and the input functions are drawn from a +mean-zero stochastic process with continuous covariance kernels. With an active +data collection strategy, we establish an error convergence rate in terms of +the decay rate of the eigenvalues of the covariance kernel. Thus, with +sufficiently rapid eigenvalue decay of the covariance kernels, arbitrarily fast +error convergence rates can be achieved. This contrasts with the passive +(i.i.d.) data collection strategies, where the convergence rate is never faster +than $\sim n^{-1}$. In fact, for our setting, we establish a +\emph{non-vanishing} lower bound for any passive data collection strategy, +regardless of the eigenvalue decay rate of the covariance kernel. Overall, our +results show the benefit of active over passive data collection strategies in +operator learning. + +
+
+ comment: Added experiments +
+
+
+
+
+ + ♻ ☆ Fast and reliable uncertainty quantification with neural network + ensembles for industrial image classification + + +
+ Image classification with neural networks (NNs) is widely used in industrial +processes, situations where the model likely encounters unknown objects during +deployment, i.e., out-of-distribution (OOD) data. Worryingly, NNs tend to make +confident yet incorrect predictions when confronted with OOD data. To increase +the models' reliability, they should quantify the uncertainty in their own +predictions, communicating when the output should (not) be trusted. Deep +ensembles, composed of multiple independent NNs, have been shown to perform +strongly but are computationally expensive. Recent research has proposed more +efficient NN ensembles, namely the snapshot, batch, and multi-input +multi-output ensemble. This study investigates the predictive and uncertainty +performance of efficient NN ensembles in the context of image classification +for industrial processes. It is the first to provide a comprehensive comparison +and it proposes a novel Diversity Quality metric to quantify the ensembles' +performance on the in-distribution and OOD sets in one single metric. The +results highlight the batch ensemble as a cost-effective and competitive +alternative to the deep ensemble. It matches the deep ensemble in both +uncertainty and accuracy while exhibiting considerable savings in training +time, test time, and memory storage. + +
+
+ comment: Submitted to Annals of Operations Research +
+
+
+
+
+ + ♻ ☆ Molmo and PixMo: Open Weights and Open Data for State-of-the-Art + Vision-Language Models + + +
+ Today's most advanced vision-language models (VLMs) remain proprietary. The +strongest open-weight models rely heavily on synthetic data from proprietary +VLMs to achieve good performance, effectively distilling these closed VLMs into +open ones. As a result, the community has been missing foundational knowledge +about how to build performant VLMs from scratch. We present Molmo, a new family +of VLMs that are state-of-the-art in their class of openness. Our key +contribution is a collection of new datasets called PixMo, including a dataset +of highly detailed image captions for pre-training, a free-form image Q&A +dataset for fine-tuning, and an innovative 2D pointing dataset, all collected +without the use of external VLMs. The success of our approach relies on careful +modeling choices, a well-tuned training pipeline, and, most critically, the +quality of our newly collected datasets. Our best-in-class 72B model not only +outperforms others in the class of open weight and data models, but also +outperforms larger proprietary models including Claude 3.5 Sonnet, and Gemini +1.5 Pro and Flash, second only to GPT-4o based on both academic benchmarks and +on a large human evaluation. Our model weights, new datasets, and source code +are available at https://molmo.allenai.org/blog. + +
+
+ comment: Updated with ablations and more technical details +
+
+
+
+
+ + ♻ ☆ Adaptive Circuit Behavior and Generalization in Mechanistic + Interpretability + + +
+ Mechanistic interpretability aims to understand the inner workings of large +neural networks by identifying circuits, or minimal subgraphs within the model +that implement algorithms responsible for performing specific tasks. These +circuits are typically discovered and analyzed using a narrowly defined prompt +format. However, given the abilities of large language models (LLMs) to +generalize across various prompt formats for the same task, it remains unclear +how well these circuits generalize. For instance, it is unclear whether the +model's generalization results from reusing the same circuit components, the +components behaving differently, or the use of entirely different components. +In this paper, we investigate the generality of the indirect object +identification (IOI) circuit in GPT-2 small, which is well-studied and believed +to implement a simple, interpretable algorithm. We evaluate its performance on +prompt variants that challenge the assumptions of this algorithm. Our findings +reveal that the circuit generalizes surprisingly well, reusing all of its +components and mechanisms while only adding additional input edges. Notably, +the circuit generalizes even to prompt variants where the original algorithm +should fail; we discover a mechanism that explains this which we term S2 +Hacking. Our findings indicate that circuits within LLMs may be more flexible +and general than previously recognized, underscoring the importance of studying +circuit generalization to better understand the broader capabilities of these +models. + +
+
+ comment: 10 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ When Stability meets Sufficiency: Informative Explanations that do not + Overwhelm + + +
+ Recent studies evaluating various criteria for explainable artificial +intelligence (XAI) suggest that fidelity, stability, and comprehensibility are +among the most important metrics considered by users of AI across a diverse +collection of usage contexts. We consider these criteria as applied to +feature-based attribution methods, which are amongst the most prevalent in XAI +literature. Going beyond standard correlation, methods have been proposed that +highlight what should be minimally sufficient to justify the classification of +an input (viz. pertinent positives). While minimal sufficiency is an attractive +property akin to comprehensibility, the resulting explanations are often too +sparse for a human to understand and evaluate the local behavior of the model. +To overcome these limitations, we incorporate the criteria of stability and +fidelity and propose a novel method called Path-Sufficient Explanations Method +(PSEM) that outputs a sequence of stable and sufficient explanations for a +given input of strictly decreasing size (or value) -- from original input to a +minimally sufficient explanation -- which can be thought to trace the local +boundary of the model in a stable manner, thus providing better intuition about +the local model behavior for the specific input. We validate these claims, both +qualitatively and quantitatively, with experiments that show the benefit of +PSEM across three modalities (image, tabular and text) as well as versus other +path explanations. A user study depicts the strength of the method in +communicating the local behavior, where (many) users are able to correctly +determine the prediction made by a model. + +
+
+ comment: Published at TMLR +
+
+
+
+
+ + ♻ ☆ Looking at Model Debiasing through the Lens of Anomaly Detection WACV + + +
+ It is widely recognized that deep neural networks are sensitive to bias in +the data. This means that during training these models are likely to learn +spurious correlations between data and labels, resulting in limited +generalization abilities and low performance. In this context, model debiasing +approaches can be devised aiming at reducing the model's dependency on such +unwanted correlations, either leveraging the knowledge of bias information or +not. In this work, we focus on the latter and more realistic scenario, showing +the importance of accurately predicting the bias-conflicting and bias-aligned +samples to obtain compelling performance in bias mitigation. On this ground, we +propose to conceive the problem of model bias from an out-of-distribution +perspective, introducing a new bias identification method based on anomaly +detection. We claim that when data is mostly biased, bias-conflicting samples +can be regarded as outliers with respect to the bias-aligned distribution in +the feature space of a biased model, thus allowing for precisely detecting them +with an anomaly detection method. Coupling the proposed bias identification +approach with bias-conflicting data upsampling and augmentation in a two-step +strategy, we reach state-of-the-art performance on synthetic and real benchmark +datasets. Ultimately, our proposed approach shows that the data bias issue does +not necessarily require complex debiasing methods, given that an accurate bias +identification procedure is defined. Source code is available at +https://github.com/Malga-Vision/MoDAD + +
+
+ comment: 13 pages, 8 figures; Accepted at IEEE/CVF Winter Conference on + Applications of Computer Vision (WACV) 2025 +
+
+
+
+
+ + ♻ ☆ GV-Rep: A Large-Scale Dataset for Genetic Variant Representation + Learning + + +
+ Genetic variants (GVs) are defined as differences in the DNA sequences among +individuals and play a crucial role in diagnosing and treating genetic +diseases. The rapid decrease in next generation sequencing cost has led to an +exponential increase in patient-level GV data. This growth poses a challenge +for clinicians who must efficiently prioritize patient-specific GVs and +integrate them with existing genomic databases to inform patient management. To +address the interpretation of GVs, genomic foundation models (GFMs) have +emerged. However, these models lack standardized performance assessments, +leading to considerable variability in model evaluations. This poses the +question: How effectively do deep learning methods classify unknown GVs and +align them with clinically-verified GVs? We argue that representation learning, +which transforms raw data into meaningful feature spaces, is an effective +approach for addressing both indexing and classification challenges. We +introduce a large-scale Genetic Variant dataset, named GV-Rep, featuring +variable-length contexts and detailed annotations, designed for deep learning +models to learn GV representations across various traits, diseases, tissue +types, and experimental contexts. Our contributions are three-fold: (i) +Construction of a comprehensive dataset with 7 million records, each labeled +with characteristics of the corresponding variants, alongside additional data +from 17,548 gene knockout tests across 1,107 cell types, 1,808 variant +combinations, and 156 unique clinically verified GVs from real-world patients. +(ii) Analysis of the structure and properties of the dataset. (iii) +Experimentation of the dataset with pre-trained GFMs. The results show a +significant gap between GFMs' current capabilities and accurate GV +representation. We hope this dataset will help advance genomic deep learning to +bridge this gap. + +
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ Learning Semantic Association Rules from Internet of Things Data + + +
+ Association Rule Mining (ARM) is the task of discovering commonalities in +data in the form of logical implications. ARM is used in the Internet of Things +(IoT) for different tasks including monitoring and decision-making. However, +existing methods give limited consideration to IoT-specific requirements such +as heterogeneity and volume. Furthermore, they do not utilize important static +domain-specific description data about IoT systems, which is increasingly +represented as knowledge graphs. In this paper, we propose a novel ARM pipeline +for IoT data that utilizes both dynamic sensor data and static IoT system +metadata. Furthermore, we propose an Autoencoder-based Neurosymbolic ARM method +(Aerial) as part of the pipeline to address the high volume of IoT data and +reduce the total number of rules that are resource-intensive to process. Aerial +learns a neural representation of a given data and extracts association rules +from this representation by exploiting the reconstruction (decoding) mechanism +of an autoencoder. Extensive evaluations on 3 IoT datasets from 2 domains show +that ARM on both static and dynamic IoT data results in more generically +applicable rules while Aerial can learn a more concise set of high-quality +association rules than the state-of-the-art with full coverage over the +datasets. + +
+
+
+
+
+ + ♻ ☆ DeiSAM: Segment Anything with Deictic Prompting NeurIPS 2024 + + +
+ Large-scale, pre-trained neural networks have demonstrated strong +capabilities in various tasks, including zero-shot image segmentation. To +identify concrete objects in complex scenes, humans instinctively rely on +deictic descriptions in natural language, i.e., referring to something +depending on the context such as "The object that is on the desk and behind the +cup.". However, deep learning approaches cannot reliably interpret such deictic +representations due to their lack of reasoning capabilities in complex +scenarios. To remedy this issue, we propose DeiSAM -- a combination of large +pre-trained neural networks with differentiable logic reasoners -- for deictic +promptable segmentation. Given a complex, textual segmentation description, +DeiSAM leverages Large Language Models (LLMs) to generate first-order logic +rules and performs differentiable forward reasoning on generated scene graphs. +Subsequently, DeiSAM segments objects by matching them to the logically +inferred image regions. As part of our evaluation, we propose the Deictic +Visual Genome (DeiVG) dataset, containing paired visual input and complex, +deictic textual prompts. Our empirical results demonstrate that DeiSAM is a +substantial improvement over purely data-driven baselines for deictic +promptable segmentation. + +
+
+ comment: Published as a conference paper at NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Marrying Causal Representation Learning with Dynamical Systems for + Science NeurIPS 2024 + + +
+ Causal representation learning promises to extend causal models to hidden +causal variables from raw entangled measurements. However, most progress has +focused on proving identifiability results in different settings, and we are +not aware of any successful real-world application. At the same time, the field +of dynamical systems benefited from deep learning and scaled to countless +applications but does not allow parameter identification. In this paper, we +draw a clear connection between the two and their key assumptions, allowing us +to apply identifiable methods developed in causal representation learning to +dynamical systems. At the same time, we can leverage scalable differentiable +solvers developed for differential equations to build models that are both +identifiable and practical. Overall, we learn explicitly controllable models +that isolate the trajectory-specific parameters for further downstream tasks +such as out-of-distribution classification or treatment effect estimation. We +experiment with a wind simulator with partially known factors of variation. We +also apply the resulting model to real-world climate data and successfully +answer downstream causal questions in line with existing literature on climate +change. + +
+
+ comment: NeurIPS 2024 Camera Ready +
+
+
+
+
+ + ♻ ☆ Stochastic Monkeys at Play: Random Augmentations Cheaply Break LLM + Safety Alignment + + +
+ Safety alignment of Large Language Models (LLMs) has recently become a +critical objective of model developers. In response, a growing body of work has +been investigating how safety alignment can be bypassed through various +jailbreaking methods, such as adversarial attacks. However, these jailbreak +methods can be rather costly or involve a non-trivial amount of creativity and +effort, introducing the assumption that malicious users are high-resource or +sophisticated. In this paper, we study how simple random augmentations to the +input prompt affect safety alignment effectiveness in state-of-the-art LLMs, +such as Llama 3 and Qwen 2. We perform an in-depth evaluation of 17 different +models and investigate the intersection of safety under random augmentations +with multiple dimensions: augmentation type, model size, quantization, +fine-tuning-based defenses, and decoding strategies (e.g., sampling +temperature). We show that low-resource and unsophisticated attackers, i.e. +$\textit{stochastic monkeys}$, can significantly improve their chances of +bypassing alignment with just 25 random augmentations per prompt. Source code +and data: https://github.com/uiuc-focal-lab/stochastic-monkeys/ + +
+
+ comment: v2: Updated with changes from peer review rebuttal. v1: Version under + peer review +
+
+
+
+
+ + ♻ ☆ Group Distributionally Robust Optimization can Suppress Class Imbalance + Effect in Network Traffic Classification + + +
+ Internet services have led to the eruption of network traffic, and machine +learning on these Internet data has become an indispensable tool, especially +when the application is risk-sensitive. This paper focuses on network traffic +classification in the presence of class imbalance, which fundamentally and +ubiquitously exists in Internet data analysis. This existence of class +imbalance mostly drifts the optimal decision boundary, resulting in a less +optimal solution for machine learning models. To alleviate the effect, we +propose to design strategies for alleviating the class imbalance through the +lens of group distributionally robust optimization. Our approach iteratively +updates the non-parametric weights for separate classes and optimizes the +learning model by minimizing reweighted losses. We interpret the optimization +process from a Stackelberg game and perform extensive experiments on typical +benchmarks. Results show that our approach can not only suppress the negative +effect of class imbalance but also improve the comprehensive performance in +prediction. + +
+
+
+
+
+ + ♻ ☆ Efficiently Learning at Test-Time: Active Fine-Tuning of LLMs + + +
+ Recent efforts in fine-tuning language models often rely on automatic data +selection, commonly using Nearest Neighbors retrieval from large datasets. +However, we theoretically show that this approach tends to select redundant +data, limiting its effectiveness or even hurting performance. To address this, +we introduce SIFT, a data selection algorithm designed to reduce uncertainty +about the model's response given a prompt, which unifies ideas from retrieval +and active learning. Whereas Nearest Neighbor retrieval typically fails in the +presence of information duplication, SIFT accounts for information duplication +and optimizes the overall information gain of the selected examples. We focus +our evaluations on fine-tuning at test-time for prompt-specific language +modeling on the Pile dataset, and show that SIFT consistently outperforms +Nearest Neighbor retrieval, with minimal computational overhead. Moreover, we +show that our uncertainty estimates can predict the performance gain of +test-time fine-tuning, and use this to develop an adaptive algorithm that +invests test-time compute proportional to realized performance gains. We +provide the $\texttt{activeft}$ (Active Fine-Tuning) library which can be used +as a drop-in replacement for Nearest Neighbor retrieval. + +
+
+
+
+
+ + ♻ ☆ Memory-efficient Continual Learning with Neural Collapse Contrastive WACV 2025 + + +
+ Contrastive learning has significantly improved representation quality, +enhancing knowledge transfer across tasks in continual learning (CL). However, +catastrophic forgetting remains a key challenge, as contrastive based methods +primarily focus on "soft relationships" or "softness" between samples, which +shift with changing data distributions and lead to representation overlap +across tasks. Recently, the newly identified Neural Collapse phenomenon has +shown promise in CL by focusing on "hard relationships" or "hardness" between +samples and fixed prototypes. However, this approach overlooks "softness", +crucial for capturing intra-class variability, and this rigid focus can also +pull old class representations toward current ones, increasing forgetting. +Building on these insights, we propose Focal Neural Collapse Contrastive +(FNC2), a novel representation learning loss that effectively balances both +soft and hard relationships. Additionally, we introduce the Hardness-Softness +Distillation (HSD) loss to progressively preserve the knowledge gained from +these relationships across tasks. Our method outperforms state-of-the-art +approaches, particularly in minimizing memory reliance. Remarkably, even +without the use of memory, our approach rivals rehearsal-based methods, +offering a compelling solution for data privacy concerns. + +
+
+ comment: Accepted at WACV 2025 +
+
+
+
+
+ + ♻ ☆ PePR: Performance Per Resource Unit as a Metric to Promote Small-Scale + Deep Learning in Medical Image Analysis + + +
+ The recent advances in deep learning (DL) have been accelerated by access to +large-scale data and compute. These large-scale resources have been used to +train progressively larger models which are resource intensive in terms of +compute, data, energy, and carbon emissions. These costs are becoming a new +type of entry barrier to researchers and practitioners with limited access to +resources at such scale, particularly in the Global South. In this work, we +take a comprehensive look at the landscape of existing DL models for medical +image analysis tasks and demonstrate their usefulness in settings where +resources are limited. To account for the resource consumption of DL models, we +introduce a novel measure to estimate the performance per resource unit, which +we call the PePR score. Using a diverse family of 131 unique DL architectures +(spanning 1M to 130M trainable parameters) and three medical image datasets, we +capture trends about the performance-resource trade-offs. In applications like +medical image analysis, we argue that small-scale, specialized models are +better than striving for large-scale models. Furthermore, we show that using +existing pretrained models that are fine-tuned on new data can significantly +reduce the computational resources and data required compared to training +models from scratch. We hope this work will encourage the community to focus on +improving AI equity by developing methods and models with smaller resource +footprints. + +
+
+ comment: Accepted to be published at the Northern Lights Deep Learning + Conference (NLDL), 2025. Source code available at + https://github.com/saintslab/PePR +
+
+
+
+
+ + ♻ ☆ What should a neuron aim for? Designing local objective functions based + on information theory + + +
+ In modern deep neural networks, the learning dynamics of the individual +neurons is often obscure, as the networks are trained via global optimization. +Conversely, biological systems build on self-organized, local learning, +achieving robustness and efficiency with limited global information. We here +show how self-organization between individual artificial neurons can be +achieved by designing abstract bio-inspired local learning goals. These goals +are parameterized using a recent extension of information theory, Partial +Information Decomposition (PID), which decomposes the information that a set of +information sources holds about an outcome into unique, redundant and +synergistic contributions. Our framework enables neurons to locally shape the +integration of information from various input classes, i.e. feedforward, +feedback, and lateral, by selecting which of the three inputs should contribute +uniquely, redundantly or synergistically to the output. This selection is +expressed as a weighted sum of PID terms, which, for a given problem, can be +directly derived from intuitive reasoning or via numerical optimization, +offering a window into understanding task-relevant local information +processing. Achieving neuron-level interpretability while enabling strong +performance using local learning, our work advances a principled +information-theoretic foundation for local learning strategies. + +
+
+ comment: 24 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ Learning on Model Weights using Tree Experts + + +
+ The increasing availability of public models begs the question: can we train +neural networks that use other networks as input? Such models allow us to study +different aspects of a given neural network, for example, determining the +categories in a model's training dataset. However, machine learning on model +weights is challenging as they often exhibit significant variation unrelated to +the models' semantic properties (nuisance variation). Here, we identify a key +property of real-world models: most public models belong to a small set of +Model Trees, where all models within a tree are fine-tuned from a common +ancestor (e.g., a foundation model). Importantly, we find that within each tree +there is less nuisance variation between models. Concretely, while learning +across Model Trees requires complex architectures, even a linear classifier +trained on a single model layer often works within trees. While effective, +these linear classifiers are computationally expensive, especially when dealing +with larger models that have many parameters. To address this, we introduce +Probing Experts (ProbeX), a theoretically motivated and lightweight method. +Notably, ProbeX is the first probing method specifically designed to learn from +the weights of a single hidden model layer. We demonstrate the effectiveness of +ProbeX by predicting the categories in a model's training dataset based only on +its weights. Excitingly, ProbeX can also map the weights of Stable Diffusion +into a shared weight-language embedding space, enabling zero-shot model +classification. + +
+
+ comment: Project page: https://horwitz.ai/probex/ +
+
+
+
+
+ + ♻ ☆ Words in Motion: Extracting Interpretable Control Vectors for Motion + Transformers + + +
+ Transformer-based models generate hidden states that are difficult to +interpret. In this work, we aim to interpret these hidden states and control +them at inference, with a focus on motion forecasting. We use linear probes to +measure neural collapse towards interpretable motion features in hidden states. +High probing accuracy implies meaningful directions and distances between +hidden states of opposing features, which we use to fit interpretable control +vectors for activation steering at inference. To optimize our control vectors, +we use sparse autoencoders with fully-connected, convolutional, MLPMixer layers +and various activation functions. Notably, we show that enforcing sparsity in +hidden states leads to a more linear relationship between control vector +temperatures and forecasts. Our approach enables mechanistic interpretability +and zero-shot generalization to unseen dataset characteristics with negligible +computational overhead. Our implementation is available at +https://github.com/kit-mrt/future-motion + +
+
+ comment: Add autoencoders with convolutional, MLPMixer layers, and JumpReLU + activations +
+
+
+
+
+ + ♻ ☆ VGGHeads: 3D Multi Head Alignment with a Large-Scale Synthetic Dataset + + +
+ Human head detection, keypoint estimation, and 3D head model fitting are +essential tasks with many applications. However, traditional real-world +datasets often suffer from bias, privacy, and ethical concerns, and they have +been recorded in laboratory environments, which makes it difficult for trained +models to generalize. Here, we introduce VGGHeads -- a large-scale synthetic +dataset generated with diffusion models for human head detection and 3D mesh +estimation. Our dataset comprises over 1 million high-resolution images, each +annotated with detailed 3D head meshes, facial landmarks, and bounding boxes. +Using this dataset, we introduce a new model architecture capable of +simultaneous head detection and head mesh reconstruction from a single image in +a single step. Through extensive experimental evaluations, we demonstrate that +models trained on our synthetic data achieve strong performance on real images. +Furthermore, the versatility of our dataset makes it applicable across a broad +spectrum of tasks, offering a general and comprehensive representation of human +heads. + +
+
+
+
+
+ + ♻ ☆ Online SLA Decomposition: Enabling Real-Time Adaptation to Evolving + Systems ICML + + +
+ When a network slice spans multiple technology domains, it is crucial for +each domain to uphold the End-to-End (E2E) Service Level Agreement (SLA) +associated with the slice. Consequently, the E2E SLA must be properly +decomposed into partial SLAs that are assigned to each domain involved. In a +network slice management system with a two-level architecture, comprising an +E2E service orchestrator and local domain controllers, we consider that the +orchestrator has access solely to historical data regarding the responses of +local controllers to previous requests, and this information is used to +construct a risk model for each domain. In this study, we extend our previous +work by investigating the dynamic nature of real-world systems and introducing +an online learning-decomposition framework to tackle the dynamicity. We propose +a framework that periodically updates the risk models based on the most recent +feedback. This approach leverages key components such as online gradient +descent and FIFO memory buffers, which enhance the stability and robustness of +the overall process. Our empirical study on an analytic model-based simulator +demonstrates that the proposed framework outperforms the state-of-the-art +static approach, providing more accurate and resilient SLA decomposition even +under varying conditions and limited data scenarios. + +
+
+ comment: The paper has been submitted to IEEE ICMLCN 2025 +
+
+
+
+
+ + ♻ ☆ Deep learning empowered sensor fusion boosts infant movement + classification + + +
+ To assess the integrity of the developing nervous system, the Prechtl general +movement assessment (GMA) is recognized for its clinical value in diagnosing +neurological impairments in early infancy. GMA has been increasingly augmented +through machine learning approaches intending to scale-up its application, +circumvent costs in the training of human assessors and further standardize +classification of spontaneous motor patterns. Available deep learning tools, +all of which are based on single sensor modalities, are however still +considerably inferior to that of well-trained human assessors. These approaches +are hardly comparable as all models are designed, trained and evaluated on +proprietary/silo-data sets. With this study we propose a sensor fusion approach +for assessing fidgety movements (FMs). FMs were recorded from 51 typically +developing participants. We compared three different sensor modalities +(pressure, inertial, and visual sensors). Various combinations and two sensor +fusion approaches (late and early fusion) for infant movement classification +were tested to evaluate whether a multi-sensor system outperforms single +modality assessments. Convolutional neural network (CNN) architectures were +used to classify movement patterns. The performance of the three-sensor fusion +(classification accuracy of 94.5%) was significantly higher than that of any +single modality evaluated. We show that the sensor fusion approach is a +promising avenue for automated classification of infant motor patterns. The +development of a robust sensor fusion system may significantly enhance AI-based +early recognition of neurofunctions, ultimately facilitating automated early +detection of neurodevelopmental conditions. + +
+
+
+
+
+ + ♻ ☆ AdamMCMC: Combining Metropolis Adjusted Langevin with Momentum-based + Optimization + + +
+ Uncertainty estimation is a key issue when considering the application of +deep neural network methods in science and engineering. In this work, we +introduce a novel algorithm that quantifies epistemic uncertainty via Monte +Carlo sampling from a tempered posterior distribution. It combines the well +established Metropolis Adjusted Langevin Algorithm (MALA) with momentum-based +optimization using Adam and leverages a prolate proposal distribution, to +efficiently draw from the posterior. We prove that the constructed chain admits +the Gibbs posterior as invariant distribution and approximates this posterior +in total variation distance. Furthermore, we demonstrate the efficiency of the +resulting algorithm and the merit of the proposed changes on a state-of-the-art +classifier from high-energy particle physics. + +
+
+ comment: 16 pages, 5 figures; adapted Theorem 2 +
+
+
+
+
+ + ♻ ☆ Iterative Reweighted Framework Based Algorithms for Sparse Linear + Regression with Generalized Elastic Net Penalty + + +
+ The elastic net penalty is frequently employed in high-dimensional statistics
+for parameter regression and variable selection. It is particularly beneficial
+compared to lasso when the number of predictors greatly surpasses the number of
+observations. However, empirical evidence has shown that the $\ell_q$-norm
+penalty (where $0 < q < 1$) often provides better regression compared to the
+$\ell_1$-norm penalty, demonstrating enhanced robustness in various scenarios.
+In this paper, we explore a generalized elastic net model that employs a
+$\ell_r$-norm (where $r \geq 1$) in loss function to accommodate various types
+of noise, and employs a $\ell_q$-norm (where $0 < q < 1$) to replace the
+$\ell_1$-norm in elastic net penalty. Theoretically, we establish the
+computable lower bounds for the nonzero entries of the generalized first-order
+stationary points of the proposed generalized elastic net model. For
+implementation, we develop two efficient algorithms based on the locally
+Lipschitz continuous $\epsilon$-approximation to $\ell_q$-norm. The first
+algorithm employs an alternating direction method of multipliers (ADMM), while
+the second utilizes a proximal majorization-minimization method (PMM), where
+the subproblems are addressed using the semismooth Newton method (SSN). We also
+perform extensive numerical experiments with both simulated and real data,
+showing that both algorithms demonstrate superior performance. Notably, the
+PMM-SSN is more efficient than ADMM, even though the latter provides a simpler
+implementation.
+
+
+
+
+
+ + ♻ ☆ Kernel-Based Optimal Control: An Infinitesimal Generator Approach + + +
+ This paper presents a novel approach for optimal control of nonlinear +stochastic systems using infinitesimal generator learning within +infinite-dimensional reproducing kernel Hilbert spaces. Our learning framework +leverages data samples of system dynamics and stage cost functions, with only +control penalties and constraints provided. The proposed method directly learns +the diffusion operator of a controlled Fokker-Planck-Kolmogorov equation in an +infinite-dimensional hypothesis space. This operator models the continuous-time +evolution of the probability measure of the control system's state. We +demonstrate that this approach seamlessly integrates with modern convex +operator-theoretic Hamilton-Jacobi-Bellman recursions, enabling a data-driven +solution to the optimal control problem. Furthermore, our statistical learning +framework includes nonparametric estimators for uncontrolled forward +infinitesimal generators as a special case. Numerical experiments, ranging from +synthetic differential equations to simulated robotic systems, showcase the +advantages of our approach compared to both modern data-driven and classical +nonlinear programming methods for optimal control. + +
+
+
+
+
+ + ♻ ☆ Relax and Merge: A Simple Yet Effective Framework for Solving Fair + $k$-Means and $k$-sparse Wasserstein Barycenter Problems + + +
+ The fairness of clustering algorithms has gained widespread attention across
+various areas, including machine learning. In this paper, we study fair
+$k$-means clustering in Euclidean space. Given a dataset comprising several
+groups, the fairness constraint requires that each cluster should contain a
+proportion of points from each group within specified lower and upper bounds.
+Due to these fairness constraints, determining the optimal locations of $k$
+centers is a quite challenging task. We propose a novel ``Relax and Merge''
+framework that returns a $(1+4\rho + O(\epsilon))$-approximate solution, where
+$\rho$ is the approximate ratio of an off-the-shelf vanilla $k$-means algorithm
+and $O(\epsilon)$ can be an arbitrarily small positive number. If equipped with
+a PTAS of $k$-means, our solution can achieve an approximation ratio of
+$(5+O(\epsilon))$ with only a slight violation of the fairness constraints,
+which improves the current state-of-the-art approximation guarantee.
+Furthermore, using our framework, we can also obtain a $(1+4\rho
++O(\epsilon))$-approximate solution for the $k$-sparse Wasserstein Barycenter
+problem, which is a fundamental optimization problem in the field of optimal
+transport, and a $(2+6\rho)$-approximate solution for the strictly fair
+$k$-means clustering with no violation, both of which are better than the
+current state-of-the-art methods. In addition, the empirical results
+demonstrate that our proposed algorithm can significantly outperform baseline
+approaches in terms of clustering cost.
+
+
+
+
+
+ + ♻ ☆ Scaling Laws for Task-Optimized Models of the Primate Visual Ventral + Stream + + +
+ When trained on large-scale object classification datasets, certain +artificial neural network models begin to approximate core object recognition +(COR) behaviors and neural response patterns in the primate visual ventral +stream (VVS). While recent machine learning advances suggest that scaling model +size, dataset size, and compute resources improve task performance, the impact +of scaling on brain alignment remains unclear. In this study, we explore +scaling laws for modeling the primate VVS by systematically evaluating over 600 +models trained under controlled conditions on benchmarks spanning V1, V2, V4, +IT and COR behaviors. We observe that while behavioral alignment continues to +scale with larger models, neural alignment saturates. This observation remains +true across model architectures and training datasets, even though models with +stronger inductive bias and datasets with higher-quality images are more +compute-efficient. Increased scaling is especially beneficial for higher-level +visual areas, where small models trained on few samples exhibit only poor +alignment. Finally, we develop a scaling recipe, indicating that a greater +proportion of compute should be allocated to data samples over model size. Our +results suggest that while scaling alone might suffice for alignment with human +core object recognition behavior, it will not yield improved models of the +brain's visual ventral stream with current architectures and datasets, +highlighting the need for novel strategies in building brain-like models. + +
+
+ comment: 10 pages for the main paper, 23 pages in total. 7 main figures and 7 + supplementary figures. Code, model weights, and benchmark results can be + accessed at https://github.com/epflneuroailab/scaling-primate-vvs - In + version 2, Figure 7 and the related discussion are added, and the appendix is + updated +
+
+
+
+
+ + ♻ ☆ Tight PAC-Bayesian Risk Certificates for Contrastive Learning + + +
+ Contrastive representation learning is a modern paradigm for learning +representations of unlabeled data via augmentations -- precisely, contrastive +models learn to embed semantically similar pairs of samples (positive pairs) +closer than independently drawn samples (negative samples). In spite of its +empirical success and widespread use in foundation models, statistical theory +for contrastive learning remains less explored. Recent works have developed +generalization error bounds for contrastive losses, but the resulting risk +certificates are either vacuous (certificates based on Rademacher complexity or +$f$-divergence) or require strong assumptions about samples that are +unreasonable in practice. The present paper develops non-vacuous PAC-Bayesian +risk certificates for contrastive representation learning, considering the +practical considerations of the popular SimCLR framework. Notably, we take into +account that SimCLR reuses positive pairs of augmented data as negative samples +for other data, thereby inducing strong dependence and making classical PAC or +PAC-Bayesian bounds inapplicable. We further refine existing bounds on the +downstream classification loss by incorporating SimCLR-specific factors, +including data augmentation and temperature scaling, and derive risk +certificates for the contrastive zero-one risk. The resulting bounds for +contrastive loss and downstream prediction are much tighter than those of +previous risk certificates, as demonstrated by experiments on CIFAR-10. + +
+
+
+
+
+ + ♻ ☆ LoRA-Ensemble: Efficient Uncertainty Modelling for Self-attention + Networks + + +
+ Numerous crucial tasks in real-world decision-making rely on machine learning +algorithms with calibrated uncertainty estimates. However, modern methods often +yield overconfident and uncalibrated predictions. Various approaches involve +training an ensemble of separate models to quantify the uncertainty related to +the model itself, known as epistemic uncertainty. In an explicit +implementation, the ensemble approach has high computational cost and high +memory requirements. This particular challenge is evident in state-of-the-art +neural networks such as transformers, where even a single network is already +demanding in terms of compute and memory. Consequently, efforts are made to +emulate the ensemble model without actually instantiating separate ensemble +members, referred to as implicit ensembling. We introduce LoRA-Ensemble, a +parameter-efficient deep ensemble method for self-attention networks, which is +based on Low-Rank Adaptation (LoRA). Initially developed for efficient LLM +fine-tuning, we extend LoRA to an implicit ensembling approach. By employing a +single pre-trained self-attention network with weights shared across all +members, we train member-specific low-rank matrices for the attention +projections. Our method exhibits superior calibration compared to explicit +ensembles and achieves similar or better accuracy across various prediction +tasks and datasets. + +
+
+ comment: under review +
+
+
+
+
+ + ♻ ☆ PDNNet: PDN-Aware GNN-CNN Heterogeneous Network for Dynamic IR Drop + Prediction + + +
+ IR drop on the power delivery network (PDN) is closely related to PDN's +configuration and cell current consumption. As the integrated circuit (IC) +design is growing larger, dynamic IR drop simulation becomes computationally +unaffordable and machine learning based IR drop prediction has been explored as +a promising solution. Although CNN-based methods have been adapted to IR drop +prediction task in several works, the shortcomings of overlooking PDN +configuration is non-negligible. In this paper, we consider not only how to +properly represent cell-PDN relation, but also how to model IR drop following +its physical nature in the feature aggregation procedure. Thus, we propose a +novel graph structure, PDNGraph, to unify the representations of the PDN +structure and the fine-grained cell-PDN relation. We further propose a +dual-branch heterogeneous network, PDNNet, incorporating two parallel GNN-CNN +branches to favorably capture the above features during the learning process. +Several key designs are presented to make the dynamic IR drop prediction highly +effective and interpretable. We are the first work to apply graph structure to +deep-learning based dynamic IR drop prediction method. Experiments show that +PDNNet outperforms the state-of-the-art CNN-based methods and achieves 545x +speedup compared to the commercial tool, which demonstrates the superiority of +our method. + +
+
+
+
+
+ + ♻ ☆ R-MTLLMF: Resilient Multi-Task Large Language Model Fusion at the + Wireless Edge + + +
+ Multi-task large language models (MTLLMs) are important for many applications
+at the wireless edge, where users demand specialized models to handle multiple
+tasks efficiently. However, training MTLLMs is complex and exhaustive,
+particularly when tasks are subject to change. Recently, the concept of model
+fusion via task vectors has emerged as an efficient approach for combining
+fine-tuning parameters to produce an MTLLM. In this paper, the problem of
+enabling edge users to collaboratively craft such MTLLMs via task vectors is
+studied, under the assumption of worst-case adversarial attacks. To this end,
+first the influence of adversarial noise to multi-task model fusion is
+investigated and a relationship between the so-called weight disentanglement
+error and the mean squared error (MSE) is derived. Using hypothesis testing, it
+is directly shown that the MSE increases interference between task vectors,
+thereby rendering model fusion ineffective. Then, a novel resilient MTLLM
+fusion (R-MTLLMF) is proposed, which leverages insights about the LLM
+architecture and fine-tuning process to safeguard task vector aggregation under
+adversarial noise by realigning the MTLLM. The proposed R-MTLLMF is then
+compared for both worst-case and ideal transmission scenarios to study the
+impact of the wireless channel. Extensive model fusion experiments with vision
+LLMs demonstrate R-MTLLMF's effectiveness, achieving close-to-baseline
+performance across eight different tasks in ideal noise scenarios and
+significantly outperforming unprotected model fusion in worst-case scenarios.
+The results further advocate for additional physical layer protection for a
+holistic approach to resilience, from both a wireless and LLM perspective.
+
+
+
+
+
+ + ♻ ☆ Continual Low-Rank Scaled Dot-product Attention + + +
+ Transformers are widely used for their ability to capture data relations in +sequence processing, with great success for a wide range of static tasks. +However, the computational and memory footprint of their main component, i.e., +the Scaled Dot-product Attention, is commonly overlooked. This makes their +adoption in applications involving stream data processing with constraints in +response latency, computational and memory resources infeasible. Some works +have proposed methods to lower the computational cost of transformers, i.e. +low-rank approximations, sparsity in attention, and efficient formulations for +Continual Inference. In this paper, we introduce a new formulation of the +Scaled Dot-product Attention based on the Nystr\"om approximation that is +suitable for Continual Inference. In experiments on Online Audio Classification +and Online Action Detection tasks, the proposed Continual Scaled Dot-product +Attention can lower the number of operations by up to three orders of magnitude +compared to the original Transformers while retaining the predictive +performance of competing models. + +
+
+ comment: 11 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Improving Fine-Grained Control via Aggregation of Multiple Diffusion + Models + + +
+ While many diffusion models perform well when controlling for particular +aspect among style, character, and interaction, they struggle with fine-grained +control due to dataset limitations and intricate model architecture design. +This paper introduces a novel algorithm, Aggregation of Multiple Diffusion +Models (AMDM), which synthesizes features from multiple diffusion models into a +specified model, activating specific features for fine-grained control. +Experimental results demonstrate that AMDM significantly improves fine-grained +control without training, proving its effectiveness. Additionally, it reveals +that diffusion models initially focus on features such as position, attributes, +and style, with later stages improving generation quality and consistency. AMDM +offers a new perspective for tackling the challenges of fine-grained +conditional control generation in diffusion models: We can fully utilize +existing or develop new conditional diffusion models that control specific +aspects, and then aggregate them using AMDM algorithm. This eliminates the need +for constructing complex datasets, designing intricate model architectures, and +incurring high training costs. Code is available at: +https://github.com/Hammour-steak/AMDM. + +
+
+
+
+
+ + ♻ ☆ Adaptive Optimizers with Sparse Group Lasso for Neural Networks in CTR + Prediction ECML + + +
+ We develop a novel framework that adds the regularizers of the sparse group +lasso to a family of adaptive optimizers in deep learning, such as Momentum, +Adagrad, Adam, AMSGrad, AdaHessian, and create a new class of optimizers, which +are named Group Momentum, Group Adagrad, Group Adam, Group AMSGrad and Group +AdaHessian, etc., accordingly. We establish theoretically proven convergence +guarantees in the stochastic convex settings, based on primal-dual methods. We +evaluate the regularized effect of our new optimizers on three large-scale +real-world ad click datasets with state-of-the-art deep learning models. The +experimental results reveal that compared with the original optimizers with the +post-processing procedure which uses the magnitude pruning method, the +performance of the models can be significantly improved on the same sparsity +level. Furthermore, in comparison to the cases without magnitude pruning, our +methods can achieve extremely high sparsity with significantly better or highly +competitive performance. The code is available at +https://github.com/intelligent-machine-learning/tfplus/tree/main/tfplus. + +
+
+ comment: 24 pages. Published as a conference paper at ECML PKDD 2021. This + version includes Appendix which was not included in the published version + because of page limit +
+
+
+
+
+ + ♻ ☆ COOL: Efficient and Reliable Chain-Oriented Objective Logic with Neural + Networks Feedback Control for Program Synthesis + + +
+ Program synthesis methods, whether formal or neural-based, lack fine-grained +control and flexible modularity, which limits their adaptation to complex +software development. These limitations stem from rigid Domain-Specific +Language (DSL) frameworks and neural network incorrect predictions. To this +end, we propose the Chain of Logic (CoL), which organizes the synthesis process +into an activity flow and provides heuristic control to guide the process. +Furthermore, by integrating neural networks with libraries and introducing a +Neural Network Feedback Control (NNFC) mechanism, our approach modularizes +synthesis and mitigates the impact of neural network mispredictions. +Experiments on relational and symbolic synthesis tasks show that CoL +significantly enhances the efficiency and reliability of DSL program synthesis +across multiple metrics. Specifically, CoL improves accuracy by 70% while +reducing tree operations by 91% and time by 95%. Additionally, NNFC further +boosts accuracy by 6%, with a 64% reduction in tree operations under +challenging conditions such as insufficient training data, increased +difficulty, and multidomain synthesis. These improvements confirm COOL as a +highly efficient and reliable program synthesis framework. + +
+
+ comment: 31 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Quality In / Quality Out: Data quality more relevant than model choice + in anomaly detection with the UGR'16 + + +
+ Autonomous or self-driving networks are expected to provide a solution to the
+myriad of extremely demanding new applications with minimal human supervision.
+For this purpose, the community relies on the development of new Machine
+Learning (ML) models and techniques, like the celebrated Deep Learning (DL).
+However, ML can only be as good as the data it is fitted with, and data quality
+is an elusive concept difficult to assess. In this paper, we show that
+relatively minor modifications on a benchmark dataset (UGR'16, a flow-based
+real-traffic dataset for anomaly detection) cause significantly more impact on
+model performance than the specific ML technique considered. We also show that
+the measured model performance is uncertain, as a result of labelling
+inaccuracies. Our findings illustrate that the widely adopted approach of
+comparing a set of models in terms of performance results (e.g., in terms of
+accuracy or ROC curves) may lead to incorrect conclusions when done without a
+proper understanding of dataset biases and sensitivity. We contribute a
+methodology to interpret a model response that can be useful for this
+understanding.
+
+
+
+
+
+ + ♻ ☆ Differentially Private Synthetic Data via Foundation Model APIs 1: + Images ICLR 2024 + + +
+ Generating differentially private (DP) synthetic data that closely resembles +the original private data is a scalable way to mitigate privacy concerns in the +current data-driven world. In contrast to current practices that train +customized models for this task, we aim to generate DP Synthetic Data via APIs +(DPSDA), where we treat foundation models as blackboxes and only utilize their +inference APIs. Such API-based, training-free approaches are easier to deploy +as exemplified by the recent surge in the number of API-based apps. These +approaches can also leverage the power of large foundation models which are +only accessible via their inference APIs. However, this comes with greater +challenges due to strictly more restrictive model access and the need to +protect privacy from the API provider. + In this paper, we present a new framework called Private Evolution (PE) to +solve this problem and show its initial promise on synthetic images. +Surprisingly, PE can match or even outperform state-of-the-art (SOTA) methods +without any model training. For example, on CIFAR10 (with ImageNet as the +public data), we achieve FID <= 7.9 with privacy cost {\epsilon} = 0.67, +significantly improving the previous SOTA from {\epsilon} = 32. We further +demonstrate the promise of applying PE on large foundation models such as +Stable Diffusion to tackle challenging private datasets with a small number of +high-resolution images. The code and data are released at +https://github.com/microsoft/DPSDA. + +
+
+ comment: Published in ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Representation Alignment for Generation: Training Diffusion Transformers + Is Easier Than You Think + + +
+ Recent studies have shown that the denoising process in (generative) +diffusion models can induce meaningful (discriminative) representations inside +the model, though the quality of these representations still lags behind those +learned through recent self-supervised learning methods. We argue that one main +bottleneck in training large-scale diffusion models for generation lies in +effectively learning these representations. Moreover, training can be made +easier by incorporating high-quality external visual representations, rather +than relying solely on the diffusion models to learn them independently. We +study this by introducing a straightforward regularization called +REPresentation Alignment (REPA), which aligns the projections of noisy input +hidden states in denoising networks with clean image representations obtained +from external, pretrained visual encoders. The results are striking: our simple +strategy yields significant improvements in both training efficiency and +generation quality when applied to popular diffusion and flow-based +transformers, such as DiTs and SiTs. For instance, our method can speed up SiT +training by over 17.5$\times$, matching the performance (without +classifier-free guidance) of a SiT-XL model trained for 7M steps in less than +400K steps. In terms of final generation quality, our approach achieves +state-of-the-art results of FID=1.42 using classifier-free guidance with the +guidance interval. + +
+
+ comment: Preprint. Project page: https://sihyun.me/REPA +
+
+
+
+
+ + ♻ ☆ Sharpness-Aware Minimization Revisited: Weighted Sharpness as a + Regularization Term KDD '23 + + +
+ Deep Neural Networks (DNNs) generalization is known to be closely related to +the flatness of minima, leading to the development of Sharpness-Aware +Minimization (SAM) for seeking flatter minima and better generalization. In +this paper, we revisit the loss of SAM and propose a more general method, +called WSAM, by incorporating sharpness as a regularization term. We prove its +generalization bound through the combination of PAC and Bayes-PAC techniques, +and evaluate its performance on various public datasets. The results +demonstrate that WSAM achieves improved generalization, or is at least highly +competitive, compared to the vanilla optimizer, SAM and its variants. The code +is available at +https://github.com/intelligent-machine-learning/atorch/tree/main/atorch/optimizers. + +
+
+ comment: 10 pages. Accepted as a conference paper at KDD '23 +
+
+
+
+
+ + ♻ ☆ Context Matters: Leveraging Contextual Features for Time Series + Forecasting + + +
+ Time series forecasts are often influenced by exogenous contextual features +in addition to their corresponding history. For example, in financial settings, +it is hard to accurately predict a stock price without considering public +sentiments and policy decisions in the form of news articles, tweets, etc. +Though this is common knowledge, the current state-of-the-art (SOTA) +forecasting models fail to incorporate such contextual information, owing to +its heterogeneity and multimodal nature. To address this, we introduce +ContextFormer, a novel plug-and-play method to surgically integrate multimodal +contextual information into existing pre-trained forecasting models. +ContextFormer effectively distills forecast-specific information from rich +multimodal contexts, including categorical, continuous, time-varying, and even +textual information, to significantly enhance the performance of existing base +forecasters. ContextFormer outperforms SOTA forecasting models by up to 30% on +a range of real-world datasets spanning energy, traffic, environmental, and +financial domains. + +
+
+
+
+
+ + ♻ ☆ Developing a Thailand solar irradiance map using Himawari-8 satellite + imageries and deep learning models + + +
+ This paper presents an online platform showing Thailand solar irradiance map +every 30 minutes, available at https://www.cusolarforecast.com. The methodology +for estimating global horizontal irradiance (GHI) across Thailand relies on +cloud index extracted from Himawari-8 satellite imagery, Ineichen clear-sky +model with locally-tuned Linke turbidity, and machine learning models. The +methods take clear-sky irradiance, cloud index, re-analyzed GHI and temperature +data from the MERRA-2 database, and date-time as inputs for GHI estimation +models, including LightGBM, LSTM, Informer, and Transformer. These are +benchmarked with the estimate from a commercial service X by evaluation of +15-minute ground GHI data from 53 ground stations over 1.5 years during +2022-2023. The results show that the four models exhibit comparable overall MAE +performance to the service X. The best model is LightGBM with an overall MAE of +78.58 W/sqm and RMSE of 118.97 W/sqm, while the service X achieves the lowest +MAE, RMSE, and MBE in cloudy condition. Obtaining re-analyzed MERRA-2 data for +the whole Thailand region is not economically feasible for deployment. When +removing these features, the Informer model has a winning performance in MAE of +78.67 W/sqm. The obtained performance aligns with existing literature by taking +the climate zone and time granularity of data into consideration. As the map +shows an estimate of GHI over 93,000 grids with a frequent update, the paper +also describes a computational framework for displaying the entire map. It +tests the runtime performance of deep learning models in the GHI estimation +process. + +
+
+ comment: 23 pages, 14 figures +
+
+
+
+
+ + ♻ ☆ HoPE: A Novel Positional Encoding Without Long-Term Decay for Enhanced + Context Awareness and Extrapolation + + +
+ Many positional encodings (PEs) are designed to exhibit long-term decay,
+based on an entrenched and long-standing inductive opinion: tokens farther away
+from the current position carry less relevant information. We argue that
+long-term decay is outdated in the era of LLMs, as LLMs are now applied to
+tasks demanding precise retrieval of in-context information from arbitrary
+positions. Firstly, we present empirical analyses on various PEs, demonstrating
+that models inherently learn attention with only a local-decay pattern while
+forming a U-shape pattern globally, contradicting the principle of long-term
+decay. Furthermore, we conduct a detailed analysis of rotary position encoding
+(RoPE, a prevalent relative positional encoding in LLMs), and found that the
+U-shape attention is caused by some learned components, which are also the key
+factor limiting RoPE's expressiveness and extrapolation. Inspired by these
+insights, we propose High-frequency rotary Position Encoding (HoPE). HoPE
+replaces the specific components in RoPE with position-independent ones,
+retaining only high-frequency signals, which also breaks the principle of
+long-term decay in theory. HoPE achieves two major advantages: (1) Without
+constraints imposed by long-term decay, contradictory factors that limit
+spontaneous attention optimization and model extrapolation performance are
+removed. (2) Components representing positions and semantics are optimized.
+These enhance the model's context awareness and extrapolation, as validated by
+extensive experiments.
+
+
+
+
+
+ + ♻ ☆ From interpretability to inference: an estimation framework for + universal approximators + + +
+ We present a novel framework for estimation and inference with the broad
+class of universal approximators. Estimation is based on the decomposition of
+model predictions into Shapley values. Inference relies on analyzing the bias
+and variance properties of individual Shapley components. We show that Shapley
+value estimation is asymptotically unbiased, and we introduce Shapley
+regressions as a tool to uncover the true data generating process from noisy
+data alone. The well-known case of linear regression is a special case of our
+framework when the model is linear in parameters. We present theoretical,
+numerical, and empirical results for the estimation of heterogeneous treatment
+effects as our guiding example.
+
+
+
+ comment: 37 pages, 5 figures, 3 tables, 1 algorithm +
+
+
+
+
+ + ♻ ☆ Practical Operator Sketching Framework for Accelerating Iterative + Data-Driven Solutions in Inverse Problems + + +
+ We propose a new operator-sketching paradigm for designing efficient
+iterative data-driven reconstruction (IDR) schemes, e.g. Plug-and-Play
+algorithms and deep unrolling networks. These IDR schemes are currently the
+state-of-the-art solutions for imaging inverse problems. However, for
+high-dimensional imaging tasks, especially X-ray CT and MRI imaging, these IDR
+schemes typically become computationally inefficient, due to the need to
+compute the high-dimensional forward and adjoint operators multiple times. In
+this work, we explore and propose a universal dimensionality reduction
+framework for accelerating IDR schemes in solving imaging inverse problems,
+based on leveraging the sketching techniques from stochastic optimization.
+Using this framework, we derive a number of accelerated IDR schemes, such as
+the plug-and-play multi-stage sketched gradient (PnP-MS2G) and sketching-based
+primal-dual (LSPD and Sk-LSPD) deep unrolling networks. Meanwhile, for fully
+accelerating PnP schemes when the denoisers are computationally expensive, we
+provide novel stochastic lazy denoising schemes (Lazy-PnP and Lazy-PnP-EQ),
+leveraging the ProxSkip scheme in optimization and equivariant image denoisers,
+which can massively accelerate the PnP algorithms with improved practicality.
+We provide theoretical analysis for recovery guarantees of instances of the
+proposed framework. Our numerical experiments on natural image processing and
+tomographic image reconstruction demonstrate the remarkable effectiveness of
+our sketched IDR schemes.
+
+
+
+
+
+
+ + ♻ ☆ Elements of Sequential Monte Carlo + + +
+ A core problem in statistics and probabilistic machine learning is to compute +probability distributions and expectations. This is the fundamental problem of +Bayesian statistics and machine learning, which frames all inference as +expectations with respect to the posterior distribution. The key challenge is +to approximate these intractable expectations. In this tutorial, we review +sequential Monte Carlo (SMC), a random-sampling-based class of methods for +approximate inference. First, we explain the basics of SMC, discuss practical +issues, and review theoretical results. We then examine two of the main user +design choices: the proposal distributions and the so called intermediate +target distributions. We review recent results on how variational inference and +amortization can be used to learn efficient proposals and target distributions. +Next, we discuss the SMC estimate of the normalizing constant, how this can be +used for pseudo-marginal inference and inference evaluation. Throughout the +tutorial we illustrate the use of SMC on various models commonly used in +machine learning, such as stochastic recurrent neural networks, probabilistic +graphical models, and probabilistic programs. + +
+
+ comment: Foundations and Trends in Machine Learning +
+
+
+
+
+ + ♻ ☆ Combining Stochastic Defenses to Resist Gradient Inversion: An Ablation + Study + + +
+ Gradient Inversion (GI) attacks are a ubiquitous threat in Federated Learning +(FL) as they exploit gradient leakage to reconstruct supposedly private +training data. Common defense mechanisms such as Differential Privacy (DP) or +stochastic Privacy Modules (PMs) introduce randomness during gradient +computation to prevent such attacks. However, we pose that if an attacker +effectively mimics a client's stochastic gradient computation, the attacker can +circumvent the defense and reconstruct clients' private training data. This +paper introduces several targeted GI attacks that leverage this principle to +bypass common defense mechanisms. As a result, we demonstrate that no +individual defense provides sufficient privacy protection. To address this +issue, we propose to combine multiple defenses. We conduct an extensive +ablation study to evaluate the influence of various combinations of defenses on +privacy protection and model utility. We observe that only the combination of +DP and a stochastic PM was sufficient to decrease the Attack Success Rate (ASR) +from 100% to 0%, thus preserving privacy. Moreover, we found that this +combination of defenses consistently achieves the best trade-off between +privacy and model utility. + +
+
+ comment: This version represents a comprehensive rework of the initial study, + including substantial updates to the methodology, analysis, and conclusions. + 26 pages, 2 figures, 5 tables +
+
+
+
+
+
+
+
+ + Multimedia 3 + +
+
+
+ + ☆ Feature Coding in the Era of Large Models: Dataset, Test Conditions, and + Benchmark + + +
+ Large models have achieved remarkable performance across various tasks, yet +they incur significant computational costs and privacy concerns during both +training and inference. Distributed deployment has emerged as a potential +solution, but it necessitates the exchange of intermediate information between +model segments, with feature representations serving as crucial information +carriers. To optimize information exchange, feature coding methods are applied +to reduce transmission and storage overhead. Despite its importance, feature +coding for large models remains an under-explored area. In this paper, we draw +attention to large model feature coding and make three contributions to this +field. First, we introduce a comprehensive dataset encompassing diverse +features generated by three representative types of large models. Second, we +establish unified test conditions, enabling standardized evaluation pipelines +and fair comparisons across future feature coding studies. Third, we introduce +two baseline methods derived from widely used image coding techniques and +benchmark their performance on the proposed dataset. These contributions aim to +advance the field of feature coding, facilitating more efficient large model +deployment. All source code and the dataset will be made available on GitHub. + +
+
+
+
+
+ + ♻ ☆ Identity-Preserving Text-to-Video Generation by Frequency Decomposition + + +
+ Identity-preserving text-to-video (IPT2V) generation aims to create +high-fidelity videos with consistent human identity. It is an important task in +video generation but remains an open problem for generative models. This paper +pushes the technical frontier of IPT2V in two directions that have not been +resolved in literature: (1) A tuning-free pipeline without tedious case-by-case +finetuning, and (2) A frequency-aware heuristic identity-preserving DiT-based +control scheme. We propose ConsisID, a tuning-free DiT-based controllable IPT2V +model to keep human identity consistent in the generated video. Inspired by +prior findings in frequency analysis of diffusion transformers, it employs +identity-control signals in the frequency domain, where facial features can be +decomposed into low-frequency global features and high-frequency intrinsic +features. First, from a low-frequency perspective, we introduce a global facial +extractor, which encodes reference images and facial key points into a latent +space, generating features enriched with low-frequency information. These +features are then integrated into shallow layers of the network to alleviate +training challenges associated with DiT. Second, from a high-frequency +perspective, we design a local facial extractor to capture high-frequency +details and inject them into transformer blocks, enhancing the model's ability +to preserve fine-grained features. We propose a hierarchical training strategy +to leverage frequency information for identity preservation, transforming a +vanilla pre-trained video generation model into an IPT2V model. Extensive +experiments demonstrate that our frequency-aware heuristic scheme provides an +optimal control solution for DiT-based models. Thanks to this scheme, our +ConsisID generates high-quality, identity-preserving videos, making strides +towards more effective IPT2V. + +
+
+ comment: 12 pages, 8 figures, Code: https://github.com/PKU-YuanGroup/ConsisID +
+
+
+
+
+ + ♻ ☆ Memories are One-to-Many Mapping Alleviators in Talking Face Generation + + +
+ Talking face generation aims at generating photo-realistic video portraits of +a target person driven by input audio. Due to its nature of one-to-many mapping +from the input audio to the output video (e.g., one speech content may have +multiple feasible visual appearances), learning a deterministic mapping like +previous works brings ambiguity during training, and thus causes inferior +visual results. Although this one-to-many mapping could be alleviated in part +by a two-stage framework (i.e., an audio-to-expression model followed by a +neural-rendering model), it is still insufficient since the prediction is +produced without enough information (e.g., emotions, wrinkles, etc.). In this +paper, we propose MemFace to complement the missing information with an +implicit memory and an explicit memory that follow the sense of the two stages +respectively. More specifically, the implicit memory is employed in the +audio-to-expression model to capture high-level semantics in the +audio-expression shared space, while the explicit memory is employed in the +neural-rendering model to help synthesize pixel-level details. Our experimental +results show that our proposed MemFace surpasses all the state-of-the-art +results across multiple scenarios consistently and significantly. + +
+
+ comment: IEEE Transactions on Pattern Analysis and Machine Intelligence + (2024). Project page: see https://memoryface.github.io +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 150 + +
+
+
+ + ☆ Navigation World Models + + +
+ Navigation is a fundamental skill of agents with visual-motor capabilities. +We introduce a Navigation World Model (NWM), a controllable video generation +model that predicts future visual observations based on past observations and +navigation actions. To capture complex environment dynamics, NWM employs a +Conditional Diffusion Transformer (CDiT), trained on a diverse collection of +egocentric videos of both human and robotic agents, and scaled up to 1 billion +parameters. In familiar environments, NWM can plan navigation trajectories by +simulating them and evaluating whether they achieve the desired goal. Unlike +supervised navigation policies with fixed behavior, NWM can dynamically +incorporate constraints during planning. Experiments demonstrate its +effectiveness in planning trajectories from scratch or by ranking trajectories +sampled from an external policy. Furthermore, NWM leverages its learned visual +priors to imagine trajectories in unfamiliar environments from a single input +image, making it a flexible and powerful tool for next-generation navigation +systems. + +
+
+ comment: project page: https://www.amirbar.net/nwm/ +
+
+
+
+
+ + ☆ Style3D: Attention-guided Multi-view Style Transfer for 3D Object + Generation + + +
+ We present Style3D, a novel approach for generating stylized 3D objects from +a content image and a style image. Unlike most previous methods that require +case- or style-specific training, Style3D supports instant 3D object +stylization. Our key insight is that 3D object stylization can be decomposed +into two interconnected processes: multi-view dual-feature alignment and +sparse-view spatial reconstruction. We introduce MultiFusion Attention, an +attention-guided technique to achieve multi-view stylization from the +content-style pair. Specifically, the query features from the content image +preserve geometric consistency across multiple views, while the key and value +features from the style image are used to guide the stylistic transfer. This +dual-feature alignment ensures that spatial coherence and stylistic fidelity +are maintained across multi-view images. Finally, a large 3D reconstruction +model is introduced to generate coherent stylized 3D objects. By establishing +an interplay between structural and stylistic features across multiple views, +our approach enables a holistic 3D stylization process. Extensive experiments +demonstrate that Style3D offers a more flexible and scalable solution for +generating style-consistent 3D assets, surpassing existing methods in both +computational efficiency and visual quality. + +
+
+
+
+
+ + ☆ Sparse-view Pose Estimation and Reconstruction via Analysis by + Generative Synthesis NeurIPS 2024 + + +
+ Inferring the 3D structure underlying a set of multi-view images typically +requires solving two co-dependent tasks -- accurate 3D reconstruction requires +precise camera poses, and predicting camera poses relies on (implicitly or +explicitly) modeling the underlying 3D. The classical framework of analysis by +synthesis casts this inference as a joint optimization seeking to explain the +observed pixels, and recent instantiations learn expressive 3D representations +(e.g., Neural Fields) with gradient-descent-based pose refinement of initial +pose estimates. However, given a sparse set of observed views, the observations +may not provide sufficient direct evidence to obtain complete and accurate 3D. +Moreover, large errors in pose estimation may not be easily corrected and can +further degrade the inferred 3D. To allow robust 3D reconstruction and pose +estimation in this challenging setup, we propose SparseAGS, a method that +adapts this analysis-by-synthesis approach by: a) including +novel-view-synthesis-based generative priors in conjunction with photometric +objectives to improve the quality of the inferred 3D, and b) explicitly +reasoning about outliers and using a discrete search with a continuous +optimization-based strategy to correct them. We validate our framework across +real-world and synthetic datasets in combination with several off-the-shelf +pose estimation systems as initialization. We find that it significantly +improves the base systems' pose accuracy while yielding high-quality 3D +reconstructions that outperform the results from current multi-view +reconstruction baselines. + +
+
+ comment: NeurIPS 2024. Project website: https://qitaozhao.github.io/SparseAGS +
+
+
+
+
+ + ☆ Streaming Detection of Queried Event Start + + +
+ Robotics, autonomous driving, augmented reality, and many embodied computer +vision applications must quickly react to user-defined events unfolding in real +time. We address this setting by proposing a novel task for multimodal video +understanding-Streaming Detection of Queried Event Start (SDQES). The goal of +SDQES is to identify the beginning of a complex event as described by a natural +language query, with high accuracy and low latency. We introduce a new +benchmark based on the Ego4D dataset, as well as new task-specific metrics to +study streaming multimodal detection of diverse events in an egocentric video +setting. Inspired by parameter-efficient fine-tuning methods in NLP and for +video tasks, we propose adapter-based baselines that enable image-to-video +transfer learning, allowing for efficient online video modeling. We evaluate +three vision-language backbones and three adapter architectures on both +short-clip and untrimmed video settings. + +
+
+
+
+
+ + ☆ FreeSim: Toward Free-viewpoint Camera Simulation in Driving Scenes + + +
+ We propose FreeSim, a camera simulation method for autonomous driving. +FreeSim emphasizes high-quality rendering from viewpoints beyond the recorded +ego trajectories. In such viewpoints, previous methods have unacceptable +degradation because the training data of these viewpoints is unavailable. To +address such data scarcity, we first propose a generative enhancement model +with a matched data construction strategy. The resulting model can generate +high-quality images in a viewpoint slightly deviated from the recorded +trajectories, conditioned on the degraded rendering of this viewpoint. We then +propose a progressive reconstruction strategy, which progressively adds +generated images of unrecorded views into the reconstruction process, starting +from slightly off-trajectory viewpoints and moving progressively farther away. +With this progressive generation-reconstruction pipeline, FreeSim supports +high-quality off-trajectory view synthesis under large deviations of more than +3 meters. + +
+
+ comment: Project page: https://drive-sim.github.io/freesim +
+
+
+
+
+ + ☆ Inst-IT: Boosting Multimodal Instance Understanding via Explicit Visual + Prompt Instruction Tuning + + +
+ Large Multimodal Models (LMMs) have made significant breakthroughs with the +advancement of instruction tuning. However, while existing models can +understand images and videos at a holistic level, they still struggle with +instance-level understanding that requires a more nuanced comprehension and +alignment. Instance-level understanding is crucial, as it focuses on the +specific elements that we are most interested in. Excitingly, existing works +find that the state-of-the-art LMMs exhibit strong instance understanding +capabilities when provided with explicit visual cues. Motivated by this, we +introduce an automated annotation pipeline assisted by GPT-4o to extract +instance-level information from images and videos through explicit visual +prompting for instance guidance. Building upon this pipeline, we proposed +Inst-IT, a solution to enhance LMMs in Instance understanding via explicit +visual prompt Instruction Tuning. Inst-IT consists of a benchmark to diagnose +multimodal instance-level understanding, a large-scale instruction-tuning +dataset, and a continuous instruction-tuning training paradigm to effectively +enhance spatial-temporal instance understanding capabilities of existing LMMs. +Experimental results show that, with the boost of Inst-IT, our models not only +achieve outstanding performance on Inst-IT Bench but also demonstrate +significant improvements across various generic image and video understanding +benchmarks. This highlights that our dataset not only boosts instance-level +understanding but also strengthens the overall capabilities of generic image +and video comprehension. + +
+
+ comment: Project page at https://inst-it.github.io +
+
+
+
+
+ + ☆ FLAIR: VLM with Fine-grained Language-informed Image Representations + + +
+ CLIP has shown impressive results in aligning images and texts at scale. +However, its ability to capture detailed visual features remains limited +because CLIP matches images and texts at a global level. To address this issue, +we propose FLAIR, Fine-grained Language-informed Image Representations, an +approach that utilizes long and detailed image descriptions to learn localized +image embeddings. By sampling diverse sub-captions that describe fine-grained +details about an image, we train our vision-language model to produce not only +global embeddings but also text-specific image representations. Our model +introduces text-conditioned attention pooling on top of local image tokens to +produce fine-grained image representations that excel at retrieving detailed +image content. We achieve state-of-the-art performance on both, existing +multimodal retrieval benchmarks, as well as, our newly introduced fine-grained +retrieval task which evaluates vision-language models' ability to retrieve +partial image content. Furthermore, our experiments demonstrate the +effectiveness of FLAIR trained on 30M image-text pairs in capturing +fine-grained visual information, including zero-shot semantic segmentation, +outperforming models trained on billions of pairs. Code is available at +https://github.com/ExplainableML/flair . + +
+
+
+
+
+ + ☆ MIDI: Multi-Instance Diffusion for Single Image to 3D Scene Generation + + +
+ This paper introduces MIDI, a novel paradigm for compositional 3D scene +generation from a single image. Unlike existing methods that rely on +reconstruction or retrieval techniques or recent approaches that employ +multi-stage object-by-object generation, MIDI extends pre-trained image-to-3D +object generation models to multi-instance diffusion models, enabling the +simultaneous generation of multiple 3D instances with accurate spatial +relationships and high generalizability. At its core, MIDI incorporates a novel +multi-instance attention mechanism, that effectively captures inter-object +interactions and spatial coherence directly within the generation process, +without the need for complex multi-step processes. The method utilizes partial +object images and global scene context as inputs, directly modeling object +completion during 3D generation. During training, we effectively supervise the +interactions between 3D instances using a limited amount of scene-level data, +while incorporating single-object data for regularization, thereby maintaining +the pre-trained generalization ability. MIDI demonstrates state-of-the-art +performance in image-to-scene generation, validated through evaluations on +synthetic data, real-world scene data, and stylized scene images generated by +text-to-image diffusion models. + +
+
+ comment: Project page: https://huanngzh.github.io/MIDI-Page/ +
+
+
+
+
+ + ☆ PaliGemma 2: A Family of Versatile VLMs for Transfer + + +
+ PaliGemma 2 is an upgrade of the PaliGemma open Vision-Language Model (VLM) +based on the Gemma 2 family of language models. We combine the SigLIP-So400m +vision encoder that was also used by PaliGemma with the whole range of Gemma 2 +models, from the 2B one all the way up to the 27B model. We train these models +at three resolutions (224px, 448px, and 896px) in multiple stages to equip them +with broad knowledge for transfer via fine-tuning. The resulting family of base +models covering different model sizes and resolutions allows us to investigate +factors impacting transfer performance (such as learning rate) and to analyze +the interplay between the type of task, model size, and resolution. We further +increase the number and breadth of transfer tasks beyond the scope of PaliGemma +including different OCR-related tasks such as table structure recognition, +molecular structure recognition, music score recognition, as well as long +fine-grained captioning and radiography report generation, on which PaliGemma 2 +obtains state-of-the-art results. + +
+
+
+
+
+ + ☆ Imagine360: Immersive 360 Video Generation from Perspective Anchor + + +
+ $360^\circ$ videos offer a hyper-immersive experience that allows the viewers +to explore a dynamic scene from full 360 degrees. To achieve more user-friendly +and personalized content creation in $360^\circ$ video format, we seek to lift +standard perspective videos into $360^\circ$ equirectangular videos. To this +end, we introduce Imagine360, the first perspective-to-$360^\circ$ video +generation framework that creates high-quality $360^\circ$ videos with rich and +diverse motion patterns from video anchors. Imagine360 learns fine-grained +spherical visual and motion patterns from limited $360^\circ$ video data with +several key designs. 1) Firstly we adopt the dual-branch design, including a +perspective and a panorama video denoising branch to provide local and global +constraints for $360^\circ$ video generation, with motion module and spatial +LoRA layers fine-tuned on extended web $360^\circ$ videos. 2) Additionally, an +antipodal mask is devised to capture long-range motion dependencies, enhancing +the reversed camera motion between antipodal pixels across hemispheres. 3) To +handle diverse perspective video inputs, we propose elevation-aware designs +that adapt to varying video masking due to changing elevations across frames. +Extensive experiments show Imagine360 achieves superior graphics quality and +motion coherence among state-of-the-art $360^\circ$ video generation methods. +We believe Imagine360 holds promise for advancing personalized, immersive +$360^\circ$ video creation. + +
+
+ comment: Project page: https://ys-imtech.github.io/projects/Imagine360 +
+
+
+
+
+ + ☆ Perception Tokens Enhance Visual Reasoning in Multimodal Language Models + + +
+ Multimodal language models (MLMs) still face challenges in fundamental visual +perception tasks where specialized models excel. Tasks requiring reasoning +about 3D structures benefit from depth estimation, and reasoning about 2D +object instances benefits from object detection. Yet, MLMs can not produce +intermediate depth or boxes to reason over. Finetuning MLMs on relevant data +doesn't generalize well and outsourcing computation to specialized vision tools +is too compute-intensive and memory-inefficient. To address this, we introduce +Perception Tokens, intrinsic image representations designed to assist reasoning +tasks where language is insufficient. Perception tokens act as auxiliary +reasoning tokens, akin to chain-of-thought prompts in language models. For +example, in a depth-related task, an MLM augmented with perception tokens can +reason by generating a depth map as tokens, enabling it to solve the problem +effectively. We propose AURORA, a training method that augments MLMs with +perception tokens for improved reasoning over visual inputs. AURORA leverages a +VQVAE to transform intermediate image representations, such as depth maps into +a tokenized format and bounding box tokens, which is then used in a multi-task +training framework. AURORA achieves notable improvements across counting +benchmarks: +10.8% on BLINK, +11.3% on CVBench, and +8.3% on SEED-Bench, +outperforming finetuning approaches in generalization across datasets. It also +improves on relative depth: over +6% on BLINK. With perception tokens, AURORA +expands the scope of MLMs beyond language-based reasoning, paving the way for +more effective visual reasoning capabilities. + +
+
+
+
+
+ + ☆ Feed-Forward Bullet-Time Reconstruction of Dynamic Scenes from Monocular + Videos + + +
+ Recent advancements in static feed-forward scene reconstruction have +demonstrated significant progress in high-quality novel view synthesis. +However, these models often struggle with generalizability across diverse +environments and fail to effectively handle dynamic content. We present BTimer +(short for BulletTimer), the first motion-aware feed-forward model for +real-time reconstruction and novel view synthesis of dynamic scenes. Our +approach reconstructs the full scene in a 3D Gaussian Splatting representation +at a given target ('bullet') timestamp by aggregating information from all the +context frames. Such a formulation allows BTimer to gain scalability and +generalization by leveraging both static and dynamic scene datasets. Given a +casual monocular dynamic video, BTimer reconstructs a bullet-time scene within +150ms while reaching state-of-the-art performance on both static and dynamic +scene datasets, even compared with optimization-based approaches. + +
+
+ comment: Project website: + https://research.nvidia.com/labs/toronto-ai/bullet-timer/ +
+
+
+
+
+ + ☆ Seeing Beyond Views: Multi-View Driving Scene Video Generation with + Holistic Attention + + +
+ Generating multi-view videos for autonomous driving training has recently +gained much attention, with the challenge of addressing both cross-view and +cross-frame consistency. Existing methods typically apply decoupled attention +mechanisms for spatial, temporal, and view dimensions. However, these +approaches often struggle to maintain consistency across dimensions, +particularly when handling fast-moving objects that appear at different times +and viewpoints. In this paper, we present CogDriving, a novel network designed +for synthesizing high-quality multi-view driving videos. CogDriving leverages a +Diffusion Transformer architecture with holistic-4D attention modules, enabling +simultaneous associations across the spatial, temporal, and viewpoint +dimensions. We also propose a lightweight controller tailored for CogDriving, +i.e., Micro-Controller, which uses only 1.1% of the parameters of the standard +ControlNet, enabling precise control over Bird's-Eye-View layouts. To enhance +the generation of object instances crucial for autonomous driving, we propose a +re-weighted learning objective, dynamically adjusting the learning weights for +object instances during training. CogDriving demonstrates strong performance on +the nuScenes validation set, achieving an FVD score of 37.8, highlighting its +ability to generate realistic driving videos. The project can be found at +https://luhannan.github.io/CogDrivingPage/. + +
+
+
+
+
+ + ☆ Dense Scene Reconstruction from Light-Field Images Affected by Rolling + Shutter + + +
+ This paper presents a dense depth estimation approach from light-field (LF) +images that is able to compensate for strong rolling shutter (RS) effects. Our +method estimates RS compensated views and dense RS compensated disparity maps. +We present a two-stage method based on a 2D Gaussians Splatting that allows for +a ``render and compare" strategy with a point cloud formulation. In the first +stage, a subset of sub-aperture images is used to estimate an RS agnostic 3D +shape that is related to the scene target shape ``up to a motion". In the +second stage, the deformation of the 3D shape is computed by estimating an +admissible camera motion. We demonstrate the effectiveness and advantages of +this approach through several experiments conducted for different scenes and +types of motions. Due to lack of suitable datasets for evaluation, we also +present a new carefully designed synthetic dataset of RS LF images. The source +code, trained models and dataset will be made publicly available at: +https://github.com/ICB-Vision-AI/DenseRSLF + +
+
+
+
+
+ + NVComposer: Boosting Generative Novel View Synthesis with Multiple + Sparse and Unposed Images + + +
+ Recent advancements in generative models have significantly improved novel +view synthesis (NVS) from multi-view data. However, existing methods depend on +external multi-view alignment processes, such as explicit pose estimation or +pre-reconstruction, which limits their flexibility and accessibility, +especially when alignment is unstable due to insufficient overlap or occlusions +between views. In this paper, we propose NVComposer, a novel approach that +eliminates the need for explicit external alignment. NVComposer enables the +generative model to implicitly infer spatial and geometric relationships +between multiple conditional views by introducing two key components: 1) an +image-pose dual-stream diffusion model that simultaneously generates target +novel views and condition camera poses, and 2) a geometry-aware feature +alignment module that distills geometric priors from dense stereo models during +training. Extensive experiments demonstrate that NVComposer achieves +state-of-the-art performance in generative multi-view NVS tasks, removing the +reliance on external alignment and thus improving model accessibility. Our +approach shows substantial improvements in synthesis quality as the number of +unposed input views increases, highlighting its potential for more flexible and +accessible generative NVS systems. + +
+
+ comment: Project webpage: https://lg-li.github.io/project/nvcomposer +
+
+
+
+
+ + ☆ Distilling Diffusion Models to Efficient 3D LiDAR Scene Completion + + +
+ Diffusion models have been applied to 3D LiDAR scene completion due to their +strong training stability and high completion quality. However, the slow +sampling speed limits the practical application of diffusion-based scene +completion models since autonomous vehicles require an efficient perception of +surrounding environments. This paper proposes a novel distillation method +tailored for 3D LiDAR scene completion models, dubbed $\textbf{ScoreLiDAR}$, +which achieves efficient yet high-quality scene completion. ScoreLiDAR enables +the distilled model to sample in significantly fewer steps after distillation. +To improve completion quality, we also introduce a novel $\textbf{Structural +Loss}$, which encourages the distilled model to capture the geometric structure +of the 3D LiDAR scene. The loss contains a scene-wise term constraining the +holistic structure and a point-wise term constraining the key landmark points +and their relative configuration. Extensive experiments demonstrate that +ScoreLiDAR significantly accelerates the completion time from 30.55 to 5.37 +seconds per frame ($>$5$\times$) on SemanticKITTI and achieves superior +performance compared to state-of-the-art 3D LiDAR scene completion models. Our +code is publicly available at https://github.com/happyw1nd/ScoreLiDAR. + +
+
+ comment: https://github.com/happyw1nd/ScoreLiDAR +
+
+
+
+
+ + ☆ KKLIP: Knowledge Distillation Exploiting K-means Clustering for + Language-Image Pre-Training + + +
+ Recently, CLIP has emerged as a valuable model for aligning image and text +information in multi-modal scenarios. However, researchers have observed +limitations in the ability of CLIP's text and image encoders to extract +detailed knowledge from caption-image pairs. In response, this paper introduces +KKLIP, a novel approach designed to enhance the quality of CLIP by +incorporating a new knowledge distillation (KD) method derived from Llama 2. +Our method comprises three objectives: Text Embedding Distillation, Concept +Learning, and Contrastive Learning. Firstly, Text Embedding Distillation +involves training the KKLIP text encoder to emulate the teacher model, Llama 2. +Secondly, Concept Learning assigns a soft concept label to each caption-image +pair through offline k-means clustering of text information from Llama 2, +allowing KKLIP to learn from these soft concept labels. Finally, Contrastive +Learning harmonizes text and image embeddings. Our experimental results +demonstrate that KKLIP enhances the quality of both text and image encoders. + +
+
+
+
+
+ + ☆ Distillation of Diffusion Features for Semantic Correspondence WACV 2025 + + +
+ Semantic correspondence, the task of determining relationships between +different parts of images, underpins various applications including 3D +reconstruction, image-to-image translation, object tracking, and visual place +recognition. Recent studies have begun to explore representations learned in +large generative image models for semantic correspondence, demonstrating +promising results. Building on this progress, current state-of-the-art methods +rely on combining multiple large models, resulting in high computational +demands and reduced efficiency. In this work, we address this challenge by +proposing a more computationally efficient approach. We propose a novel +knowledge distillation technique to overcome the problem of reduced efficiency. +We show how to use two large vision foundation models and distill the +capabilities of these complementary models into one smaller model that +maintains high accuracy at reduced computational cost. Furthermore, we +demonstrate that by incorporating 3D data, we are able to further improve +performance, without the need for human-annotated correspondences. Overall, our +empirical results demonstrate that our distilled model with 3D data +augmentation achieves performance superior to current state-of-the-art methods +while significantly reducing computational load and enhancing practicality for +real-world applications, such as semantic video correspondence. Our code and +weights are publicly available on our project page. + +
+
+ comment: WACV 2025, Page: https://compvis.github.io/distilldift +
+
+
+
+
+ + ☆ A Bidirectional Siamese Recurrent Neural Network for Accurate Gait + Recognition Using Body Landmarks + + +
+ Gait recognition is a significant biometric technique for person +identification, particularly in scenarios where other physiological biometrics +are impractical or ineffective. In this paper, we address the challenges +associated with gait recognition and present a novel approach to improve its +accuracy and reliability. The proposed method leverages advanced techniques, +including sequential gait landmarks obtained through the Mediapipe pose +estimation model, Procrustes analysis for alignment, and a Siamese +biGRU-dualStack Neural Network architecture for capturing temporal +dependencies. Extensive experiments were conducted on large-scale cross-view +datasets to demonstrate the effectiveness of the approach, achieving high +recognition accuracy compared to other models. The model demonstrated +accuracies of 95.7%, 94.44%, 87.71%, and 86.6% on CASIA-B, SZU RGB-D, OU-MVLP, +and Gait3D datasets respectively. The results highlight the potential +applications of the proposed method in various practical domains, indicating +its significant contribution to the field of gait recognition. + +
+
+
+
+
+ + ☆ Data Fusion of Semantic and Depth Information in the Context of Object + Detection + + +
+ Considerable study has already been conducted regarding autonomous driving in +modern era. An autonomous driving system must be extremely good at detecting +objects surrounding the car to ensure safety. In this paper, classification, +and estimation of an object's (pedestrian) position (concerning an ego 3D +coordinate system) are studied and the distance between the ego vehicle and the +object in the context of autonomous driving is measured. To classify the +object, faster Region-based Convolution Neural Network (R-CNN) with inception +v2 is utilized. First, a network is trained with customized dataset to estimate +the reference position of objects as well as the distance from the vehicle. +From camera calibration to computing the distance, cutting-edge technologies of +computer vision algorithms in a series of processes are applied to generate a +3D reference point of the region of interest. The foremost step in this process +is generating a disparity map using the concept of stereo vision. + +
+
+
+
+
+ + ☆ Urban4D: Semantic-Guided 4D Gaussian Splatting for Urban Scene + Reconstruction + + +
+ Reconstructing dynamic urban scenes presents significant challenges due to +their intrinsic geometric structures and spatiotemporal dynamics. Existing +methods that attempt to model dynamic urban scenes without leveraging priors on +potentially moving regions often produce suboptimal results. Meanwhile, +approaches based on manual 3D annotations yield improved reconstruction quality +but are impractical due to labor-intensive labeling. In this paper, we revisit +the potential of 2D semantic maps for classifying dynamic and static Gaussians +and integrating spatial and temporal dimensions for urban scene representation. +We introduce Urban4D, a novel framework that employs a semantic-guided +decomposition strategy inspired by advances in deep 2D semantic map generation. +Our approach distinguishes potentially dynamic objects through reliable +semantic Gaussians. To explicitly model dynamic objects, we propose an +intuitive and effective 4D Gaussian splatting (4DGS) representation that +aggregates temporal information through learnable time embeddings for each +Gaussian, predicting their deformations at desired timestamps using a +multilayer perceptron (MLP). For more accurate static reconstruction, we also +design a k-nearest neighbor (KNN)-based consistency regularization to handle +the ground surface due to its low-texture characteristic. Extensive experiments +on real-world datasets demonstrate that Urban4D not only achieves comparable or +better quality than previous state-of-the-art methods but also effectively +captures dynamic objects while maintaining high visual fidelity for static +elements. + +
+
+
+
+
+ + ☆ Measure Anything: Real-time, Multi-stage Vision-based Dimensional + Measurement using Segment Anything + + +
+ We present Measure Anything, a comprehensive vision-based framework for
+dimensional measurement of objects with circular cross-sections, leveraging the
+Segment Anything Model (SAM). Our approach estimates key geometric features --
+including diameter, length, and volume -- for rod-like geometries with varying
+curvature and general objects with constant skeleton slope. The framework
+integrates segmentation, mask processing, skeleton construction, and 2D-3D
+transformation, packaged in a user-friendly interface. We validate our
+framework by estimating the diameters of Canola stems -- collected from
+agricultural fields in North Dakota -- which are thin and non-uniform, posing
+challenges for existing methods. Measuring its diameters is critical, as it is
+a phenotypic trait that correlates with the health and yield of Canola crops.
+This application also exemplifies the potential of Measure Anything, where
+integrating intelligent models -- such as keypoint detection -- extends its
+scalability to fully automate the measurement process for high-throughput
+applications. Furthermore, we showcase its versatility in robotic grasping,
+leveraging extracted geometric features to identify optimal grasp points.
+ 
+
+
+
+
+ + ☆ Training-Free Mitigation of Language Reasoning Degradation After + Multimodal Instruction Tuning + + +
+ Multimodal models typically combine a powerful large language model (LLM) +with a vision encoder and are then trained on multimodal data via instruction +tuning. While this process adapts LLMs to multimodal settings, it remains +unclear whether this adaptation compromises their original language reasoning +capabilities. In this work, we explore the effects of multimodal instruction +tuning on language reasoning performance. We focus on LLaVA, a leading +multimodal framework that integrates LLMs such as Vicuna or Mistral with the +CLIP vision encoder. We compare the performance of the original LLMs with their +multimodal-adapted counterparts across eight language reasoning tasks. Our +experiments yield several key insights. First, the impact of multimodal +learning varies between Vicuna and Mistral: we observe a degradation in +language reasoning for Mistral but improvements for Vicuna across most tasks. +Second, while multimodal instruction learning consistently degrades performance +on mathematical reasoning tasks (e.g., GSM8K), it enhances performance on +commonsense reasoning tasks (e.g., CommonsenseQA). Finally, we demonstrate that +a training-free model merging technique can effectively mitigate the language +reasoning degradation observed in multimodal-adapted Mistral and even improve +performance on visual tasks. + +
+
+
+
+
+ + ☆ Gesture Classification in Artworks Using Contextual Image Features + + +
+ Recognizing gestures in artworks can add a valuable dimension to art +understanding and help to acknowledge the role of the sense of smell in +cultural heritage. We propose a method to recognize smell gestures in +historical artworks. We show that combining local features with global image +context improves classification performance notably on different backbones. + +
+
+
+
+
+ + ☆ Pre-trained Multiple Latent Variable Generative Models are good + defenders against Adversarial Attacks + + +
+ Attackers can deliberately perturb classifiers' input with subtle noise, +altering final predictions. Among proposed countermeasures, adversarial +purification employs generative networks to preprocess input images, filtering +out adversarial noise. In this study, we propose specific generators, defined +Multiple Latent Variable Generative Models (MLVGMs), for adversarial +purification. These models possess multiple latent variables that naturally +disentangle coarse from fine features. Taking advantage of these properties, we +autoencode images to maintain class-relevant information, while discarding and +re-sampling any detail, including adversarial noise. The procedure is +completely training-free, exploring the generalization abilities of pre-trained +MLVGMs on the adversarial purification downstream task. Despite the lack of +large models, trained on billions of samples, we show that smaller MLVGMs are +already competitive with traditional methods, and can be used as foundation +models. Official code released at https://github.com/SerezD/gen_adversarial. + +
+
+
+
+
+ + ☆ PlanarSplatting: Accurate Planar Surface Reconstruction in 3 Minutes + + +
+ This paper presents PlanarSplatting, an ultra-fast and accurate surface +reconstruction approach for multiview indoor images. We take the 3D planes as +the main objective due to their compactness and structural expressiveness in +indoor scenes, and develop an explicit optimization framework that learns to +fit the expected surface of indoor scenes by splatting the 3D planes into 2.5D +depth and normal maps. As our PlanarSplatting operates directly on the 3D plane +primitives, it eliminates the dependencies on 2D/3D plane detection and plane +matching and tracking for planar surface reconstruction. Furthermore, the +essential merits of plane-based representation plus CUDA-based implementation +of planar splatting functions, PlanarSplatting reconstructs an indoor scene in +3 minutes while having significantly better geometric accuracy. Thanks to our +ultra-fast reconstruction speed, the largest quantitative evaluation on the +ScanNet and ScanNet++ datasets over hundreds of scenes clearly demonstrated the +advantages of our method. We believe that our accurate and ultrafast planar +surface reconstruction method will be applied in the structured data curation +for surface reconstruction in the future. The code of our CUDA implementation +will be publicly available. Project page: +https://icetttb.github.io/PlanarSplatting/ + +
+
+ comment: Project page: https://icetttb.github.io/PlanarSplatting/ +
+
+
+
+
+ + ☆ CleanDIFT: Diffusion Features without Noise + + +
+ Internal features from large-scale pre-trained diffusion models have recently +been established as powerful semantic descriptors for a wide range of +downstream tasks. Works that use these features generally need to add noise to +images before passing them through the model to obtain the semantic features, +as the models do not offer the most useful features when given images with +little to no noise. We show that this noise has a critical impact on the +usefulness of these features that cannot be remedied by ensembling with +different random noises. We address this issue by introducing a lightweight, +unsupervised fine-tuning method that enables diffusion backbones to provide +high-quality, noise-free semantic features. We show that these features readily +outperform previous diffusion features by a wide margin in a wide variety of +extraction setups and downstream tasks, offering better performance than even +ensemble-based methods at a fraction of the cost. + +
+
+ comment: for the project page and code, view + https://compvis.github.io/CleanDIFT/ +
+
+
+
+
+ + ☆ SINGER: Vivid Audio-driven Singing Video Generation with Multi-scale + Spectral Diffusion Model + + +
+ Recent advancements in generative models have significantly enhanced talking +face video generation, yet singing video generation remains underexplored. The +differences between human talking and singing limit the performance of existing +talking face video generation models when applied to singing. The fundamental +differences between talking and singing-specifically in audio characteristics +and behavioral expressions-limit the effectiveness of existing models. We +observe that the differences between singing and talking audios manifest in +terms of frequency and amplitude. To address this, we have designed a +multi-scale spectral module to help the model learn singing patterns in the +spectral domain. Additionally, we develop a spectral-filtering module that aids +the model in learning the human behaviors associated with singing audio. These +two modules are integrated into the diffusion model to enhance singing video +generation performance, resulting in our proposed model, SINGER. Furthermore, +the lack of high-quality real-world singing face videos has hindered the +development of the singing video generation community. To address this gap, we +have collected an in-the-wild audio-visual singing dataset to facilitate +research in this area. Our experiments demonstrate that SINGER is capable of +generating vivid singing videos and outperforms state-of-the-art methods in +both objective and subjective evaluations. + +
+
+
+
+
+ + ☆ 2DGS-Room: Seed-Guided 2D Gaussian Splatting with Geometric Constrains + for High-Fidelity Indoor Scene Reconstruction + + +
+ The reconstruction of indoor scenes remains challenging due to the inherent +complexity of spatial structures and the prevalence of textureless regions. +Recent advancements in 3D Gaussian Splatting have improved novel view synthesis +with accelerated processing but have yet to deliver comparable performance in +surface reconstruction. In this paper, we introduce 2DGS-Room, a novel method +leveraging 2D Gaussian Splatting for high-fidelity indoor scene reconstruction. +Specifically, we employ a seed-guided mechanism to control the distribution of +2D Gaussians, with the density of seed points dynamically optimized through +adaptive growth and pruning mechanisms. To further improve geometric accuracy, +we incorporate monocular depth and normal priors to provide constraints for +details and textureless regions respectively. Additionally, multi-view +consistency constraints are employed to mitigate artifacts and further enhance +reconstruction quality. Extensive experiments on ScanNet and ScanNet++ datasets +demonstrate that our method achieves state-of-the-art performance in indoor +scene reconstruction. + +
+
+
+
+
+ + ☆ Deep Learning for Sea Surface Temperature Reconstruction under Cloud + Occlusion + + +
+ Sea Surface Temperature (SST) is crucial for understanding Earth's oceans and +climate, significantly influencing weather patterns, ocean currents, marine +ecosystem health, and the global energy balance. Large-scale SST monitoring +relies on satellite infrared radiation detection, but cloud cover presents a +major challenge, creating extensive observational gaps and hampering our +ability to fully capture large-scale ocean temperature patterns. Efforts to +address these gaps in existing L4 datasets have been made, but they often +exhibit notable local and seasonal biases, compromising data reliability and +accuracy. To tackle this challenge, we employed deep neural networks to +reconstruct cloud-covered portions of satellite imagery while preserving the +integrity of observed values in cloud-free areas, using MODIS satellite derived +observations of SST. Our best-performing architecture showed significant skill +improvements over established methodologies, achieving substantial reductions +in error metrics when benchmarked against widely used approaches and datasets. +These results underscore the potential of advanced AI techniques to enhance the +completeness of satellite observations in Earth-science remote sensing, +providing more accurate and reliable datasets for environmental assessments, +data-driven model training, climate research, and seamless integration into +model data assimilation workflows. + +
+
+
+
+
+ + ☆ PrefixKV: Adaptive Prefix KV Cache is What Vision Instruction-Following + Models Need for Efficient Generation + + +
+ Recently, large vision-language models (LVLMs) have rapidly gained popularity +for their strong generation and reasoning capabilities given diverse multimodal +inputs. However, these models incur significant computational and memory +overhead during inference, which greatly hinders the efficient deployment in +practical scenarios. The extensive key-value (KV) cache, necessitated by the +lengthy input and output sequences, notably contributes to the high inference +cost. Based on this, recent works have investigated ways to reduce the KV cache +size for higher efficiency. Although effective, they generally overlook the +distinct importance distributions of KV vectors across layers and maintain the +same cache size for each layer during the next token prediction. This results +in the significant contextual information loss for certain layers, leading to +notable performance decline. To address this, we present PrefixKV. It reframes +the challenge of determining KV cache sizes for all layers into the task of +searching for the optimal global prefix configuration. With an adaptive +layer-wise KV retention recipe based on binary search, the maximum contextual +information can thus be preserved in each layer, facilitating the generation. +Extensive experiments demonstrate that our method achieves the state-of-the-art +performance compared with others. It exhibits superior inference efficiency and +generation quality trade-offs, showing promising potential for practical +applications. Code is available at \url{https://github.com/THU-MIG/PrefixKV}. + +
+
+ comment: 12 pages, 5 figures; +
+
+
+
+
+ + ☆ Skel3D: Skeleton Guided Novel View Synthesis + + +
+ In this paper, we present an approach for monocular open-set novel view +synthesis (NVS) that leverages object skeletons to guide the underlying +diffusion model. Building upon a baseline that utilizes a pre-trained 2D image +generator, our method takes advantage of the Objaverse dataset, which includes +animated objects with bone structures. By introducing a skeleton guide layer +following the existing ray conditioning normalization (RCN) layer, our approach +enhances pose accuracy and multi-view consistency. The skeleton guide layer +provides detailed structural information for the generative model, improving +the quality of synthesized views. Experimental results demonstrate that our +skeleton-guided method significantly enhances consistency and accuracy across +diverse object categories within the Objaverse dataset. Our method outperforms +existing state-of-the-art NVS techniques both quantitatively and qualitatively, +without relying on explicit 3D representations. + +
+
+
+
+
+ + ☆ Benchmarking Pretrained Attention-based Models for Real-Time Recognition + in Robot-Assisted Esophagectomy SP + + +
+ Esophageal cancer is among the most common types of cancer worldwide. It is +traditionally treated using open esophagectomy, but in recent years, +robot-assisted minimally invasive esophagectomy (RAMIE) has emerged as a +promising alternative. However, robot-assisted surgery can be challenging for +novice surgeons, as they often suffer from a loss of spatial orientation. +Computer-aided anatomy recognition holds promise for improving surgical +navigation, but research in this area remains limited. In this study, we +developed a comprehensive dataset for semantic segmentation in RAMIE, featuring +the largest collection of vital anatomical structures and surgical instruments +to date. Handling this diverse set of classes presents challenges, including +class imbalance and the recognition of complex structures such as nerves. This +study aims to understand the challenges and limitations of current +state-of-the-art algorithms on this novel dataset and problem. Therefore, we +benchmarked eight real-time deep learning models using two pretraining +datasets. We assessed both traditional and attention-based networks, +hypothesizing that attention-based networks better capture global patterns and +address challenges such as occlusion caused by blood or other tissues. The +benchmark includes our RAMIE dataset and the publicly available CholecSeg8k +dataset, enabling a thorough assessment of surgical segmentation tasks. Our +findings indicate that pretraining on ADE20k, a dataset for semantic +segmentation, is more effective than pretraining on ImageNet. Furthermore, +attention-based models outperform traditional convolutional neural networks, +with SegNeXt and Mask2Former achieving higher Dice scores, and Mask2Former +additionally excelling in average symmetric surface distance. + +
+
+ comment: Accepted for presentation at the SPIE Medical Imaging Conference, + 2025 +
+
+
+
+
+ + ☆ Implicit Priors Editing in Stable Diffusion via Targeted Token + Adjustment + + +
+ Implicit assumptions and priors are often necessary in text-to-image
+generation tasks, especially when textual prompts lack sufficient context.
+However, these assumptions can sometimes reflect outdated concepts,
+inaccuracies, or societal bias embedded in the training data. We present
+Embedding-only Editing (Embedit), a method designed to efficiently adjust
+implicit assumptions and priors in the model without affecting its
+interpretation of unrelated objects or overall performance. Given a "source"
+prompt (e.g., "rose") that elicits an implicit assumption (e.g., rose is red)
+and a "destination" prompt that specifies the desired attribute (e.g., "blue
+rose"), Embedit fine-tunes only the word token embedding (WTE) of the target
+object ("rose") to optimize the last hidden state of text encoder in Stable
+Diffusion, a SOTA text-to-image model. This targeted adjustment prevents
+unintended effects on other objects in the model's knowledge base, as the WTEs
+for unrelated objects and the model weights remain unchanged. Consequently,
+when a prompt does not contain the edited object, all representations, and the
+model outputs are identical to those of the original, unedited model. Our
+method is highly efficient, modifying only 768 parameters for Stable Diffusion
+1.4 and 2048 for XL in a single edit, matching the WTE dimension of each
+respective model. This minimal scope, combined with rapid execution, makes
+Embedit highly practical for real-world applications. Additionally, changes are
+easily reversible by restoring the original WTE layers. Our experimental
+results demonstrate that Embedit consistently outperforms previous methods
+across various models, tasks, and editing scenarios (both single and sequential
+multiple edits), achieving at least a 6.01% improvement (from 87.17% to
+93.18%).
+ 
+
+
+
+
+ + ☆ Mapping using Transformers for Volumes -- Network for Super-Resolution + with Long-Range Interactions + + +
+ Until now, it has been difficult for volumetric super-resolution to utilize +the recent advances in transformer-based models seen in 2D super-resolution. +The memory required for self-attention in 3D volumes limits the receptive +field. Therefore, long-range interactions are not used in 3D to the extent done +in 2D and the strength of transformers is not realized. We propose a +multi-scale transformer-based model based on hierarchical attention blocks +combined with carrier tokens at multiple scales to overcome this. Here +information from larger regions at coarse resolution is sequentially carried on +to finer-resolution regions to predict the super-resolved image. Using +transformer layers at each resolution, our coarse-to-fine modeling limits the +number of tokens at each scale and enables attention over larger regions than +what has previously been possible. We experimentally compare our method, +MTVNet, against state-of-the-art volumetric super-resolution models on five 3D +datasets demonstrating the advantage of an increased receptive field. This +advantage is especially pronounced for images that are larger than what is seen +in popularly used 3D datasets. Our code is available at +https://github.com/AugustHoeg/MTVNet + +
+
+ comment: 14 pages, 8 Figures with supplementary material +
+
+
+
+
+ + ☆ Volumetrically Consistent 3D Gaussian Rasterization + + +
+ Recently, 3D Gaussian Splatting (3DGS) has enabled photorealistic view
+synthesis at high inference speeds. However, its splatting-based rendering
+model makes several approximations to the rendering equation, reducing physical
+accuracy. We show that splatting and its approximations are unnecessary, even
+within a rasterizer; we instead volumetrically integrate 3D Gaussians directly
+to compute the transmittance across them analytically. We use this analytic
+transmittance to derive more physically-accurate alpha values than 3DGS, which
+can directly be used within their framework. The result is a method that more
+closely follows the volume rendering equation (similar to ray-tracing) while
+enjoying the speed benefits of rasterization. Our method represents opaque
+surfaces with higher accuracy and fewer points than 3DGS. This enables it to
+outperform 3DGS for view synthesis (measured in SSIM and LPIPS). Being
+volumetrically consistent also enables our method to work out of the box for
+tomography. We match the state-of-the-art 3DGS-based tomography method with
+fewer points.
+ 
+
+
+
+
+ + ☆ SGSST: Scaling Gaussian Splatting StyleTransfer + + +
+ Applying style transfer to a full 3D environment is a challenging task that +has seen many developments since the advent of neural rendering. 3D Gaussian +splatting (3DGS) has recently pushed further many limits of neural rendering in +terms of training speed and reconstruction quality. This work introduces SGSST: +Scaling Gaussian Splatting Style Transfer, an optimization-based method to +apply style transfer to pretrained 3DGS scenes. We demonstrate that a new +multiscale loss based on global neural statistics, that we name SOS for +Simultaneously Optimized Scales, enables style transfer to ultra-high +resolution 3D scenes. Not only SGSST pioneers 3D scene style transfer at such +high image resolutions, it also produces superior visual quality as assessed by +thorough qualitative, quantitative and perceptual comparisons. + +
+
+
+
+
+ + ☆ TASR: Timestep-Aware Diffusion Model for Image Super-Resolution + + +
+ Diffusion models have recently achieved outstanding results in the field of
+image super-resolution. These methods typically inject low-resolution (LR)
+images via ControlNet. In this paper, we first explore the temporal dynamics of
+information infusion through ControlNet, revealing that the input from LR
+images predominantly influences the initial stages of the denoising process.
+Leveraging this insight, we introduce a novel timestep-aware diffusion model
+that adaptively integrates features from both ControlNet and the pre-trained
+Stable Diffusion (SD). Our method enhances the transmission of LR information
+in the early stages of diffusion to guarantee image fidelity and stimulates the
+generation ability of the SD model itself more in the later stages to enhance
+the detail of generated images. To train this method, we propose a
+timestep-aware training strategy that adopts distinct losses at varying
+timesteps and acts on disparate modules. Experiments on benchmark datasets
+demonstrate the effectiveness of our method. Code:
+https://github.com/SleepyLin/TASR
+ 
+
+
+
+
+ + ☆ Intuitive Axial Augmentation Using Polar-Sine-Based Piecewise Distortion + for Medical Slice-Wise Segmentation + + +
+ Most data-driven models for medical image analysis rely on universal +augmentations to improve performance. Experimental evidence has confirmed their +effectiveness, but the unclear mechanism underlying them poses a barrier to the +widespread acceptance and trust in such methods within the medical community. +We revisit and acknowledge the unique characteristics of medical images apart +from traditional digital images, and consequently, proposed a medical-specific +augmentation algorithm that is more elastic and aligns well with radiology scan +procedure. The method performs piecewise affine with sinusoidal distorted ray +according to radius on polar coordinates, thus simulating uncertain postures of +human lying flat on the scanning table. Our method could generate human +visceral distribution without affecting the fundamental relative position on +axial plane. Two non-adaptive algorithms, namely Meta-based Scan Table Removal +and Similarity-Guided Parameter Search, are introduced to bolster robustness of +our augmentation method. Experiments show our method improves accuracy across +multiple famous segmentation frameworks without requiring more data samples. +Our preview code is available in: https://github.com/MGAMZ/PSBPD. + +
+
+
+
+
+ + ☆ Fairer Analysis and Demographically Balanced Face Generation for Fairer + Face Verification + + +
+ Face recognition and verification are two computer vision tasks whose +performances have advanced with the introduction of deep representations. +However, ethical, legal, and technical challenges due to the sensitive nature +of face data and biases in real-world training datasets hinder their +development. Generative AI addresses privacy by creating fictitious identities, +but fairness problems remain. Using the existing DCFace SOTA framework, we +introduce a new controlled generation pipeline that improves fairness. Through +classical fairness metrics and a proposed in-depth statistical analysis based +on logit models and ANOVA, we show that our generation pipeline improves +fairness more than other bias mitigation approaches while slightly improving +raw performance. + +
+
+
+
+
+ + ☆ DIVE: Taming DINO for Subject-Driven Video Editing + + +
+ Building on the success of diffusion models in image generation and editing, +video editing has recently gained substantial attention. However, maintaining +temporal consistency and motion alignment still remains challenging. To address +these issues, this paper proposes DINO-guided Video Editing (DIVE), a framework +designed to facilitate subject-driven editing in source videos conditioned on +either target text prompts or reference images with specific identities. The +core of DIVE lies in leveraging the powerful semantic features extracted from a +pretrained DINOv2 model as implicit correspondences to guide the editing +process. Specifically, to ensure temporal motion consistency, DIVE employs DINO +features to align with the motion trajectory of the source video. Extensive +experiments on diverse real-world videos demonstrate that our framework can +achieve high-quality editing results with robust motion consistency, +highlighting the potential of DINO to contribute to video editing. For precise +subject editing, DIVE incorporates the DINO features of reference images into a +pretrained text-to-image model to learn Low-Rank Adaptations (LoRAs), +effectively registering the target subject's identity. Project page: +https://dino-video-editing.github.io + +
+
+
+
+
+ + ☆ UniVAD: A Training-free Unified Model for Few-shot Visual Anomaly + Detection + + +
+ Visual Anomaly Detection (VAD) aims to identify abnormal samples in images +that deviate from normal patterns, covering multiple domains, including +industrial, logical, and medical fields. Due to the domain gaps between these +fields, existing VAD methods are typically tailored to each domain, with +specialized detection techniques and model architectures that are difficult to +generalize across different domains. Moreover, even within the same domain, +current VAD approaches often follow a "one-category-one-model" paradigm, +requiring large amounts of normal samples to train class-specific models, +resulting in poor generalizability and hindering unified evaluation across +domains. To address this issue, we propose a generalized few-shot VAD method, +UniVAD, capable of detecting anomalies across various domains, such as +industrial, logical, and medical anomalies, with a training-free unified model. +UniVAD only needs few normal samples as references during testing to detect +anomalies in previously unseen objects, without training on the specific +domain. Specifically, UniVAD employs a Contextual Component Clustering ($C^3$) +module based on clustering and vision foundation models to segment components +within the image accurately, and leverages Component-Aware Patch Matching +(CAPM) and Graph-Enhanced Component Modeling (GECM) modules to detect anomalies +at different semantic levels, which are aggregated to produce the final +detection result. We conduct experiments on nine datasets spanning industrial, +logical, and medical fields, and the results demonstrate that UniVAD achieves +state-of-the-art performance in few-shot anomaly detection tasks across +multiple domains, outperforming domain-specific anomaly detection models. The +code will be made publicly available. + +
+
+ comment: project page: https://uni-vad.github.io/ +
+
+
+
+
+ + ☆ A Stitch in Time Saves Nine: Small VLM is a Precise Guidance for + accelerating Large VLMs + + +
+ Vision-language models (VLMs) have shown remarkable success across various +multi-modal tasks, yet large VLMs encounter significant efficiency challenges +due to processing numerous visual tokens. A promising approach to accelerating +large VLM inference is using partial information, such as attention maps from +specific layers, to assess token importance and prune less essential tokens. +However, our study reveals three key insights: (i) Partial attention +information is insufficient for accurately identifying critical visual tokens, +resulting in suboptimal performance, especially at low token retention ratios; +(ii) Global attention information, such as the attention map aggregated across +all layers, more effectively preserves essential tokens and maintains +comparable performance under aggressive pruning. However, the attention maps +from all layers requires a full inference pass, which increases computational +load and is therefore impractical in existing methods; and (iii) The global +attention map aggregated from a small VLM closely resembles that of a large +VLM, suggesting an efficient alternative. Based on these findings, we introduce +a \textbf{training-free} method, \underline{\textbf{S}}mall VLM +\underline{\textbf{G}}uidance for accelerating \underline{\textbf{L}}arge VLMs +(\textbf{SGL}). Specifically, we employ the attention map aggregated from a +small VLM to guide visual token pruning in a large VLM. Additionally, an early +exiting mechanism is developed to fully use the small VLM's predictions, +dynamically invoking the larger VLM only when necessary, yielding a superior +trade-off between accuracy and computation. Extensive evaluations across 11 +benchmarks demonstrate the effectiveness and generalizability of SGL, achieving +up to 91\% pruning ratio for visual tokens while retaining competitive +performance. + +
+
+
+
+
+ + ☆ Domain-Agnostic Stroke Lesion Segmentation Using Physics-Constrained + Synthetic Data + + +
+ Segmenting stroke lesions in Magnetic Resonance Imaging (MRI) is challenging +due to diverse clinical imaging domains, with existing models struggling to +generalise across different MRI acquisition parameters and sequences. In this +work, we propose two novel physics-constrained approaches using synthetic +quantitative MRI (qMRI) images to enhance the robustness and generalisability +of segmentation models. We trained a qMRI estimation model to predict qMRI maps +from MPRAGE images, which were used to simulate diverse MRI sequences for +segmentation training. A second approach built upon prior work in synthetic +data for stroke lesion segmentation, generating qMRI maps from a dataset of +tissue labels. The proposed approaches improved over the baseline nnUNet on a +variety of out-of-distribution datasets, with the second approach outperforming +the prior synthetic data method. + +
+
+
+
+
+ + ☆ Geometry-guided Cross-view Diffusion for One-to-many Cross-view Image + Synthesis + + +
+ This paper presents a novel approach for cross-view synthesis aimed at +generating plausible ground-level images from corresponding satellite imagery +or vice versa. We refer to these tasks as satellite-to-ground (Sat2Grd) and +ground-to-satellite (Grd2Sat) synthesis, respectively. Unlike previous works +that typically focus on one-to-one generation, producing a single output image +from a single input image, our approach acknowledges the inherent one-to-many +nature of the problem. This recognition stems from the challenges posed by +differences in illumination, weather conditions, and occlusions between the two +views. To effectively model this uncertainty, we leverage recent advancements +in diffusion models. Specifically, we exploit random Gaussian noise to +represent the diverse possibilities learnt from the target view data. We +introduce a Geometry-guided Cross-view Condition (GCC) strategy to establish +explicit geometric correspondences between satellite and street-view features. +This enables us to resolve the geometry ambiguity introduced by camera pose +between image pairs, boosting the performance of cross-view image synthesis. +Through extensive quantitative and qualitative analyses on three benchmark +cross-view datasets, we demonstrate the superiority of our proposed +geometry-guided cross-view condition over baseline methods, including recent +state-of-the-art approaches in cross-view image synthesis. Our method generates +images of higher quality, fidelity, and diversity than other state-of-the-art +approaches. + +
+
+
+
+
+ + ☆ Equivariant Representation Learning for Augmentation-based + Self-Supervised Learning via Image Reconstruction + + +
+ Augmentation-based self-supervised learning methods have shown remarkable +success in self-supervised visual representation learning, excelling in +learning invariant features but often neglecting equivariant ones. This +limitation reduces the generalizability of foundation models, particularly for +downstream tasks requiring equivariance. We propose integrating an image +reconstruction task as an auxiliary component in augmentation-based +self-supervised learning algorithms to facilitate equivariant feature learning +without additional parameters. Our method implements a cross-attention +mechanism to blend features learned from two augmented views, subsequently +reconstructing one of them. This approach is adaptable to various datasets and +augmented-pair based learning methods. We evaluate its effectiveness on +learning equivariant features through multiple linear regression tasks and +downstream applications on both artificial (3DIEBench) and natural (ImageNet) +datasets. Results consistently demonstrate significant improvements over +standard augmentation-based self-supervised learning methods and +state-of-the-art approaches, particularly excelling in scenarios involving +combined augmentations. Our method enhances the learning of both invariant and +equivariant features, leading to more robust and generalizable visual +representations for computer vision tasks. + +
+
+
+
+
+ + ☆ Composed Image Retrieval for Training-Free Domain Conversion WACV 2025 + + +
+ This work addresses composed image retrieval in the context of domain
+conversion, where the content of a query image is retrieved in the domain
+specified by the query text. We show that a strong vision-language model
+provides sufficient descriptive power without additional training. The query
+image is mapped to the text input space using textual inversion. Unlike common
+practice that inverts in the continuous space of text tokens, we use the
+discrete word space via a nearest-neighbor search in a text vocabulary. With
+this inversion, the image is softly mapped across the vocabulary and is made
+more robust using retrieval-based augmentation. Database images are retrieved
+by a weighted ensemble of text queries combining mapped words with the domain
+text. Our method outperforms prior art by a large margin on standard and newly
+introduced benchmarks. Code: https://github.com/NikosEfth/freedom
+
+
+
+ comment: WACV 2025 +
+
+
+
+
+ + ☆ Diffusion-VLA: Scaling Robot Foundation Models via Unified Diffusion and + Autoregression + + +
+ In this paper, we present DiffusionVLA, a novel framework that seamlessly +combines the autoregression model with the diffusion model for learning +visuomotor policy. Central to our approach is a next-token prediction +objective, enabling the model to reason effectively over the user's query in +the context of current observations. Subsequently, a diffusion model is +attached to generate robust action outputs. To enhance policy learning through +self-reasoning, we introduce a novel reasoning injection module that integrates +reasoning phrases directly into the policy learning process. The whole +framework is simple and flexible, making it easy to deploy and upgrade. We +conduct extensive experiments using multiple real robots to validate the +effectiveness of DiffusionVLA. Our tests include a challenging factory sorting +task, where DiffusionVLA successfully categorizes objects, including those not +seen during training. We observe that the reasoning module makes the model +interpretable. It allows observers to understand the model thought process and +identify potential causes of policy failures. Additionally, we test +DiffusionVLA on a zero-shot bin-picking task, achieving 63.7\% accuracy on 102 +previously unseen objects. Our method demonstrates robustness to visual +changes, such as distractors and new backgrounds, and easily adapts to new +embodiments. Furthermore, DiffusionVLA can follow novel instructions and retain +conversational ability. Notably, DiffusionVLA is data-efficient and fast at +inference; our smallest DiffusionVLA-2B runs 82Hz on a single A6000 GPU and can +train from scratch on less than 50 demonstrations for a complex task. Finally, +we scale the model from 2B to 72B parameters, showcasing improved +generalization capabilities with increased model size. + +
+
+ comment: The project page is available at: http://diffusion-vla.github.io +
+
+
+
+
+ + ☆ Black-Box Forgery Attacks on Semantic Watermarks for Diffusion Models + + +
+ Integrating watermarking into the generation process of latent diffusion +models (LDMs) simplifies detection and attribution of generated content. +Semantic watermarks, such as Tree-Rings and Gaussian Shading, represent a novel +class of watermarking techniques that are easy to implement and highly robust +against various perturbations. However, our work demonstrates a fundamental +security vulnerability of semantic watermarks. We show that attackers can +leverage unrelated models, even with different latent spaces and architectures +(UNet vs DiT), to perform powerful and realistic forgery attacks. Specifically, +we design two watermark forgery attacks. The first imprints a targeted +watermark into real images by manipulating the latent representation of an +arbitrary image in an unrelated LDM to get closer to the latent representation +of a watermarked image. We also show that this technique can be used for +watermark removal. The second attack generates new images with the target +watermark by inverting a watermarked image and re-generating it with an +arbitrary prompt. Both attacks just need a single reference image with the +target watermark. Overall, our findings question the applicability of semantic +watermarks by revealing that attackers can easily forge or remove these +watermarks under realistic conditions. + +
+
+ comment: 23 pages, 21 figures, 6 tables +
+
+
+
+
+ + ☆ RFSR: Improving ISR Diffusion Models via Reward Feedback Learning + + +
+ Generative diffusion models (DM) have been extensively utilized in image +super-resolution (ISR). Most of the existing methods adopt the denoising loss +from DDPMs for model optimization. We posit that introducing reward feedback +learning to finetune the existing models can further improve the quality of the +generated images. In this paper, we propose a timestep-aware training strategy +with reward feedback learning. Specifically, in the initial denoising stages of +ISR diffusion, we apply low-frequency constraints to super-resolution (SR) +images to maintain structural stability. In the later denoising stages, we use +reward feedback learning to improve the perceptual and aesthetic quality of the +SR images. In addition, we incorporate Gram-KL regularization to alleviate +stylization caused by reward hacking. Our method can be integrated into any +diffusion-based ISR model in a plug-and-play manner. Experiments show that ISR +diffusion models, when fine-tuned with our method, significantly improve the +perceptual and aesthetic quality of SR images, achieving excellent subjective +results. Code: https://github.com/sxpro/RFSR + +
+
+
+
+
+ + ☆ NeRF and Gaussian Splatting SLAM in the Wild + + +
+ Navigating outdoor environments with visual Simultaneous Localization and +Mapping (SLAM) systems poses significant challenges due to dynamic scenes, +lighting variations, and seasonal changes, requiring robust solutions. While +traditional SLAM methods struggle with adaptability, deep learning-based +approaches and emerging neural radiance fields as well as Gaussian +Splatting-based SLAM methods, offer promising alternatives. However, these +methods have primarily been evaluated in controlled indoor environments with +stable conditions, leaving a gap in understanding their performance in +unstructured and variable outdoor settings. This study addresses this gap by +evaluating these methods in natural outdoor environments, focusing on camera +tracking accuracy, robustness to environmental factors, and computational +efficiency, highlighting distinct trade-offs. Extensive evaluations demonstrate +that neural SLAM methods achieve superior robustness, particularly under +challenging conditions such as low light, but at a high computational cost. At +the same time, traditional methods perform the best across seasons but are +highly sensitive to variations in lighting conditions. The code of the +benchmark is publicly available at +https://github.com/iis-esslingen/nerf-3dgs-benchmark. + +
+
+ comment: 5 pages, 2 figures, 4 tables +
+
+
+
+
+ + ☆ Is JPEG AI going to change image forensics? + + +
+ In this paper, we investigate the counter-forensic effects of the forthcoming +JPEG AI standard based on neural image compression, focusing on two critical +areas: deepfake image detection and image splicing localization. Neural image +compression leverages advanced neural network algorithms to achieve higher +compression rates while maintaining image quality. However, it introduces +artifacts that closely resemble those generated by image synthesis techniques +and image splicing pipelines, complicating the work of researchers when +discriminating pristine from manipulated content. We comprehensively analyze +JPEG AI's counter-forensic effects through extensive experiments on several +state-of-the-art detectors and datasets. Our results demonstrate that an +increase in false alarms impairs the performance of leading forensic detectors +when analyzing genuine content processed through JPEG AI. By exposing the +vulnerabilities of the available forensic tools we aim to raise the urgent need +for multimedia forensics researchers to include JPEG AI images in their +experimental setups and develop robust forensic techniques to distinguish +between neural compression artifacts and actual manipulations. + +
+
+
+
+
+ + ☆ GERD: Geometric event response data generation + + +
+ Event-based vision sensors are appealing because of their time resolution, +higher dynamic range, and low-power consumption. They also provide data that is +fundamentally different from conventional frame-based cameras: events are +sparse, discrete, and require integration in time. Unlike conventional models +grounded in established geometric and physical principles, event-based models +lack comparable foundations. We introduce a method to generate event-based data +under controlled transformations. Specifically, we subject a prototypical +object to transformations that change over time to produce carefully curated +event videos. We hope this work simplifies studies for geometric approaches in +event-based vision. GERD is available at https://github.com/ncskth/gerd + +
+
+
+
+
+ + ☆ DynamicControl: Adaptive Condition Selection for Improved Text-to-Image + Generation + + +
+ To enhance the controllability of text-to-image diffusion models, current +ControlNet-like models have explored various control signals to dictate image +attributes. However, existing methods either handle conditions inefficiently or +use a fixed number of conditions, which does not fully address the complexity +of multiple conditions and their potential conflicts. This underscores the need +for innovative approaches to manage multiple conditions effectively for more +reliable and detailed image synthesis. To address this issue, we propose a +novel framework, DynamicControl, which supports dynamic combinations of diverse +control signals, allowing adaptive selection of different numbers and types of +conditions. Our approach begins with a double-cycle controller that generates +an initial real score sorting for all input conditions by leveraging +pre-trained conditional generation models and discriminative models. This +controller evaluates the similarity between extracted conditions and input +conditions, as well as the pixel-level similarity with the source image. Then, +we integrate a Multimodal Large Language Model (MLLM) to build an efficient +condition evaluator. This evaluator optimizes the ordering of conditions based +on the double-cycle controller's score ranking. Our method jointly optimizes +MLLMs and diffusion models, utilizing MLLMs' reasoning capabilities to +facilitate multi-condition text-to-image (T2I) tasks. The final sorted +conditions are fed into a parallel multi-control adapter, which learns feature +maps from dynamic visual conditions and integrates them to modulate ControlNet, +thereby enhancing control over generated images. Through both quantitative and +qualitative comparisons, DynamicControl demonstrates its superiority over +existing methods in terms of controllability, generation quality and +composability under various conditional controls. + +
+
+
+
+
+ + ☆ AIM: Adaptive Inference of Multi-Modal LLMs via Token Merging and + Pruning + + +
+ Large language models (LLMs) have enabled the creation of multi-modal LLMs +that exhibit strong comprehension of visual data such as images and videos. +However, these models usually rely on extensive visual tokens from visual +encoders, leading to high computational demands, which limits their +applicability in resource-constrained environments and for long-context tasks. +In this work, we propose a training-free adaptive inference method for +multi-modal LLMs that can accommodate a broad range of efficiency requirements +with a minimum performance drop. Our method consists of a) iterative token +merging based on embedding similarity before LLMs, and b) progressive token +pruning within LLM layers based on multi-modal importance. With a minimalist +design, our method can be applied to both video and image LLMs. Extensive +experiments on diverse video and image benchmarks demonstrate that, our method +substantially reduces computation load (e.g., a $\textbf{7-fold}$ reduction in +FLOPs) while preserving the performance of video and image LLMs. Further, under +a similar computational cost, our method outperforms the state-of-the-art +methods in long video understanding (e.g., $\textbf{+4.6}$ on MLVU). +Additionally, our in-depth analysis provides insights into token redundancy and +LLM layer behaviors, offering guidance for future research in designing +efficient multi-modal LLMs. Our code will be available at +https://github.com/LaVi-Lab/AIM. + +
+
+ comment: 12 pages, 2 figures +
+
+
+
+
+ + ☆ Task-driven Image Fusion with Learnable Fusion Loss + + +
+ Multi-modal image fusion aggregates information from multiple sensor sources, +achieving superior visual quality and perceptual characteristics compared to +any single source, often enhancing downstream tasks. However, current fusion +methods for downstream tasks still use predefined fusion objectives that +potentially mismatch the downstream tasks, limiting adaptive guidance and +reducing model flexibility. To address this, we propose Task-driven Image +Fusion (TDFusion), a fusion framework incorporating a learnable fusion loss +guided by task loss. Specifically, our fusion loss includes learnable +parameters modeled by a neural network called the loss generation module. This +module is supervised by the loss of downstream tasks in a meta-learning manner. +The learning objective is to minimize the task loss of the fused images, once +the fusion module has been optimized by the fusion loss. Iterative updates +between the fusion module and the loss module ensure that the fusion network +evolves toward minimizing task loss, guiding the fusion process toward the task +objectives. TDFusion's training relies solely on the loss of downstream tasks, +making it adaptable to any specific task. It can be applied to any architecture +of fusion and task networks. Experiments demonstrate TDFusion's performance in +both fusion and task-related applications, including four public fusion +datasets, semantic segmentation, and object detection. The code will be +released. + +
+
+
+
+
+ + ☆ MaterialPicker: Multi-Modal Material Generation with Diffusion + Transformers + + +
+ High-quality material generation is key for virtual environment authoring and +inverse rendering. We propose MaterialPicker, a multi-modal material generator +leveraging a Diffusion Transformer (DiT) architecture, improving and +simplifying the creation of high-quality materials from text prompts and/or +photographs. Our method can generate a material based on an image crop of a +material sample, even if the captured surface is distorted, viewed at an angle +or partially occluded, as is often the case in photographs of natural scenes. +We further allow the user to specify a text prompt to provide additional +guidance for the generation. We finetune a pre-trained DiT-based video +generator into a material generator, where each material map is treated as a +frame in a video sequence. We evaluate our approach both quantitatively and +qualitatively and show that it enables more diverse material generation and +better distortion correction than previous work. + +
+
+
+
+
+ + ☆ Beyond [cls]: Exploring the true potential of Masked Image Modeling + representations + + +
+ Masked Image Modeling (MIM) has emerged as a popular method for +Self-Supervised Learning (SSL) of visual representations. However, for +high-level perception tasks, MIM-pretrained models offer lower out-of-the-box +representation quality than the Joint-Embedding Architectures (JEA) - another +prominent SSL paradigm. To understand this performance gap, we analyze the +information flow in Vision Transformers (ViT) learned by both approaches. We +reveal that whereas JEAs construct their representation on a selected set of +relevant image fragments, MIM models aggregate nearly whole image content. +Moreover, we demonstrate that MIM-trained ViTs retain valuable information +within their patch tokens, which is not effectively captured by the global +[cls] token representations. Therefore, selective aggregation of relevant patch +tokens, without any fine-tuning, results in consistently higher-quality of MIM +representations. To our knowledge, we are the first to highlight the lack of +effective representation aggregation as an emergent issue of MIM and propose +directions to address it, contributing to future advances in Self-Supervised +Learning. + +
+
+
+
+
+ + ☆ Continual Low-Rank Scaled Dot-product Attention + + +
+ Transformers are widely used for their ability to capture data relations in +sequence processing, with great success for a wide range of static tasks. +However, the computational and memory footprint of their main component, i.e., +the Scaled Dot-product Attention, is commonly overlooked. This makes their +adoption in applications involving stream data processing with constraints in +response latency, computational and memory resources infeasible. Some works +have proposed methods to lower the computational cost of transformers, i.e. +low-rank approximations, sparsity in attention, and efficient formulations for +Continual Inference. In this paper, we introduce a new formulation of the +Scaled Dot-product Attention based on the Nystr\"om approximation that is +suitable for Continual Inference. In experiments on Online Audio Classification +and Online Action Detection tasks, the proposed Continual Scaled Dot-product +Attention can lower the number of operations by up to three orders of magnitude +compared to the original Transformers while retaining the predictive +performance of competing models. + +
+
+ comment: 11 pages, 7 figures +
+
+
+
+
+ + ☆ Semi-Supervised Transfer Boosting (SS-TrBoosting) + + +
+ Semi-supervised domain adaptation (SSDA) aims at training a high-performance +model for a target domain using few labeled target data, many unlabeled target +data, and plenty of auxiliary data from a source domain. Previous works in SSDA +mainly focused on learning transferable representations across domains. +However, it is difficult to find a feature space where the source and target +domains share the same conditional probability distribution. Additionally, +there is no flexible and effective strategy extending existing unsupervised +domain adaptation (UDA) approaches to SSDA settings. In order to solve the +above two challenges, we propose a novel fine-tuning framework, semi-supervised +transfer boosting (SS-TrBoosting). Given a well-trained deep learning-based UDA +or SSDA model, we use it as the initial model, generate additional base +learners by boosting, and then use all of them as an ensemble. More +specifically, half of the base learners are generated by supervised domain +adaptation, and half by semi-supervised learning. Furthermore, for more +efficient data transmission and better data privacy protection, we propose a +source data generation approach to extend SS-TrBoosting to semi-supervised +source-free domain adaptation (SS-SFDA). Extensive experiments showed that +SS-TrBoosting can be applied to a variety of existing UDA, SSDA and SFDA +approaches to further improve their performance. + +
+
+
+
+
+ + ☆ Parametric Enhancement of PerceptNet: A Human-Inspired Approach for + Image Quality Assessment + + +
+ While deep learning models can learn human-like features at earlier levels, +which suggests their utility in modeling human vision, few attempts exist to +incorporate these features by design. Current approaches mostly optimize all +parameters blindly, only constraining minor architectural aspects. This paper +demonstrates how parametrizing neural network layers enables more +biologically-plausible operations while reducing trainable parameters and +improving interpretability. We constrain operations to functional forms present +in human vision, optimizing only these functions' parameters rather than all +convolutional tensor elements independently. We present two parametric model +versions: one with hand-chosen biologically plausible parameters, and another +fitted to human perception experimental data. We compare these with a +non-parametric version. All models achieve comparable state-of-the-art results, +with parametric versions showing orders of magnitude parameter reduction for +minimal performance loss. The parametric models demonstrate improved +interpretability and training behavior. Notably, the model fitted to human +perception, despite biological initialization, converges to biologically +incorrect results. This raises scientific questions and highlights the need for +diverse evaluation methods to measure models' humanness, rather than assuming +task performance correlates with human-like behavior. + +
+
+
+
+
+ + ☆ Fab-ME: A Vision State-Space and Attention-Enhanced Framework for Fabric + Defect Detection + + +
+ Effective defect detection is critical for ensuring the quality, +functionality, and economic value of textile products. However, existing +methods face challenges in achieving high accuracy, real-time performance, and +efficient global information extraction. To address these issues, we propose +Fab-ME, an advanced framework based on YOLOv8s, specifically designed for the +accurate detection of 20 fabric defect types. Our contributions include the +introduction of the cross-stage partial bottleneck with two convolutions (C2F) +vision state-space (C2F-VMamba) module, which integrates visual state-space +(VSS) blocks into the YOLOv8s feature fusion network neck, enhancing the +capture of intricate details and global context while maintaining high +processing speeds. Additionally, we incorporate an enhanced multi-scale channel +attention (EMCA) module into the final layer of the feature extraction network, +significantly improving sensitivity to small targets. Experimental results on +the Tianchi fabric defect detection dataset demonstrate that Fab-ME achieves a +3.3\% improvement in mAP@0.5 compared to the original YOLOv8s, validating its +effectiveness for precise and efficient fabric defect detection. + +
+
+ comment: 6 pages, 5 figures +
+
+
+
+
+ + ☆ Biologically-inspired Semi-supervised Semantic Segmentation for + Biomedical Imaging + + +
+ We propose a novel two-stage semi-supervised learning approach for training +downsampling-upsampling semantic segmentation architectures. The first stage +does not use backpropagation. Rather, it exploits the bio-inspired Hebbian +principle "fire together, wire together" as a local learning rule for updating +the weights of both convolutional and transpose-convolutional layers, allowing +unsupervised discovery of data features. In the second stage, the model is +fine-tuned with standard backpropagation on a small subset of labeled data. We +evaluate our methodology through experiments conducted on several widely used +biomedical datasets, deeming that this domain is paramount in computer vision +and is notably impacted by data scarcity. Results show that our proposed method +outperforms SOTA approaches across different levels of label availability. +Furthermore, we show that using our unsupervised stage to initialize the SOTA +approaches leads to performance improvements. The code to replicate our +experiments can be found at: +https://github.com/ciampluca/hebbian-medical-image-segmentation + +
+
+
+
+
+ + ☆ Optimizing Dense Visual Predictions Through Multi-Task Coherence and + Prioritization WACV 2025 + + +
+ Multi-Task Learning (MTL) involves the concurrent training of multiple tasks, +offering notable advantages for dense prediction tasks in computer vision. MTL +not only reduces training and inference time as opposed to having multiple +single-task models, but also enhances task accuracy through the interaction of +multiple tasks. However, existing methods face limitations. They often rely on +suboptimal cross-task interactions, resulting in task-specific predictions with +poor geometric and predictive coherence. In addition, many approaches use +inadequate loss weighting strategies, which do not address the inherent +variability in task evolution during training. To overcome these challenges, we +propose an advanced MTL model specifically designed for dense vision tasks. Our +model leverages state-of-the-art vision transformers with task-specific +decoders. To enhance cross-task coherence, we introduce a trace-back method +that improves both cross-task geometric and predictive features. Furthermore, +we present a novel dynamic task balancing approach that projects task losses +onto a common scale and prioritizes more challenging tasks during training. +Extensive experiments demonstrate the superiority of our method, establishing +new state-of-the-art performance across two benchmark datasets. The code is +available at:https://github.com/Klodivio355/MT-CP + +
+
+ comment: Accepted by WACV 2025 +
+
+
+
+
+ + ☆ Towards Understanding and Quantifying Uncertainty for Text-to-Image + Generation + + +
+ Uncertainty quantification in text-to-image (T2I) generative models is +crucial for understanding model behavior and improving output reliability. In +this paper, we are the first to quantify and evaluate the uncertainty of T2I +models with respect to the prompt. Alongside adapting existing approaches +designed to measure uncertainty in the image space, we also introduce +Prompt-based UNCertainty Estimation for T2I models (PUNC), a novel method +leveraging Large Vision-Language Models (LVLMs) to better address uncertainties +arising from the semantics of the prompt and generated images. PUNC utilizes a +LVLM to caption a generated image, and then compares the caption with the +original prompt in the more semantically meaningful text space. PUNC also +enables the disentanglement of both aleatoric and epistemic uncertainties via +precision and recall, which image-space approaches are unable to do. Extensive +experiments demonstrate that PUNC outperforms state-of-the-art uncertainty +estimation techniques across various settings. Uncertainty quantification in +text-to-image generation models can be used on various applications including +bias detection, copyright protection, and OOD detection. We also introduce a +comprehensive dataset of text prompts and generation pairs to foster further +research in uncertainty quantification for generative models. Our findings +illustrate that PUNC not only achieves competitive performance but also enables +novel applications in evaluating and improving the trustworthiness of +text-to-image models. + +
+
+ comment: 28 pages and 22 figures +
+
+
+
+
+ + ☆ PatchDPO: Patch-level DPO for Finetuning-free Personalized Image + Generation + + +
+ Finetuning-free personalized image generation can synthesize customized +images without test-time finetuning, attracting wide research interest owing to +its high efficiency. Current finetuning-free methods simply adopt a single +training stage with a simple image reconstruction task, and they typically +generate low-quality images inconsistent with the reference images during +test-time. To mitigate this problem, inspired by the recent DPO (i.e., direct +preference optimization) technique, this work proposes an additional training +stage to improve the pre-trained personalized generation models. However, +traditional DPO only determines the overall superiority or inferiority of two +samples, which is not suitable for personalized image generation because the +generated images are commonly inconsistent with the reference images only in +some local image patches. To tackle this problem, this work proposes PatchDPO +that estimates the quality of image patches within each generated image and +accordingly trains the model. To this end, PatchDPO first leverages the +pre-trained vision model with a proposed self-supervised training method to +estimate the patch quality. Next, PatchDPO adopts a weighted training approach +to train the model with the estimated patch quality, which rewards the image +patches with high quality while penalizing the image patches with low quality. +Experiment results demonstrate that PatchDPO significantly improves the +performance of multiple pre-trained personalized generation models, and +achieves state-of-the-art performance on both single-object and multi-object +personalized image generation. Our code is available at +https://github.com/hqhQAQ/PatchDPO. + +
+
+
+
+
+ + ☆ IRisPath: Enhancing Off-Road Navigation with Robust IR-RGB Fusion for + Improved Day and Night Traversability + + +
+ Autonomous off-road navigation is required for applications in agriculture,
+construction, search and rescue and defence. Traditional on-road autonomous
+methods struggle with dynamic terrains, leading to poor vehicle control
+off-road. Recent deep-learning models have used perception sensors along with
+kinesthetic feedback for navigation on such terrains. However, this approach
+has out-of-domain uncertainty. Factors like change in weather and time of day
+impact the performance of the model. We propose a multi-modal fusion network
+FuseIsPath capable of using LWIR and RGB images to provide robustness against
+dynamic weather and light conditions. To aid further works in this domain, we
+also open-source a day-night dataset with LWIR and RGB images along with
+pseudo-labels for traversability. In order to co-register the two images we
+developed a novel method for targetless extrinsic calibration of LWIR, LiDAR
+and RGB cameras with translation accuracy of 1.7cm and rotation accuracy of
+0.827 degrees.
+
+
+
+
+
+
+ + ☆ Are Explanations Helpful? A Comparative Analysis of Explainability + Methods in Skin Lesion Classifiers + + +
+ Deep Learning has shown outstanding results in computer vision tasks; +healthcare is no exception. However, there is no straightforward way to expose +the decision-making process of DL models. Good accuracy is not enough for skin +cancer predictions. Understanding the model's behavior is crucial for clinical +application and reliable outcomes. In this work, we identify desiderata for +explanations in skin-lesion models. We analyzed seven methods, four based on +pixel-attribution (Grad-CAM, Score-CAM, LIME, SHAP) and three on high-level +concepts (ACE, ICE, CME), for a deep neural network trained on the +International Skin Imaging Collaboration Archive. Our findings indicate that +while these techniques reveal biases, there is room for improving the +comprehensiveness of explanations to achieve transparency in skin-lesion +models. + +
+
+ comment: 6 pages. Paper accepted at 20th International Symposium on Medical + Information Processing and Analysis (SIPAIM) +
+
+
+
+
+ + ☆ Multi-Level Correlation Network For Few-Shot Image Classification + + +
+ Few-shot image classification(FSIC) aims to recognize novel classes given few +labeled images from base classes. Recent works have achieved promising +classification performance, especially for metric-learning methods, where a +measure at only image feature level is usually used. In this paper, we argue +that measure at such a level may not be effective enough to generalize from +base to novel classes when using only a few images. Instead, a multi-level +descriptor of an image is taken for consideration in this paper. We propose a +multi-level correlation network (MLCN) for FSIC to tackle this problem by +effectively capturing local information. Concretely, we present the +self-correlation module and cross-correlation module to learn the semantic +correspondence relation of local information based on learned representations. +Moreover, we propose a pattern-correlation module to capture the pattern of +fine-grained images and find relevant structural patterns between base classes +and novel classes. Extensive experiments and analysis show the effectiveness of +our proposed method on four widely-used FSIC benchmarks. The code for our +approach is available at: https://github.com/Yunkai696/MLCN. + +
+
+
+
+
+ + ☆ Appearance Matching Adapter for Exemplar-based Semantic Image Synthesis + + +
+ Exemplar-based semantic image synthesis aims to generate images aligned with
+given semantic content while preserving the appearance of an exemplar image.
+Conventional structure-guidance models, such as ControlNet, are limited in that
+they cannot directly utilize exemplar images as input, relying instead solely
+on text prompts to control appearance. Recent tuning-free approaches address
+this limitation by transferring local appearance from the exemplar image to the
+synthesized image through implicit cross-image matching in the augmented
+self-attention mechanism of pre-trained diffusion models. However, these
+methods face challenges when applied to content-rich scenes with significant
+geometric deformations, such as driving scenes. In this paper, we propose the
+Appearance Matching Adapter (AM-Adapter), a learnable framework that enhances
+cross-image matching within augmented self-attention by incorporating semantic
+information from segmentation maps. To effectively disentangle generation and
+matching processes, we adopt a stage-wise training approach. Initially, we
+train the structure-guidance and generation networks, followed by training the
+AM-Adapter while keeping the other networks frozen. During inference, we
+introduce an automated exemplar retrieval method to efficiently select exemplar
+image-segmentation pairs. Despite utilizing a limited number of learnable
+parameters, our method achieves state-of-the-art performance, excelling in both
+semantic alignment preservation and local appearance fidelity. Extensive
+ablation studies further validate our design choices. Code and pre-trained
+weights will be publicly available at: https://cvlab-kaist.github.io/AM-Adapter/
+
+
+
+
+
+
+ + ☆ Splats in Splats: Embedding Invisible 3D Watermark within Gaussian + Splatting + + +
+ 3D Gaussian splatting (3DGS) has demonstrated impressive 3D reconstruction +performance with explicit scene representations. Given the widespread +application of 3DGS in 3D reconstruction and generation tasks, there is an +urgent need to protect the copyright of 3DGS assets. However, existing +copyright protection techniques for 3DGS overlook the usability of 3D assets, +posing challenges for practical deployment. Here we describe WaterGS, the first +3DGS watermarking framework that embeds 3D content in 3DGS itself without +modifying any attributes of the vanilla 3DGS. To achieve this, we take a deep +insight into spherical harmonics (SH) and devise an importance-graded SH +coefficient encryption strategy to embed the hidden SH coefficients. +Furthermore, we employ a convolutional autoencoder to establish a mapping +between the original Gaussian primitives' opacity and the hidden Gaussian +primitives' opacity. Extensive experiments indicate that WaterGS significantly +outperforms existing 3D steganography techniques, with 5.31% higher scene +fidelity and 3X faster rendering speed, while ensuring security, robustness, +and user experience. Codes and data will be released at +https://water-gs.github.io. + +
+
+
+
+
+ + ☆ ObjectFinder: Open-Vocabulary Assistive System for Interactive Object + Search by Blind People + + +
+ Assistive technology can be leveraged by blind people when searching for +objects in their daily lives. We created ObjectFinder, an open-vocabulary +interactive object-search prototype, which combines object detection with scene +description and navigation. It enables blind persons to detect and navigate to +objects of their choice. Our approach used co-design for the development of the +prototype. We further conducted need-finding interviews to better understand +challenges in object search, followed by a study with the ObjectFinder +prototype in a laboratory setting simulating a living room and an office, with +eight blind users. Additionally, we compared the prototype with BeMyEyes and +Lookout for object search. We found that most participants felt more +independent with ObjectFinder and preferred it over the baselines when deployed +on more efficient hardware, as it enhances mental mapping and allows for active +target definition. Moreover, we identified factors for future directions for +the development of object-search systems. + +
+
+
+
+
+ + ☆ Few-Shot Learning with Adaptive Weight Masking in Conditional GANs + + +
+ Deep learning has revolutionized various fields, yet its efficacy is hindered +by overfitting and the requirement of extensive annotated data, particularly in +few-shot learning scenarios where limited samples are available. This paper +introduces a novel approach to few-shot learning by employing a Residual Weight +Masking Conditional Generative Adversarial Network (RWM-CGAN) for data +augmentation. The proposed model integrates residual units within the generator +to enhance network depth and sample quality, coupled with a weight mask +regularization technique in the discriminator to improve feature learning from +small-sample categories. This method addresses the core issues of robustness +and generalization in few-shot learning by providing a controlled and clear +augmentation of the sample space. Extensive experiments demonstrate that +RWM-CGAN not only expands the sample space effectively but also enriches the +diversity and quality of generated samples, leading to significant improvements +in detection and classification accuracy on public datasets. The paper +contributes to the advancement of few-shot learning by offering a practical +solution to the challenges posed by data scarcity and the need for rapid +generalization to new tasks or categories. + +
+
+
+
+
+ + ☆ MultiGO: Towards Multi-level Geometry Learning for Monocular 3D Textured + Human Reconstruction + + +
+ This paper investigates the research task of reconstructing the 3D clothed +human body from a monocular image. Due to the inherent ambiguity of single-view +input, existing approaches leverage pre-trained SMPL(-X) estimation models or +generative models to provide auxiliary information for human reconstruction. +However, these methods capture only the general human body geometry and +overlook specific geometric details, leading to inaccurate skeleton +reconstruction, incorrect joint positions, and unclear cloth wrinkles. In +response to these issues, we propose a multi-level geometry learning framework. +Technically, we design three key components: skeleton-level enhancement, +joint-level augmentation, and wrinkle-level refinement modules. Specifically, +we effectively integrate the projected 3D Fourier features into a Gaussian +reconstruction model, introduce perturbations to improve joint depth estimation +during training, and refine the human coarse wrinkles by resembling the +de-noising process of diffusion model. Extensive quantitative and qualitative +experiments on two out-of-distribution test sets show the superior performance +of our approach compared to state-of-the-art (SOTA) methods. + +
+
+
+
+
+ + ☆ Lightweight Multiplane Images Network for Real-Time Stereoscopic + Conversion from Planar Video + + +
+ With the rapid development of stereoscopic display technologies, especially +glasses-free 3D screens, and virtual reality devices, stereoscopic conversion +has become an important task to address the lack of high-quality stereoscopic +image and video resources. Current stereoscopic conversion algorithms typically +struggle to balance reconstruction performance and inference efficiency. This +paper proposes a planar video real-time stereoscopic conversion network based +on multi-plane images (MPI), which consists of a detail branch for generating +MPI and a depth-semantic branch for perceiving depth information. Unlike models +that depend on explicit depth map inputs, the proposed method employs a +lightweight depth-semantic branch to extract depth-aware features implicitly. +To optimize the lightweight branch, a heavy training but light inference +strategy is adopted, which involves designing a coarse-to-fine auxiliary branch +that is only used during the training stage. In addition, the proposed method +simplifies the MPI rendering process for stereoscopic conversion scenarios to +further accelerate the inference. Experimental results demonstrate that the +proposed method can achieve comparable performance to some state-of-the-art +(SOTA) models and support real-time inference at 2K resolution. Compared to the +SOTA TMPI algorithm, the proposed method obtains similar subjective quality +while achieving over $40\times$ inference acceleration. + +
+
+ comment: 9 pages, 7 figures +
+
+
+
+
+ + ☆ Expanding Event Modality Applications through a Robust CLIP-Based + Encoder + + +
+ This paper introduces a powerful encoder that transfers CLIP's capabilities
+to event-based data, enhancing its utility and expanding its applicability
+across diverse domains. While large-scale datasets have significantly advanced
+image-based models, the scarcity of comprehensive event datasets has limited
+performance potential in event modality. To address this challenge, we adapt
+CLIP's architecture to align event embeddings with image embeddings, supporting
+zero-shot learning and preserving text alignment while mitigating catastrophic
+forgetting. Our encoder achieves strong performance in object recognition, with
+competitive results in zero-shot and few-shot learning tasks. Notably, it
+generalizes effectively to events extracted from video data without requiring
+additional training, highlighting its versatility. Additionally, we integrate
+this encoder within a cross-modality framework that facilitates interaction
+across five modalities—Image, Event, Text, Sound, and Depth—expanding the
+possibilities for cross-modal applications. Overall, this work underscores the
+transformative potential of a robust event encoder, broadening the scope and
+utility of event-based data across various fields.
+
+
+
+
+
+
+ + ☆ Mimir: Improving Video Diffusion Models for Precise Text Understanding + + +
+ Text serves as the key control signal in video generation due to its +narrative nature. To render text descriptions into video clips, current video +diffusion models borrow features from text encoders yet struggle with limited +text comprehension. The recent success of large language models (LLMs) +showcases the power of decoder-only transformers, which offers three clear +benefits for text-to-video (T2V) generation, namely, precise text understanding +resulting from the superior scalability, imagination beyond the input text +enabled by next token prediction, and flexibility to prioritize user interests +through instruction tuning. Nevertheless, the feature distribution gap emerging +from the two different text modeling paradigms hinders the direct use of LLMs +in established T2V models. This work addresses this challenge with Mimir, an +end-to-end training framework featuring a carefully tailored token fuser to +harmonize the outputs from text encoders and LLMs. Such a design allows the T2V +model to fully leverage learned video priors while capitalizing on the +text-related capability of LLMs. Extensive quantitative and qualitative results +demonstrate the effectiveness of Mimir in generating high-quality videos with +excellent text comprehension, especially when processing short captions and +managing shifting motions. Project page: +https://lucaria-academy.github.io/Mimir/ + +
+
+
+
+
+ + ☆ Hybrid deep learning-based strategy for the hepatocellular carcinoma + cancer grade classification of H&E stained liver histopathology images + + +
+ Hepatocellular carcinoma (HCC) is a common type of liver cancer whose +early-stage diagnosis is a common challenge, mainly due to the manual +assessment of hematoxylin and eosin-stained whole slide images, which is a +time-consuming process and may lead to variability in decision-making. For +accurate detection of HCC, we propose a hybrid deep learning-based architecture +that uses transfer learning to extract the features from pre-trained +convolutional neural network (CNN) models and a classifier made up of a +sequence of fully connected layers. This study uses a publicly available The +Cancer Genome Atlas Hepatocellular Carcinoma (TCGA-LIHC)database (n=491) for +model development and database of Kasturba Gandhi Medical College (KMC), India +for validation. The pre-processing step involves patch extraction, colour +normalization, and augmentation that results in 3920 patches for the TCGA +dataset. The developed hybrid deep neural network consisting of a CNN-based +pre-trained feature extractor and a customized artificial neural network-based +classifier is trained using five-fold cross-validation. For this study, eight +different state-of-the-art models are trained and tested as feature extractors +for the proposed hybrid model. The proposed hybrid model with ResNet50-based +feature extractor provided the sensitivity, specificity, F1-score, accuracy, +and AUC of 100.00%, 100.00%, 100.00%, 100.00%, and 1.00, respectively on the +TCGA database. On the KMC database, EfficientNetb3 resulted in the optimal +choice of the feature extractor giving sensitivity, specificity, F1-score, +accuracy, and AUC of 96.97, 98.85, 96.71, 96.71, and 0.99, respectively. The +proposed hybrid models showed improvement in accuracy of 2% and 4% over the +pre-trained models in TCGA-LIHC and KMC databases. + +
+
+ comment: 14 figures, 9 tables
+
+
+
+
+
+ + ☆ Align3R: Aligned Monocular Depth Estimation for Dynamic Videos + + +
+ Recent developments in monocular depth estimation methods enable high-quality +depth estimation of single-view images but fail to estimate consistent video +depth across different frames. Recent works address this problem by applying a +video diffusion model to generate video depth conditioned on the input video, +which is training-expensive and can only produce scale-invariant depth values +without camera poses. In this paper, we propose a novel video-depth estimation +method called Align3R to estimate temporal consistent depth maps for a dynamic +video. Our key idea is to utilize the recent DUSt3R model to align estimated +monocular depth maps of different timesteps. First, we fine-tune the DUSt3R +model with additional estimated monocular depth as inputs for the dynamic +scenes. Then, we apply optimization to reconstruct both depth maps and camera +poses. Extensive experiments demonstrate that Align3R estimates consistent +video depth and camera poses for a monocular video with superior performance +than baseline methods. + +
+
+ comment: Project Page: https://igl-hkust.github.io/Align3R.github.io/ +
+
+
+
+
+ + ☆ RoDyGS: Robust Dynamic Gaussian Splatting for Casual Videos + + +
+ Dynamic view synthesis (DVS) has advanced remarkably in recent years, +achieving high-fidelity rendering while reducing computational costs. Despite +the progress, optimizing dynamic neural fields from casual videos remains +challenging, as these videos do not provide direct 3D information, such as +camera trajectories or the underlying scene geometry. In this work, we present +RoDyGS, an optimization pipeline for dynamic Gaussian Splatting from casual +videos. It effectively learns motion and underlying geometry of scenes by +separating dynamic and static primitives, and ensures that the learned motion +and geometry are physically plausible by incorporating motion and geometric +regularization terms. We also introduce a comprehensive benchmark, Kubric-MRig, +that provides extensive camera and object motion along with simultaneous +multi-view captures, features that are absent in previous benchmarks. +Experimental results demonstrate that the proposed method significantly +outperforms previous pose-free dynamic neural fields and achieves competitive +rendering quality compared to existing pose-free static neural fields. The code +and data are publicly available at https://rodygs.github.io/. + +
+
+ comment: Project Page: https://rodygs.github.io/ +
+
+
+
+
+ + ☆ TokenFlow: Unified Image Tokenizer for Multimodal Understanding and + Generation + + +
+ We present TokenFlow, a novel unified image tokenizer that bridges the +long-standing gap between multimodal understanding and generation. Prior +research attempt to employ a single reconstruction-targeted Vector Quantization +(VQ) encoder for unifying these two tasks. We observe that understanding and +generation require fundamentally different granularities of visual information. +This leads to a critical trade-off, particularly compromising performance in +multimodal understanding tasks. TokenFlow addresses this challenge through an +innovative dual-codebook architecture that decouples semantic and pixel-level +feature learning while maintaining their alignment via a shared mapping +mechanism. This design enables direct access to both high-level semantic +representations crucial for understanding tasks and fine-grained visual +features essential for generation through shared indices. Our extensive +experiments demonstrate TokenFlow's superiority across multiple dimensions. +Leveraging TokenFlow, we demonstrate for the first time that discrete visual +input can surpass LLaVA-1.5 13B in understanding performance, achieving a 7.2\% +average improvement. For image reconstruction, we achieve a strong FID score of +0.63 at 384*384 resolution. Moreover, TokenFlow establishes state-of-the-art +performance in autoregressive image generation with a GenEval score of 0.55 at +256*256 resolution, achieving comparable results to SDXL. + +
+
+ comment: https://byteflow-ai.github.io/TokenFlow/ +
+
+
+
+
+ + ☆ Lightweight Stochastic Video Prediction via Hybrid Warping + + +
+ Accurate video prediction by deep neural networks, especially for dynamic +regions, is a challenging task in computer vision for critical applications +such as autonomous driving, remote working, and telemedicine. Due to inherent +uncertainties, existing prediction models often struggle with the complexity of +motion dynamics and occlusions. In this paper, we propose a novel stochastic +long-term video prediction model that focuses on dynamic regions by employing a +hybrid warping strategy. By integrating frames generated through forward and +backward warpings, our approach effectively compensates for the weaknesses of +each technique, improving the prediction accuracy and realism of moving regions +in videos while also addressing uncertainty by making stochastic predictions +that account for various motions. Furthermore, considering real-time +predictions, we introduce a MobileNet-based lightweight architecture into our +model. Our model, called SVPHW, achieves state-of-the-art performance on two +benchmark datasets. + +
+
+ comment: IEEE VCIP 2024 +
+
+
+
+
+ + ☆ CLAP: Unsupervised 3D Representation Learning for Fusion 3D Perception + via Curvature Sampling and Prototype Learning + + +
+ Unsupervised 3D representation learning via masked-and-reconstruction with
+differentiable rendering is promising to reduce the labeling burden for fusion
+3D perception. However, previous literature conducts pre-training for different
+modalities separately because of the high GPU memory consumption.
+Consequently, the interaction between the two modalities (images and point
+clouds) is neglected during pre-training. In this paper, we explore joint
+unsupervised pre-training for fusion 3D perception via differentiable rendering
+and propose CLAP, short for Curvature sampLing and swApping Prototype
+assignment prediction. The contributions are three-fold. 1) To overcome the GPU
+memory consumption problem, we propose Curvature Sampling to sample the more
+informative points/pixels for pre-training. 2) We propose to use learnable
+prototypes to represent parts of the scenes in a common feature space and bring
+the idea of swapping prototype assignment prediction to learn the interaction
+between the two modalities. 3) To further optimize learnable prototypes, we
+propose an Expectation-Maximization training scheme to maximize the similarity
+between embeddings and prototypes, followed by a Gram Matrix Regularization
+Loss to avoid collapse. Experimental results on NuScenes show that CLAP
+achieves 300% more performance gain compared to the previous SOTA 3D
+pre-training method via differentiable rendering. Codes and models will be
+released.
+
+
+
+
+
+
+ + ☆ Revisiting Energy-Based Model for Out-of-Distribution Detection + + +
+ Out-of-distribution (OOD) detection is an essential approach to robustifying +deep learning models, enabling them to identify inputs that fall outside of +their trained distribution. Existing OOD detection methods usually depend on +crafted data, such as specific outlier datasets or elaborate data +augmentations. While this is reasonable, the frequent mismatch between crafted +data and OOD data limits model robustness and generalizability. In response to +this issue, we introduce Outlier Exposure by Simple Transformations (OEST), a +framework that enhances OOD detection by leveraging "peripheral-distribution" +(PD) data. Specifically, PD data are samples generated through simple data +transformations, thus providing an efficient alternative to manually curated +outliers. + We adopt energy-based models (EBMs) to study PD data. We recognize the +"energy barrier" in OOD detection, which characterizes the energy difference +between in-distribution (ID) and OOD samples and eases detection. PD data are +introduced to establish the energy barrier during training. Furthermore, this +energy barrier concept motivates a theoretically grounded energy-barrier loss +to replace the classical energy-bounded loss, leading to an improved paradigm, +OEST*, which achieves a more effective and theoretically sound separation +between ID and OOD samples. We perform empirical validation of our proposal, +and extensive experiments across various benchmarks demonstrate that OEST* +achieves better or similar accuracy compared with state-of-the-art methods. + +
+
+ comment: This work has been submitted to the IEEE for possible publication +
+
+
+
+
+ + ☆ Point-GN: A Non-Parametric Network Using Gaussian Positional Encoding + for Point Cloud Classification WACV + + +
+ This paper introduces Point-GN, a novel non-parametric network for efficient +and accurate 3D point cloud classification. Unlike conventional deep learning +models that rely on a large number of trainable parameters, Point-GN leverages +non-learnable components-specifically, Farthest Point Sampling (FPS), k-Nearest +Neighbors (k-NN), and Gaussian Positional Encoding (GPE)-to extract both local +and global geometric features. This design eliminates the need for additional +training while maintaining high performance, making Point-GN particularly +suited for real-time, resource-constrained applications. We evaluate Point-GN +on two benchmark datasets, ModelNet40 and ScanObjectNN, achieving +classification accuracies of 85.29% and 85.89%, respectively, while +significantly reducing computational complexity. Point-GN outperforms existing +non-parametric methods and matches the performance of fully trained models, all +with zero learnable parameters. Our results demonstrate that Point-GN is a +promising solution for 3D point cloud classification in practical, real-time +environments. + +
+
+ comment: This paper has been accepted for presentation at the IEEE Winter + Conference on Applications of Computer Vision (WACV) 2025 +
+
+
+
+
+ + ☆ Real-Time AIoT for UAV Antenna Interference Detection via Edge-Cloud + Collaboration + + +
+ In the fifth-generation (5G) era, eliminating communication interference +sources is crucial for maintaining network performance. Interference often +originates from unauthorized or malfunctioning antennas, and radio monitoring +agencies must address numerous sources of such antennas annually. Unmanned +aerial vehicles (UAVs) can improve inspection efficiency. However, the data +transmission delay in the existing cloud-only (CO) artificial intelligence (AI) +mode fails to meet the low latency requirements for real-time performance. +Therefore, we propose a computer vision-based AI of Things (AIoT) system to +detect antenna interference sources for UAVs. The system adopts an optimized +edge-cloud collaboration (ECC+) mode, combining a keyframe selection algorithm +(KSA), focusing on reducing end-to-end latency (E2EL) and ensuring reliable +data transmission, which aligns with the core principles of ultra-reliable +low-latency communication (URLLC). At the core of our approach is an end-to-end +antenna localization scheme based on the tracking-by-detection (TBD) paradigm, +including a detector (EdgeAnt) and a tracker (AntSort). EdgeAnt achieves +state-of-the-art (SOTA) performance with a mean average precision (mAP) of +42.1% on our custom antenna interference source dataset, requiring only 3 +million parameters and 14.7 GFLOPs. On the COCO dataset, EdgeAnt achieves 38.9% +mAP with 5.4 GFLOPs. We deployed EdgeAnt on Jetson Xavier NX (TRT) and +Raspberry Pi 4B (NCNN), achieving real-time inference speeds of 21.1 (1088) and +4.8 (640) frames per second (FPS), respectively. Compared with CO mode, the +ECC+ mode reduces E2EL by 88.9%, increases accuracy by 28.2%. Additionally, the +system offers excellent scalability for coordinated multiple UAVs inspections. +The detector code is publicly available at +https://github.com/SCNU-RISLAB/EdgeAnt. + +
+
+
+
+
+ + ☆ TREND: Unsupervised 3D Representation Learning via Temporal Forecasting + for LiDAR Perception + + +
+ Labeling LiDAR point clouds is notoriously time-and-energy-consuming, which +spurs recent unsupervised 3D representation learning methods to alleviate the +labeling burden in LiDAR perception via pretrained weights. Almost all existing +work focus on a single frame of LiDAR point cloud and neglect the temporal +LiDAR sequence, which naturally accounts for object motion (and their +semantics). Instead, we propose TREND, namely Temporal REndering with Neural +fielD, to learn 3D representation via forecasting the future observation in an +unsupervised manner. Unlike existing work that follows conventional contrastive +learning or masked auto encoding paradigms, TREND integrates forecasting for 3D +pre-training through a Recurrent Embedding scheme to generate 3D embedding +across time and a Temporal Neural Field to represent the 3D scene, through +which we compute the loss using differentiable rendering. To our best +knowledge, TREND is the first work on temporal forecasting for unsupervised 3D +representation learning. We evaluate TREND on downstream 3D object detection +tasks on popular datasets, including NuScenes, Once and Waymo. Experiment +results show that TREND brings up to 90% more improvement as compared to +previous SOTA unsupervised 3D pre-training methods and generally improve +different downstream models across datasets, demonstrating that indeed temporal +forecasting brings improvement for LiDAR perception. Codes and models will be +released. + +
+
+
+
+
+ + ☆ Point-GR: Graph Residual Point Cloud Network for 3D Object + Classification and Segmentation ICPR 2024 + + +
+ In recent years, the challenge of 3D shape analysis within point cloud data +has gathered significant attention in computer vision. Addressing the +complexities of effective 3D information representation and meaningful feature +extraction for classification tasks remains crucial. This paper presents +Point-GR, a novel deep learning architecture designed explicitly to transform +unordered raw point clouds into higher dimensions while preserving local +geometric features. It introduces residual-based learning within the network to +mitigate the point permutation issues in point cloud data. The proposed +Point-GR network significantly reduced the number of network parameters in +Classification and Part-Segmentation compared to baseline graph-based networks. +Notably, the Point-GR model achieves a state-of-the-art scene segmentation mean +IoU of 73.47% on the S3DIS benchmark dataset, showcasing its effectiveness. +Furthermore, the model shows competitive results in Classification and +Part-Segmentation tasks. + +
+
+ comment: ICPR 2024 G2SP-CV Workshop, Dec 1-5, 2024 Kolkata, India +
+
+
+
+
+ + ☆ Frequency-Guided Diffusion Model with Perturbation Training for + Skeleton-Based Video Anomaly Detection + + +
+ Video anomaly detection is an essential yet challenging open-set task in +computer vision, often addressed by leveraging reconstruction as a proxy task. +However, existing reconstruction-based methods encounter challenges in two main +aspects: (1) limited model robustness for open-set scenarios, (2) and an +overemphasis on, but restricted capacity for, detailed motion reconstruction. +To this end, we propose a novel frequency-guided diffusion model with +perturbation training, which enhances the model robustness by perturbation +training and emphasizes the principal motion components guided by motion +frequencies. Specifically, we first use a trainable generator to produce +perturbative samples for perturbation training of the diffusion model. During +the perturbation training phase, the model robustness is enhanced and the +domain of the reconstructed model is broadened by training against this +generator. Subsequently, perturbative samples are introduced for inference, +which impacts the reconstruction of normal and abnormal motions differentially, +thereby enhancing their separability. Considering that motion details originate +from high-frequency information, we propose a masking method based on 2D +discrete cosine transform to separate high-frequency information and +low-frequency information. Guided by the high-frequency information from +observed motion, the diffusion model can focus on generating low-frequency +information, and thus reconstructing the motion accurately. Experimental +results on five video anomaly detection datasets, including human-related and +open-set benchmarks, demonstrate the effectiveness of the proposed method. Our +code is available at https://github.com/Xiaofeng-Tan/FGDMAD-Code. + +
+
+
+
+
+ + ♻ ☆ Yo'LLaVA: Your Personalized Language and Vision Assistant NeurIPS 2024 + + +
+ Large Multimodal Models (LMMs) have shown remarkable capabilities across a +variety of tasks (e.g., image captioning, visual question answering). While +broad, their knowledge remains generic (e.g., recognizing a dog), and they are +unable to handle personalized subjects (e.g., recognizing a user's pet dog). +Human reasoning, in contrast, typically operates within the context of specific +subjects in our surroundings. For example, one might ask, "What should I buy +for my dog's birthday?"; as opposed to a generic inquiry about "What should I +buy for a dog's birthday?". Similarly, when looking at a friend's image, the +interest lies in seeing their activities (e.g., "my friend is holding a cat"), +rather than merely observing generic human actions (e.g., "a man is holding a +cat"). In this paper, we introduce the novel task of personalizing LMMs, so +that they can have conversations about a specific subject. We propose Yo'LLaVA, +which learns to embed a personalized subject into a set of latent tokens given +a handful of example images of the subject. Our qualitative and quantitative +analyses reveal that Yo'LLaVA can learn the concept more efficiently using +fewer tokens and more effectively encode the visual attributes compared to +strong prompting baselines (e.g., LLaVA). + +
+
+ comment: NeurIPS 2024; Project page: https://thaoshibe.github.io/YoLLaVA +
+
+
+
+
+ + ♻ ☆ VoxNeRF: Bridging Voxel Representation and Neural Radiance Fields for + Enhanced Indoor View Synthesis + + +
+ The generation of high-fidelity view synthesis is essential for robotic +navigation and interaction but remains challenging, particularly in indoor +environments and real-time scenarios. Existing techniques often require +significant computational resources for both training and rendering, and they +frequently result in suboptimal 3D representations due to insufficient +geometric structuring. To address these limitations, we introduce VoxNeRF, a +novel approach that utilizes easy-to-obtain geometry priors to enhance both the +quality and efficiency of neural indoor reconstruction and novel view +synthesis. We propose an efficient voxel-guided sampling technique that +allocates computational resources selectively to the most relevant segments of +rays based on a voxel-encoded geometry prior, significantly reducing training +and rendering time. Additionally, we incorporate a robust depth loss to improve +reconstruction and rendering quality in sparse view settings. Our approach is +validated with extensive experiments on ScanNet and ScanNet++ where VoxNeRF +outperforms existing state-of-the-art methods and establishes a new benchmark +for indoor immersive interpolation and extrapolation settings. + +
+
+ comment: 8 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ StarVector: Generating Scalable Vector Graphics Code from Images and + Text + + +
+ Scalable Vector Graphics (SVGs) are vital for modern image rendering due to +their scalability and versatility. Previous SVG generation methods have focused +on curve-based vectorization, lacking semantic understanding, often producing +artifacts, and struggling with SVG primitives beyond path curves. To address +these issues, we introduce StarVector, a multimodal large language model for +SVG generation. It performs image vectorization by understanding image +semantics and using SVG primitives for compact, precise outputs. Unlike +traditional methods, StarVector works directly in the SVG code space, +leveraging visual understanding to apply accurate SVG primitives. To train +StarVector, we create SVG-Stack, a diverse dataset of 2M samples that enables +generalization across vectorization tasks and precise use of primitives like +ellipses, polygons, and text. We address challenges in SVG evaluation, showing +that pixel-based metrics like MSE fail to capture the unique qualities of +vector graphics. We introduce SVG-Bench, a benchmark across 10 datasets, and 3 +tasks: Image-to-SVG, Text-to-SVG generation, and diagram generation. Using this +setup, StarVector achieves state-of-the-art performance, producing more compact +and semantically rich SVGs. + +
+
+
+
+
+ + ♻ ☆ Instance-Warp: Saliency Guided Image Warping for Unsupervised Domain + Adaptation WACV 2025 + + +
+ Driving is challenging in conditions like night, rain, and snow. Lack of good +labeled datasets has hampered progress in scene understanding under such +conditions. Unsupervised Domain Adaptation (UDA) using large labeled clear-day +datasets is a promising research direction in such cases. However, many UDA +methods are trained with dominant scene backgrounds (e.g., roads, sky, +sidewalks) that appear dramatically different across domains. As a result, they +struggle to learn effective features of smaller and often sparse foreground +objects (e.g., people, vehicles, signs). + In this work, we improve UDA training by applying in-place image warping to +focus on salient objects. We design instance-level saliency guidance to +adaptively oversample object regions and undersample background areas, which +reduces adverse effects from background context and enhances backbone feature +learning. Our approach improves adaptation across geographies, lighting, and +weather conditions, and is agnostic to the task (segmentation, detection), +domain adaptation algorithm, saliency guidance, and underlying model +architecture. Result highlights include +6.1 mAP50 for BDD100K Clear +$\rightarrow$ DENSE Foggy, +3.7 mAP50 for BDD100K Day $\rightarrow$ Night, +3.0 +mAP50 for BDD100K Clear $\rightarrow$ Rainy, and +6.3 mIoU for Cityscapes +$\rightarrow$ ACDC. Besides, our method adds minimal training memory and no +additional inference latency. Code is available at +https://github.com/ShenZheng2000/Instance-Warp + +
+
+ comment: WACV 2025 Accepted Paper +
+
+
+
+
+ + ♻ ☆ Boosting Latent Diffusion with Flow Matching ECCV 2024 + + +
+ Visual synthesis has recently seen significant leaps in performance, largely +due to breakthroughs in generative models. Diffusion models have been a key +enabler, as they excel in image diversity. However, this comes at the cost of +slow training and synthesis, which is only partially alleviated by latent +diffusion. To this end, flow matching is an appealing approach due to its +complementary characteristics of faster training and inference but less diverse +synthesis. We demonstrate that introducing flow matching between a frozen +diffusion model and a convolutional decoder enables high-resolution image +synthesis at reduced computational cost and model size. A small diffusion model +can then effectively provide the necessary visual diversity, while flow +matching efficiently enhances resolution and detail by mapping the small to a +high-dimensional latent space. These latents are then projected to +high-resolution images by the subsequent convolutional decoder of the latent +diffusion approach. Combining the diversity of diffusion models, the efficiency +of flow matching, and the effectiveness of convolutional decoders, +state-of-the-art high-resolution image synthesis is achieved at $1024^2$ pixels +with minimal computational cost. Further scaling up our method we can reach +resolutions up to $2048^2$ pixels. Importantly, our approach is orthogonal to +recent approximation and speed-up strategies for the underlying model, making +it easily integrable into the various diffusion model frameworks. + +
+
+ comment: ECCV 2024 (Oral), Project Page: + https://compvis.github.io/fm-boosting/ +
+
+
+
+
+ + ♻ ☆ Cambrian-1: A Fully Open, Vision-Centric Exploration of Multimodal LLMs NeurIPS 2024 + + +
+ We introduce Cambrian-1, a family of multimodal LLMs (MLLMs) designed with a +vision-centric approach. While stronger language models can enhance multimodal +capabilities, the design choices for vision components are often insufficiently +explored and disconnected from visual representation learning research. This +gap hinders accurate sensory grounding in real-world scenarios. Our study uses +LLMs and visual instruction tuning as an interface to evaluate various visual +representations, offering new insights into different models and architectures +-- self-supervised, strongly supervised, or combinations thereof -- based on +experiments with over 20 vision encoders. We critically examine existing MLLM +benchmarks, address the difficulties involved in consolidating and interpreting +results from various tasks, and introduce a new vision-centric benchmark, +CV-Bench. To further improve visual grounding, we propose the Spatial Vision +Aggregator (SVA), a dynamic and spatially-aware connector that integrates +high-resolution vision features with LLMs while reducing the number of tokens. +Additionally, we discuss the curation of high-quality visual instruction-tuning +data from publicly available sources, emphasizing the importance of data source +balancing and distribution ratio. Collectively, Cambrian-1 not only achieves +state-of-the-art performance but also serves as a comprehensive, open cookbook +for instruction-tuned MLLMs. We provide model weights, code, supporting tools, +datasets, and detailed instruction-tuning and evaluation recipes. We hope our +release will inspire and accelerate advancements in multimodal systems and +visual representation learning. + +
+
+ comment: NeurIPS 2024 (Oral). Website at https://cambrian-mllm.github.io +
+
+
+
+
+ + ♻ ☆ SR+Codec: a Benchmark of Super-Resolution for Video Compression Bitrate + Reduction + + +
+ In recent years, there has been significant interest in Super-Resolution +(SR), which focuses on generating a high-resolution image from a low-resolution +input. Deep learning-based methods for super-resolution have been particularly +popular and have shown impressive results on various benchmarks. However, +research indicates that these methods may not perform as well on strongly +compressed videos. We developed a super-resolution benchmark to analyze SR's +capacity to upscale compressed videos. Our dataset employed video codecs based +on five widely-used compression standards: H.264, H.265, H.266, AV1, and AVS3. +We assessed 19 popular SR models using our benchmark and evaluated their +ability to restore details and their susceptibility to compression artifacts. +To get an accurate perceptual ranking of SR models, we conducted a +crowd-sourced side-by-side comparison of their outputs. We found that some SR +models, combined with compression, allow us to reduce the video bitrate without +significant loss of quality. We also compared a range of image and video +quality metrics with subjective scores to evaluate their accuracy on +super-resolved compressed videos. The benchmark is publicly available at +https://videoprocessing.ai/benchmarks/super-resolution-for-video-compression.html + +
+
+
+
+
+ + ♻ ☆ Coverage-Constrained Human-AI Cooperation with Multiple Experts + + +
+ Human-AI cooperative classification (HAI-CC) approaches aim to develop hybrid +intelligent systems that enhance decision-making in various high-stakes +real-world scenarios by leveraging both human expertise and AI capabilities. +Current HAI-CC methods primarily focus on learning-to-defer (L2D), where +decisions are deferred to human experts, and learning-to-complement (L2C), +where AI and human experts make predictions cooperatively. However, a notable +research gap remains in effectively exploring both L2D and L2C under diverse +expert knowledge to improve decision-making, particularly when constrained by +the cooperation cost required to achieve a target probability for AI-only +selection (i.e., coverage). In this paper, we address this research gap by +proposing the Coverage-constrained Learning to Defer and Complement with +Specific Experts (CL2DC) method. CL2DC makes final decisions through either AI +prediction alone or by deferring to or complementing a specific expert, +depending on the input data. Furthermore, we propose a coverage-constrained +optimisation to control the cooperation cost, ensuring it approximates a target +probability for AI-only selection. This approach enables an effective +assessment of system performance within a specified budget. Also, CL2DC is +designed to address scenarios where training sets contain multiple noisy-label +annotations without any clean-label references. Comprehensive evaluations on +both synthetic and real-world datasets demonstrate that CL2DC achieves superior +performance compared to state-of-the-art HAI-CC methods. + +
+
+
+
+
+ + ♻ ☆ Cross-View-Prediction: Exploring Contrastive Feature for Hyperspectral + Image Classification + + +
+ This paper presents a self-supervised feature learning method for +hyperspectral image classification. Our method tries to construct two different +views of the raw hyperspectral image through a cross-representation learning +method. And then to learn semantically consistent representation over the +created views by contrastive learning method. Specifically, four +cross-channel-prediction based augmentation methods are naturally designed to +utilize the high dimension characteristic of hyperspectral data for the view +construction. And the better representative features are learned by maximizing +mutual information and minimizing conditional entropy across different views +from our contrastive network. This 'Cross-View-Prediction' style is +straightforward and gets the state-of-the-art performance of unsupervised +classification with a simple SVM classifier. + +
+
+
+
+
+ + ♻ ☆ GaussianBeV: 3D Gaussian Representation meets Perception Models for BeV + Segmentation WACV 2025 + + +
+ The Bird's-eye View (BeV) representation is widely used for 3D perception +from multi-view camera images. It allows to merge features from different +cameras into a common space, providing a unified representation of the 3D +scene. The key component is the view transformer, which transforms image views +into the BeV. However, actual view transformer methods based on geometry or +cross-attention do not provide a sufficiently detailed representation of the +scene, as they use a sub-sampling of the 3D space that is non-optimal for +modeling the fine structures of the environment. In this paper, we propose +GaussianBeV, a novel method for transforming image features to BeV by finely +representing the scene using a set of 3D gaussians located and oriented in 3D +space. This representation is then splattered to produce the BeV feature map by +adapting recent advances in 3D representation rendering based on gaussian +splatting. GaussianBeV is the first approach to use this 3D gaussian modeling +and 3D scene rendering process online, i.e. without optimizing it on a specific +scene and directly integrated into a single stage model for BeV scene +understanding. Experiments show that the proposed representation is highly +effective and place GaussianBeV as the new state-of-the-art on the BeV semantic +segmentation task on the nuScenes dataset. + +
+
+ comment: Accepted to WACV 2025 +
+
+
+
+
+ + ♻ ☆ GenMix: Effective Data Augmentation with Generative Diffusion Model + Image Editing + + +
+ Data augmentation is widely used to enhance generalization in visual +classification tasks. However, traditional methods struggle when source and +target domains differ, as in domain adaptation, due to their inability to +address domain gaps. This paper introduces GenMix, a generalizable +prompt-guided generative data augmentation approach that enhances both +in-domain and cross-domain image classification. Our technique leverages image +editing to generate augmented images based on custom conditional prompts, +designed specifically for each problem type. By blending portions of the input +image with its edited generative counterpart and incorporating fractal +patterns, our approach mitigates unrealistic images and label ambiguity, +improving the performance and adversarial robustness of the resulting models. +Efficacy of our method is established with extensive experiments on eight +public datasets for general and fine-grained classification, in both in-domain +and cross-domain settings. Additionally, we demonstrate performance +improvements for self-supervised learning, learning with data scarcity, and +adversarial robustness. As compared to the existing state-of-the-art methods, +our technique achieves stronger performance across the board. + +
+
+ comment: https://diffusemix.github.io/ +
+
+
+
+
+ + ♻ ☆ Deferred Poisoning: Making the Model More Vulnerable via Hessian + Singularization + + +
+ Recent studies have shown that deep learning models are very vulnerable to +poisoning attacks. Many defense methods have been proposed to address this +issue. However, traditional poisoning attacks are not as threatening as +commonly believed. This is because they often cause differences in how the +model performs on the training set compared to the validation set. Such +inconsistency can alert defenders that their data has been poisoned, allowing +them to take the necessary defensive actions. In this paper, we introduce a +more threatening type of poisoning attack called the Deferred Poisoning Attack. +This new attack allows the model to function normally during the training and +validation phases but makes it very sensitive to evasion attacks or even +natural noise. We achieve this by ensuring the poisoned model's loss function +has a similar value as a normally trained model at each input sample but with a +large local curvature. A similar model loss ensures that there is no obvious +inconsistency between the training and validation accuracy, demonstrating high +stealthiness. On the other hand, the large curvature implies that a small +perturbation may cause a significant increase in model loss, leading to +substantial performance degradation, which reflects a worse robustness. We +fulfill this purpose by making the model have singular Hessian information at +the optimal point via our proposed Singularization Regularization term. We have +conducted both theoretical and empirical analyses of the proposed method and +validated its effectiveness through experiments on image classification tasks. +Furthermore, we have confirmed the hazards of this form of poisoning attack +under more general scenarios using natural noise, offering a new perspective +for research in the field of security. + +
+
+
+
+
+ + ♻ ☆ Defending Against Repetitive Backdoor Attacks on Semi-supervised + Learning through Lens of Rate-Distortion-Perception Trade-off WACV 2025 + + +
+ Semi-supervised learning (SSL) has achieved remarkable performance with a +small fraction of labeled data by leveraging vast amounts of unlabeled data +from the Internet. However, this large pool of untrusted data is extremely +vulnerable to data poisoning, leading to potential backdoor attacks. Current +backdoor defenses are not yet effective against such a vulnerability in SSL. In +this study, we propose a novel method, Unlabeled Data Purification (UPure), to +disrupt the association between trigger patterns and target classes by +introducing perturbations in the frequency domain. By leveraging the +Rate-Distortion-Perception (RDP) trade-off, we further identify the frequency +band, where the perturbations are added, and justify this selection. Notably, +UPure purifies poisoned unlabeled data without the need of extra clean labeled +data. Extensive experiments on four benchmark datasets and five SSL algorithms +demonstrate that UPure effectively reduces the attack success rate from 99.78% +to 0% while maintaining model accuracy. Code is available here: +\url{https://github.com/chengyi-chris/UPure}. + +
+
+ comment: Accepted by WACV 2025 +
+
+
+
+
+ + ♻ ☆ LLM as a Complementary Optimizer to Gradient Descent: A Case Study in + Prompt Tuning + + +
+ Mastering a skill generally relies on both hands-on experience from doers and +insightful, high-level guidance by mentors. Will this strategy also work well +for solving complex non-convex optimization problems? Here, a common +gradient-based optimizer acts like a disciplined doer, making locally optimal +updates at each step. Large Language Models (LLMs) can also search for better +solutions by inferring from natural language instructions, akin to a high-level +mentor. In this paper, we show that these two participators are complementary +to each other and can effectively collaborate as a combined optimization +framework. The collaborative optimization is achieved by alternating between +the gradient-based and LLM-based optimizers. We instruct LLMs to generate +possibly improved solutions by taking parameter trajectories recorded during +the previous stage of gradient-based optimization into account. Inferred +results of LLMs are used as restarting points for the next stage of gradient +optimization. We verify the effectiveness of this optimization framework on +prompt tuning. By leveraging both the locally rigorous gradient-based optimizer +and the high-level deductive LLM-based optimizer, the combined optimization +method consistently yields improvements over competitive baselines on a variety +of tasks. Our results demonstrate the synergistic effect of conventional +gradient-based optimization and the inference ability of LLMs. The code is +released at https://github.com/guozix/LLM-catalyst. + +
+
+
+
+
+ + ♻ ☆ Functionality understanding and segmentation in 3D scenes + + +
+ Understanding functionalities in 3D scenes involves interpreting natural +language descriptions to locate functional interactive objects, such as handles +and buttons, in a 3D environment. Functionality understanding is highly +challenging, as it requires both world knowledge to interpret language and +spatial perception to identify fine-grained objects. For example, given a task +like 'turn on the ceiling light', an embodied AI agent must infer that it needs +to locate the light switch, even though the switch is not explicitly mentioned +in the task description. To date, no dedicated methods have been developed for +this problem. In this paper, we introduce Fun3DU, the first approach designed +for functionality understanding in 3D scenes. Fun3DU uses a language model to +parse the task description through Chain-of-Thought reasoning in order to +identify the object of interest. The identified object is segmented across +multiple views of the captured scene by using a vision and language model. The +segmentation results from each view are lifted in 3D and aggregated into the +point cloud using geometric information. Fun3DU is training-free, relying +entirely on pre-trained models. We evaluate Fun3DU on SceneFun3D, the most +recent and only dataset to benchmark this task, which comprises over 3000 task +descriptions on 230 scenes. Our method significantly outperforms +state-of-the-art open-vocabulary 3D segmentation approaches. Project page: +https://jcorsetti.github.io/fun3du + +
+
+ comment: Technical report. 20 pages, 12 figures, 7 tables. Fixed main diagram +
+
+
+
+
+ + ♻ ☆ Generative Photography: Scene-Consistent Camera Control for Realistic + Text-to-Image Synthesis + + +
+ Image generation today can produce somewhat realistic images from text +prompts. However, if one asks the generator to synthesize a particular camera +setting such as creating different fields of view using a 24mm lens versus a +70mm lens, the generator will not be able to interpret and generate +scene-consistent images. This limitation not only hinders the adoption of +generative tools in photography applications but also exemplifies a broader +issue of bridging the gap between the data-driven models and the physical +world. In this paper, we introduce the concept of Generative Photography, a +framework designed to control camera intrinsic settings during content +generation. The core innovations of this work are the concepts of Dimensionality +Lifting and Contrastive Camera Learning, which achieve continuous and +consistent transitions for different camera settings. Experimental results show +that our method produces significantly more scene-consistent photorealistic +images than state-of-the-art models such as Stable Diffusion 3 and FLUX. + +
+
+ comment: Project page: https://generative-photography.github.io/project/ +
+
+
+
+
+ + ♻ ☆ Analysis of Classifier-Free Guidance Weight Schedulers + + +
+ Classifier-Free Guidance (CFG) enhances the quality and condition adherence +of text-to-image diffusion models. It operates by combining the conditional and +unconditional predictions using a fixed weight. However, recent works vary the +weights throughout the diffusion process, reporting superior results but +without providing any rationale or analysis. By conducting comprehensive +experiments, this paper provides insights into CFG weight schedulers. Our +findings suggest that simple, monotonically increasing weight schedulers +consistently lead to improved performances, requiring merely a single line of +code. In addition, more complex parametrized schedulers can be optimized for +further improvement, but do not generalize across different models and tasks. + +
+
+
+
+
+ + ♻ ☆ OpenDriver: An Open-Road Driver State Detection Dataset + + +
+ Among numerous studies for driver state detection, wearable physiological +measurements offer a practical method for real-time monitoring. However, there +are few driver physiological datasets in open-road scenarios, and the existing +datasets suffer from issues such as poor signal quality, small sample sizes, +and short data collection periods. Therefore, in this paper, a large-scale +multimodal driving dataset, OpenDriver, for driver state detection is +developed. The OpenDriver encompasses a total of 3,278 driving trips, with a +signal collection duration spanning approximately 4,600 hours. Two modalities +of driving signals are enrolled in OpenDriver: electrocardiogram (ECG) signals +and six-axis motion data of the steering wheel from a motion measurement unit +(IMU), which were recorded from 81 drivers and their vehicles. Furthermore, +three challenging tasks are involved in our work, namely ECG signal quality +assessment, individual biometric identification based on ECG signals, and +physiological signal analysis in complex driving environments. To facilitate +research in these tasks, corresponding benchmarks have also been introduced. +First, a noisy augmentation strategy is applied to generate a larger-scale ECG +signal dataset with realistic noise simulation for quality assessment. Second, +an end-to-end contrastive learning framework is employed for individual +biometric identification. Finally, a comprehensive analysis of drivers' HRV +features under different driving conditions is conducted. Each benchmark +provides evaluation metrics and reference results. The OpenDriver dataset will +be publicly available at https://github.com/bdne/OpenDriver. + +
+
+ comment: Considering that there are flaws in the statistical data of the + dataset, all the authors agreed to withdraw the manuscript +
+
+
+
+
+ + ♻ ☆ A Spatio-Temporal Representation Learning as an Alternative to + Traditional Glosses in Sign Language Translation and Production WACV 2025 + + +
+ This work addresses the challenges associated with the use of glosses in both +Sign Language Translation (SLT) and Sign Language Production (SLP). While +glosses have long been used as a bridge between sign language and spoken +language, they come with two major limitations that impede the advancement of +sign language systems. First, annotating the glosses is a labor-intensive and +time-consuming process, which limits the scalability of datasets. Second, the +glosses oversimplify sign language by stripping away its spatio-temporal +dynamics, reducing complex signs to basic labels and missing the subtle +movements essential for precise interpretation. To address these limitations, +we introduce Universal Gloss-level Representation (UniGloR), a framework +designed to capture the spatio-temporal features inherent in sign language, +providing a more dynamic and detailed alternative to the use of the glosses. +The core idea of UniGloR is simple yet effective: We derive dense +spatio-temporal representations from sign keypoint sequences using +self-supervised learning and seamlessly integrate them into SLT and SLP tasks. +Our experiments in a keypoint-based setting demonstrate that UniGloR either +outperforms or matches the performance of previous SLT and SLP methods on two +widely-used datasets: PHOENIX14T and How2Sign. + +
+
+ comment: Accepted at WACV 2025 +
+
+
+
+
+ + ♻ ☆ Rethinking Token Reduction in MLLMs: Towards a Unified Paradigm for + Training-Free Acceleration + + +
+ To accelerate the inference of heavy Multimodal Large Language Models +(MLLMs), this study rethinks the current landscape of training-free token +reduction research. We regret to find that the critical components of existing +methods are tightly intertwined, with their interconnections and effects +remaining unclear for comparison, transfer, and expansion. Therefore, we +propose a unified ''filter-correlate-compress'' paradigm that decomposes the +token reduction into three distinct stages within a pipeline, maintaining +consistent design objectives and elements while allowing for unique +implementations. We additionally demystify the popular works and subsume them +into our paradigm to showcase its universality. Finally, we offer a suite of +methods grounded in the paradigm, striking a balance between speed and accuracy +throughout different phases of the inference. Experimental results across 10 +benchmarks indicate that our methods can achieve up to an 82.4% reduction in +FLOPs with a minimal impact on performance, simultaneously surpassing +state-of-the-art training-free methods. Our project page is at +https://ficoco-accelerate.github.io/. + +
+
+
+
+
+ + ♻ ☆ CamI2V: Camera-Controlled Image-to-Video Diffusion Model + + +
+ Recent advancements have integrated camera pose as a user-friendly and +physics-informed condition in video diffusion models, enabling precise camera +control. In this paper, we identify one of the key challenges as effectively +modeling noisy cross-frame interactions to enhance geometry consistency and +camera controllability. We innovatively associate the quality of a condition +with its ability to reduce uncertainty and interpret noisy cross-frame features +as a form of noisy condition. Recognizing that noisy conditions provide +deterministic information while also introducing randomness and potential +misguidance due to added noise, we propose applying epipolar attention to only +aggregate features along corresponding epipolar lines, thereby accessing an +optimal amount of noisy conditions. Additionally, we address scenarios where +epipolar lines disappear, commonly caused by rapid camera movements, dynamic +objects, or occlusions, ensuring robust performance in diverse environments. +Furthermore, we develop a more robust and reproducible evaluation pipeline to +address the inaccuracies and instabilities of existing camera control metrics. +Our method achieves a 25.64% improvement in camera controllability on the +RealEstate10K dataset without compromising dynamics or generation quality and +demonstrates strong generalization to out-of-domain images. Training and +inference require only 24GB and 12GB of memory, respectively, for 16-frame +sequences at 256x256 resolution. We will release all checkpoints, along with +training and evaluation code. Dynamic videos are best viewed at +https://zgctroy.github.io/CamI2V. + +
+
+
+
+
+ + ♻ ☆ StructChart: On the Schema, Metric, and Augmentation for Visual Chart + Understanding + + +
+ Charts are common in literature across various scientific fields, conveying
+rich information easily accessible to readers. Current chart-related tasks
+focus on either chart perception that extracts information from the visual
+charts, or chart reasoning given the extracted data, e.g. in a tabular form. In
+this paper, we introduce StructChart, a novel framework that leverages
+Structured Triplet Representations (STR) to achieve a unified and
+label-efficient approach to chart perception and reasoning tasks, which is
+generally applicable to different downstream tasks, beyond the
+question-answering task as specifically studied in peer works. Specifically,
+StructChart first reformulates the chart data from the tabular form (linearized
+CSV) to STR, which can friendlily reduce the task gap between chart perception
+and reasoning. We then propose a Structuring Chart-oriented Representation
+Metric (SCRM) to quantitatively evaluate the chart perception task performance.
+To augment the training, we further explore the potential of Large Language
+Models (LLMs) to enhance the diversity in both chart visual style and
+statistical information. Extensive experiments on various chart-related tasks
+demonstrate the effectiveness and potential of a unified chart
+perception-reasoning paradigm to push the frontier of chart understanding.

+
+
+ comment: All codes, models and SimChart9K data are available for downloading + at: https://github.com/UniModal4Reasoning/ChartVLM and + https://github.com/UniModal4Reasoning/SimChart9K +
+
+
+
+
+ + ♻ ☆ How to Segment in 3D Using 2D Models: Automated 3D Segmentation of + Prostate Cancer Metastatic Lesions on PET Volumes Using Multi-angle Maximum + Intensity Projections and Diffusion Models MICCAI + + +
+ Prostate specific membrane antigen (PSMA) positron emission +tomography/computed tomography (PET/CT) imaging provides a tremendously +exciting frontier in visualization of prostate cancer (PCa) metastatic lesions. +However, accurate segmentation of metastatic lesions is challenging due to low +signal-to-noise ratios and variable sizes, shapes, and locations of the +lesions. This study proposes a novel approach for automated segmentation of +metastatic lesions in PSMA PET/CT 3D volumetric images using 2D denoising +diffusion probabilistic models (DDPMs). Instead of 2D trans-axial slices or 3D +volumes, the proposed approach segments the lesions on generated multi-angle +maximum intensity projections (MA-MIPs) of the PSMA PET images, then obtains +the final 3D segmentation masks from 3D ordered subset expectation maximization +(OSEM) reconstruction of 2D MA-MIPs segmentations. Our proposed method achieved +superior performance compared to state-of-the-art 3D segmentation approaches in +terms of accuracy and robustness in detecting and segmenting small metastatic +PCa lesions. The proposed method has significant potential as a tool for +quantitative analysis of metastatic burden in PCa patients. + +
+
+ comment: 11 pages, 2 figures, accepted in the DGM4MICCAI workshop, MICCAI, + 2024 +
+
+
+
+
+ + ♻ ☆ ROSE: Revolutionizing Open-Set Dense Segmentation with Patch-Wise + Perceptual Large Multimodal Model + + +
+ Advances in CLIP and large multimodal models (LMMs) have enabled +open-vocabulary and free-text segmentation, yet existing models still require +predefined category prompts, limiting free-form category self-generation. Most +segmentation LMMs also remain confined to sparse predictions, restricting their +applicability in open-set environments. In contrast, we propose ROSE, a +Revolutionary Open-set dense SEgmentation LMM, which enables dense mask +prediction and open-category generation through patch-wise perception. Our +method treats each image patch as an independent region of interest candidate, +enabling the model to predict both dense and sparse masks simultaneously. +Additionally, a newly designed instruction-response paradigm takes full +advantage of the generation and generalization capabilities of LMMs, achieving +category prediction independent of closed-set constraints or predefined +categories. To further enhance mask detail and category precision, we introduce +a conversation-based refinement paradigm, integrating the prediction result +from previous step with textual prompt for revision. Extensive experiments +demonstrate that ROSE achieves competitive performance across various +segmentation tasks in a unified framework. Code will be released. + +
+
+
+
+
+ + ♻ ☆ Monocular Lane Detection Based on Deep Learning: A Survey + + +
+ Lane detection plays an important role in autonomous driving perception +systems. As deep learning algorithms gain popularity, monocular lane detection +methods based on them have demonstrated superior performance and emerged as a +key research direction in autonomous driving perception. The core designs of +these algorithmic frameworks can be summarized as follows: (1) Task paradigm, +focusing on lane instance-level discrimination; (2) Lane modeling, representing +lanes as a set of learnable parameters in the neural network; (3) Global +context supplementation, enhancing inference on the obscure lanes; (4) +Perspective effect elimination, providing accurate 3D lanes for downstream +applications. From these perspectives, this paper presents a comprehensive +overview of existing methods, encompassing both the increasingly mature 2D lane +detection approaches and the developing 3D lane detection works. Besides, this +paper compares the performance of mainstream methods on different benchmarks +and investigates their inference speed under a unified setting for fair +comparison. Moreover, we present some extended works on lane detection, +including multi-task perception, video lane detection, online high-definition +map construction, and lane topology reasoning, to offer readers a comprehensive +roadmap for the evolution of lane detection. Finally, we point out some +potential future research directions in this field. We exhaustively collect the +papers and codes of existing works at +https://github.com/Core9724/Awesome-Lane-Detection and will keep tracing the +research. + +
+
+
+
+
+ + ♻ ☆ SparseLGS: Sparse View Language Embedded Gaussian Splatting + + +
+ Recently, several studies have combined Gaussian Splatting to obtain scene
+representations with language embeddings for open-vocabulary 3D scene
+understanding. While these methods perform well, they essentially require very
+dense multi-view inputs, limiting their applicability in real-world scenarios.
+In this work, we propose SparseLGS to address the challenge of 3D scene
+understanding with pose-free and sparse view input images. Our method leverages
+a learning-based dense stereo model to handle pose-free and sparse inputs, and
+a three-step region matching approach to address the multi-view semantic
+inconsistency problem, which is especially important for sparse inputs.
+Different from directly learning high-dimensional CLIP features, we extract
+low-dimensional information and build bijections to avoid excessive learning
+and storage costs. We introduce a reconstruction loss during semantic training
+to improve Gaussian positions and shapes. To the best of our knowledge, we are
+the first to address the 3D semantic field problem with sparse pose-free
+inputs. Experimental results show that SparseLGS achieves comparable quality
+when reconstructing semantic fields with fewer inputs (3-4 views) compared to
+previous SOTA methods with dense input. Besides, when using the same sparse
+input, SparseLGS leads significantly in quality and heavily improves the
+computation speed (5$\times$ speedup). Project page:
+https://ustc3dv.github.io/SparseLGS

+
+
+ comment: Project Page: https://ustc3dv.github.io/SparseLGS +
+
+
+
+
+ + ♻ ☆ ODE: Open-Set Evaluation of Hallucinations in Multimodal Large Language + Models + + +
+ Hallucination poses a persistent challenge for multimodal large language +models (MLLMs). However, existing benchmarks for evaluating hallucinations are +generally static, which may overlook the potential risk of data contamination. +To address this issue, we propose ODE, an open-set, dynamic protocol designed +to evaluate object hallucinations in MLLMs at both the existence and attribute +levels. ODE employs a graph-based structure to represent real-world object +concepts, their attributes, and the distributional associations between them. +This structure facilitates the extraction of concept combinations based on +diverse distributional criteria, generating varied samples for structured +queries that evaluate hallucinations in both generative and discriminative +tasks. Through the generation of new samples, dynamic concept combinations, and +varied distribution frequencies, ODE mitigates the risk of data contamination +and broadens the scope of evaluation. This protocol is applicable to both +general and specialized scenarios, including those with limited data. +Experimental results demonstrate the effectiveness of our protocol, revealing +that MLLMs exhibit higher hallucination rates when evaluated with ODE-generated +samples, which indicates potential data contamination. Furthermore, these +generated samples aid in analyzing hallucination patterns and fine-tuning +models, offering an effective approach to mitigating hallucinations in MLLMs. + +
+
+
+
+
+ + ♻ ☆ Multi-Sensor Diffusion-Driven Optical Image Translation for Large-Scale + Applications + + +
+ Comparing images captured by disparate sensors is a common challenge in +remote sensing. This requires image translation -- converting imagery from one +sensor domain to another while preserving the original content. Denoising +Diffusion Implicit Models (DDIM) are potential state-of-the-art solutions for +such domain translation due to their proven superiority in multiple +image-to-image translation tasks in computer vision. However, these models +struggle with reproducing radiometric features of large-scale multi-patch +imagery, resulting in inconsistencies across the full image. This renders +downstream tasks like Heterogeneous Change Detection impractical. To overcome +these limitations, we propose a method that leverages denoising diffusion for +effective multi-sensor optical image translation over large areas. Our approach +super-resolves large-scale low spatial resolution images into high-resolution +equivalents from disparate optical sensors, ensuring uniformity across hundreds +of patches. Our contributions lie in new forward and reverse diffusion +processes that address the challenges of large-scale image translation. +Extensive experiments using paired Sentinel-II (10m) and Planet Dove (3m) +images demonstrate that our approach provides precise domain adaptation, +preserving image content while improving radiometric accuracy and feature +representation. A thorough image quality assessment and comparisons with the +standard DDIM framework and five other leading methods are presented. We reach +a mean Learned Perceptual Image Patch Similarity (mLPIPS) of 0.1884 and a +Fr\'echet Inception Distance (FID) of 45.64, expressively outperforming all +compared methods, including DDIM, ShuffleMixer, and SwinIR. The usefulness of +our approach is further demonstrated in two Heterogeneous Change Detection +tasks. + +
+
+ comment: This is the accepted version of the manuscript published in IEEE + Journal of Selected Topics in Applied Earth Observations and Remote Sensing + (JSTARS). Please access the final version at IEEEXplore (Open Access). DOI + 10.1109/JSTARS.2024.3506032. This technology is protected by a patent filed + on 23 december 2023 at Office Luxembourgeois de la propri\'et\'e + intellectuelle (LU505861) +
+
+
+
+
+ + ♻ ☆ Breaking the Frame: Visual Place Recognition by Overlap Prediction WACV 2025 + + +
+ Visual place recognition methods struggle with occlusions and partial visual
+overlaps. We propose a novel visual place recognition approach based on overlap
+prediction, called VOP, shifting from traditional reliance on global image
+similarities and local features to image overlap prediction. VOP processes
+co-visible image sections by obtaining patch-level embeddings using a Vision
+Transformer backbone and establishing patch-to-patch correspondences without
+requiring expensive feature detection and matching. Our approach uses a voting
+mechanism to assess overlap scores for potential database images. It provides a
+nuanced image retrieval metric in challenging scenarios. Experimental results
+show that VOP leads to more accurate relative pose estimation and localization
+results on the retrieved image pairs than state-of-the-art baselines on a
+number of large-scale, real-world indoor and outdoor benchmarks. The code is
+available at https://github.com/weitong8591/vop.git.

+
+
+ comment: WACV 2025 +
+
+
+
+
+ + ♻ ☆ The Cooperative Network Architecture: Learning Structured Networks as + Representation of Sensory Patterns + + +
+ Nets, cooperative networks of neurons, have been proposed as format for the +representation of sensory signals, as physical implementation of the Gestalt +phenomenon and as solution to the neural binding problem, while the direct +interaction between nets by structure-sensitive matching has been proposed as +basis for object-global operations such as object detection. The nets are +flexibly composed of overlapping net fragments, which are learned from +statistical regularities of sensory input. We here present the cooperative +network architecture (CNA), a concrete model that learns such net structure to +represent input patterns and deals robustly with noise, deformation, and +out-of-distribution data, thus laying the groundwork for a novel neural +architecture. + +
+
+
+
+
+ + ♻ ☆ Local Lesion Generation is Effective for Capsule Endoscopy Image Data + Augmentation in a Limited Data Setting + + +
+ Limited medical imaging datasets challenge deep learning models by increasing +risks of overfitting and reduced generalization, particularly in Generative +Adversarial Networks (GANs), where discriminators may overfit, leading to +training divergence. This constraint also impairs classification models trained +on small datasets. Generative Data Augmentation (GDA) addresses this by +expanding training datasets with synthetic data, although it requires training +a generative model. We propose and evaluate two local lesion generation +approaches to address the challenge of augmenting small medical image datasets. +The first approach employs the Poisson Image Editing algorithm, a classical +image processing technique, to create realistic image composites that +outperform current state-of-the-art methods. The second approach introduces a +novel generative method, leveraging a fine-tuned Image Inpainting GAN to +synthesize realistic lesions within specified regions of real training images. +A comprehensive comparison of the two proposed methods demonstrates that +effective local lesion generation in a data-constrained setting allows for +reaching new state-of-the-art results in capsule endoscopy lesion +classification. Combination of our techniques achieves a macro F1-score of +33.07%, surpassing the previous best result by 7.84 percentage points (p.p.) on +the highly imbalanced Kvasir Capsule Dataset, a benchmark for capsule +endoscopy. To the best of our knowledge, this work is the first to apply a +fine-tuned Image Inpainting GAN for GDA in medical imaging, demonstrating that +an image-conditional GAN can be adapted effectively to limited datasets to +generate high-quality examples, facilitating effective data augmentation. +Additionally, we show that combining this GAN-based approach with classical +image processing techniques further improves the results. + +
+
+ comment: 54 pages, 35 figures +
+
+
+
+
+ + ♻ ☆ Leveraging LLMs for On-the-Fly Instruction Guided Image Editing + + +
+ The combination of language processing and image processing keeps attracting +increased interest given recent impressive advances that leverage the combined +strengths of both domains of research. Among these advances, the task of +editing an image on the basis solely of a natural language instruction stands +out as a most challenging endeavour. While recent approaches for this task +resort, in one way or other, to some form of preliminary preparation, training +or fine-tuning, this paper explores a novel approach: We propose a +preparation-free method that permits instruction-guided image editing on the +fly. This approach is organized along three steps properly orchestrated that +resort to image captioning and DDIM inversion, followed by obtaining the edit +direction embedding, followed by image editing proper. While dispensing with +preliminary preparation, our approach demonstrates to be effective and +competitive, outperforming recent, state of the art models for this task when +evaluated on the MAGICBRUSH dataset. + +
+
+
+
+
+ + ♻ ☆ STRIDE: Single-video based Temporally Continuous Occlusion-Robust 3D + Pose Estimation WACV + + +
+ The capability to accurately estimate 3D human poses is crucial for diverse +fields such as action recognition, gait recognition, and virtual/augmented +reality. However, a persistent and significant challenge within this field is +the accurate prediction of human poses under conditions of severe occlusion. +Traditional image-based estimators struggle with heavy occlusions due to a lack +of temporal context, resulting in inconsistent predictions. While video-based +models benefit from processing temporal data, they encounter limitations when +faced with prolonged occlusions that extend over multiple frames. This +challenge arises because these models struggle to generalize beyond their +training datasets, and the variety of occlusions is hard to capture in the +training data. Addressing these challenges, we propose STRIDE (Single-video +based TempoRally contInuous Occlusion-Robust 3D Pose Estimation), a novel +Test-Time Training (TTT) approach to fit a human motion prior for each video. +This approach specifically handles occlusions that were not encountered during +the model's training. By employing STRIDE, we can refine a sequence of noisy +initial pose estimates into accurate, temporally coherent poses during test +time, effectively overcoming the limitations of prior methods. Our framework +demonstrates flexibility by being model-agnostic, allowing us to use any +off-the-shelf 3D pose estimation method for improving robustness and temporal +consistency. We validate STRIDE's efficacy through comprehensive experiments on +challenging datasets like Occluded Human3.6M, Human3.6M, and OCMotion, where it +not only outperforms existing single-image and video-based pose estimation +models but also showcases superior handling of substantial occlusions, +achieving fast, robust, accurate, and temporally consistent 3D pose estimates. +Code is made publicly available at https://github.com/take2rohit/stride + +
+
+ comment: Paper accepted at IEEE/CVF Winter Conference on Applications of + Computer Vision (WACV)-2025 +
+
+
+
+
+ + ♻ ☆ EgoPressure: A Dataset for Hand Pressure and Pose Estimation in + Egocentric Vision + + +
+ Touch contact and pressure are essential for understanding how humans
+interact with and manipulate objects, insights which can significantly benefit
+applications in mixed reality and robotics. However, estimating these
+interactions from an egocentric camera perspective is challenging, largely due
+to the lack of comprehensive datasets that provide both accurate hand poses on
+contacting surfaces and detailed annotations of pressure information. In this
+paper, we introduce EgoPressure, a novel egocentric dataset that captures
+detailed touch contact and pressure interactions. EgoPressure provides
+high-resolution pressure intensity annotations for each contact point and
+includes accurate hand pose meshes obtained through our proposed multi-view,
+sequence-based optimization method processing data from an 8-camera capture
+rig. Our dataset comprises 5 hours of recorded interactions from 21
+participants captured simultaneously by one head-mounted and seven stationary
+Kinect cameras, which acquire RGB images and depth maps at 30 Hz. To support
+future research and benchmarking, we present several baseline models for
+estimating applied pressure on external surfaces from RGB images, with and
+without hand pose information. We further explore the joint estimation of the
+hand mesh and applied pressure. Our experiments demonstrate that pressure and
+hand pose are complementary for understanding hand-object interactions,
+advancing the modeling of hand-object interactions in AR/VR and robotics
+research. Project page:
+\url{https://yiming-zhao.github.io/EgoPressure/}.

+
+
+
+
+
+ + ♻ ☆ One Step Learning, One Step Review AAAI + + +
+ Visual fine-tuning has garnered significant attention with the rise of +pre-trained vision models. The current prevailing method, full fine-tuning, +suffers from the issue of knowledge forgetting as it focuses solely on fitting +the downstream training set. In this paper, we propose a novel weight +rollback-based fine-tuning method called OLOR (One step Learning, One step +Review). OLOR combines fine-tuning with optimizers, incorporating a weight +rollback term into the weight update term at each step. This ensures +consistency in the weight range of upstream and downstream models, effectively +mitigating knowledge forgetting and enhancing fine-tuning performance. In +addition, a layer-wise penalty is presented to employ penalty decay and the +diversified decay rate to adjust the weight rollback levels of layers for +adapting varying downstream tasks. Through extensive experiments on various +tasks such as image classification, object detection, semantic segmentation, +and instance segmentation, we demonstrate the general applicability and +state-of-the-art performance of our proposed OLOR. Code is available at +https://github.com/rainbow-xiao/OLOR-AAAI-2024. + +
+
+ comment: Published at the 38th AAAI Conference on Artificial Intelligence + (AAAI 2024) +
+
+
+
+
+ + ♻ ☆ Knowledge Mechanisms in Large Language Models: A Survey and Perspective EMNLP 2024 + + +
+ Understanding knowledge mechanisms in Large Language Models (LLMs) is crucial +for advancing towards trustworthy AGI. This paper reviews knowledge mechanism +analysis from a novel taxonomy including knowledge utilization and evolution. +Knowledge utilization delves into the mechanism of memorization, comprehension +and application, and creation. Knowledge evolution focuses on the dynamic +progression of knowledge within individual and group LLMs. Moreover, we discuss +what knowledge LLMs have learned, the reasons for the fragility of parametric +knowledge, and the potential dark knowledge (hypothesis) that will be +challenging to address. We hope this work can help understand knowledge in LLMs +and provide insights for future research. + +
+
+ comment: EMNLP 2024 Findings; 39 pages (v4) +
+
+
+
+
+ + ♻ ☆ Enhancing Perception Capabilities of Multimodal LLMs with Training-Free + Fusion + + +
+ Multimodal LLMs (MLLMs) equip language models with visual capabilities by +aligning vision encoders with language models. Existing methods to enhance the +visual perception of MLLMs often involve designing more powerful vision +encoders, which requires exploring a vast design space and re-aligning each +potential encoder with the language model, resulting in prohibitively high +training costs. In this paper, we introduce VisionFuse, a novel integration +framework that efficiently utilizes multiple vision encoders from off-the-shelf +MLLMs to enhance visual perception without requiring additional training. Our +approach is motivated by the observation that different MLLMs tend to focus on +distinct regions given the same query and image. Moreover, we find that the +feature distributions of vision encoders within an MLLM family, a group of +MLLMs sharing the same pretrained LLM, are highly aligned. Building on these +insights, VisionFuse enriches the visual context by concatenating the tokens +generated by the vision encoders of selected MLLMs within a family. By merging +the parameters of language models from these MLLMs, VisionFuse allows a single +language model to align with various vision encoders, significantly reducing +deployment overhead. We conduct comprehensive evaluations across multiple +multimodal benchmarks using various MLLM combinations, demonstrating +substantial improvements in multimodal tasks. Notably, when integrating +MiniGemini-8B and SLIME-8B, VisionFuse achieves an average performance increase +of over 4%. + +
+
+
+
+
+ + ♻ ☆ Learning Developmental Age from 3D Infant Kinetics Using Adaptive Graph + Neural Networks + + +
+ Reliable methods for the neurodevelopmental assessment of infants are +essential for early detection of problems that may need prompt interventions. +Spontaneous motor activity, or 'kinetics', is shown to provide a powerful +surrogate measure of upcoming neurodevelopment. However, its assessment is by +and large qualitative and subjective, focusing on visually identified, +age-specific gestures. In this work, we introduce Kinetic Age (KA), a novel +data-driven metric that quantifies neurodevelopmental maturity by predicting an +infant's age based on their movement patterns. KA offers an interpretable and +generalizable proxy for motor development. Our method leverages 3D video +recordings of infants, processed with pose estimation to extract +spatio-temporal series of anatomical landmarks, which are released as a new +openly available dataset. These data are modeled using adaptive graph +convolutional networks, able to capture the spatio-temporal dependencies in +infant movements. We also show that our data-driven approach achieves +improvement over traditional machine learning baselines based on manually +engineered features. + +
+
+ comment: 15 pages, 9 figures. Code repository available via + https://github.com/deinal/infant-aagcn +
+
+
+
+
+ + ♻ ☆ FLOAT: Generative Motion Latent Flow Matching for Audio-driven Talking + Portrait + + +
+ With the rapid advancement of diffusion-based generative models, portrait +image animation has achieved remarkable results. However, it still faces +challenges in temporally consistent video generation and fast sampling due to +its iterative sampling nature. This paper presents FLOAT, an audio-driven +talking portrait video generation method based on flow matching generative +model. We shift the generative modeling from the pixel-based latent space to a +learned motion latent space, enabling efficient design of temporally consistent +motion. To achieve this, we introduce a transformer-based vector field +predictor with a simple yet effective frame-wise conditioning mechanism. +Additionally, our method supports speech-driven emotion enhancement, enabling a +natural incorporation of expressive motions. Extensive experiments demonstrate +that our method outperforms state-of-the-art audio-driven talking portrait +methods in terms of visual quality, motion fidelity, and efficiency. + +
+
+ comment: Project page: https://deepbrainai-research.github.io/float/ +
+
+
+
+
+ + ♻ ☆ Once-for-All: Controllable Generative Image Compression with Dynamic + Granularity Adaption + + +
+ Although recent generative image compression methods have demonstrated +impressive potential in optimizing the rate-distortion-perception trade-off, +they still face the critical challenge of flexible rate adaption to diverse +compression necessities and scenarios. To overcome this challenge, this paper +proposes a Controllable Generative Image Compression framework, termed +Control-GIC, the first capable of fine-grained bitrate adaption across a broad +spectrum while ensuring high-fidelity and generality compression. Control-GIC +is grounded in a VQGAN framework that encodes an image as a sequence of +variable-length codes (i.e. VQ-indices), which can be losslessly compressed and +exhibits a direct positive correlation with the bitrates. Drawing inspiration +from the classical coding principle, we correlate the information density of +local image patches with their granular representations. Hence, we can flexibly +determine a proper allocation of granularity for the patches to achieve dynamic +adjustment for VQ-indices, resulting in desirable compression rates. We further +develop a probabilistic conditional decoder capable of retrieving historic +encoded multi-granularity representations according to transmitted codes, and +then reconstruct hierarchical granular features in the formalization of +conditional probability, enabling more informative aggregation to improve +reconstruction realism. Our experiments show that Control-GIC allows highly +flexible and controllable bitrate adaption where the results demonstrate its +superior performance over recent state-of-the-art methods. + +
+
+
+
+
+ + ♻ ☆ A Distractor-Aware Memory for Visual Object Tracking with SAM2 + + +
+ Memory-based trackers are video object segmentation methods that form the +target model by concatenating recently tracked frames into a memory buffer and +localize the target by attending the current image to the buffered frames. +While already achieving top performance on many benchmarks, it was the recent +release of SAM2 that placed memory-based trackers into focus of the visual +object tracking community. Nevertheless, modern trackers still struggle in the +presence of distractors. We argue that a more sophisticated memory model is +required, and propose a new distractor-aware memory model for SAM2 and an +introspection-based update strategy that jointly addresses the segmentation +accuracy as well as tracking robustness. The resulting tracker is denoted as +SAM2.1++. We also propose a new distractor-distilled DiDi dataset to study the +distractor problem better. SAM2.1++ outperforms SAM2.1 and related SAM memory +extensions on seven benchmarks and sets a solid new state-of-the-art on six of +them. + +
+
+ comment: Under review. Code available on Github: + https://github.com/jovanavidenovic/DAM4SAM +
+
+
+
+
+ + ♻ ☆ Data-Efficient 3D Visual Grounding via Order-Aware Referring WACV 2025 + + +
+ 3D visual grounding aims to identify the target object within a 3D point +cloud scene referred to by a natural language description. Previous works +usually require significant data relating to point color and their descriptions +to exploit the corresponding complicated verbo-visual relations. In our work, +we introduce Vigor, a novel Data-Efficient 3D Visual Grounding framework via +Order-aware Referring. Vigor leverages LLM to produce a desirable referential +order from the input description for 3D visual grounding. With the proposed +stacked object-referring blocks, the predicted anchor objects in the above +order allow one to locate the target object progressively without supervision +on the identities of anchor objects or exact relations between anchor/target +objects. In addition, we present an order-aware warm-up training strategy, +which augments referential orders for pre-training the visual grounding +framework. This allows us to better capture the complex verbo-visual relations +and benefit the desirable data-efficient learning scheme. Experimental results +on the NR3D and ScanRefer datasets demonstrate our superiority in low-resource +scenarios. In particular, Vigor surpasses current state-of-the-art frameworks +by 9.3% and 7.6% grounding accuracy under 1% data and 10% data settings on the +NR3D dataset, respectively. + +
+
+ comment: accepted to WACV 2025 +
+
+
+
+
+ + ♻ ☆ Learning Trimaps via Clicks for Image Matting + + +
+ Despite significant advancements in image matting, existing models heavily +depend on manually-drawn trimaps for accurate results in natural image +scenarios. However, the process of obtaining trimaps is time-consuming, lacking +user-friendliness and device compatibility. This reliance greatly limits the +practical application of all trimap-based matting methods. To address this +issue, we introduce Click2Trimap, an interactive model capable of predicting +high-quality trimaps and alpha mattes with minimal user click inputs. Through +analyzing real users' behavioral logic and characteristics of trimaps, we +successfully propose a powerful iterative three-class training strategy and a +dedicated simulation function, making Click2Trimap exhibit versatility across +various scenarios. Quantitative and qualitative assessments on synthetic and +real-world matting datasets demonstrate Click2Trimap's superior performance +compared to all existing trimap-free matting methods. Especially, in the user +study, Click2Trimap achieves high-quality trimap and matting predictions in +just an average of 5 seconds per image, demonstrating its substantial practical +value in real-world applications. + +
+
+
+
+
+ + ♻ ☆ TDDSR: Single-Step Diffusion with Two Discriminators for Super + Resolution + + +
+ Super-resolution methods are increasingly becoming popular for both +real-world and face-specific tasks. Many existing approaches, however, rely on +simplistic degradation models, which limits their ability to handle complex and +unknown degradation patterns effectively. While diffusion-based +super-resolution techniques have recently shown impressive results, they are +still constrained by the need for numerous inference steps. To address this, we +propose TDDSR, an efficient single-step diffusion-based super-resolution +method. Our method, distilled from a pre-trained teacher model and based on a +diffusion network, performs super-resolution in a single step. It integrates a +learnable diffusion-based downsampler to capture diverse degradation patterns +and employs two discriminators, one for high-resolution and one for +low-resolution images, to enhance the overall performance. Experimental results +demonstrate its effectiveness across real-world and face-specific SR tasks, +achieving performance beyond other state-of-the-art models and comparable to +previous diffusion methods with multiple sampling steps. + +
+
+
+
+
+ + ♻ ☆ DCVSMNet: Double Cost Volume Stereo Matching Network + + +
+ We introduce Double Cost Volume Stereo Matching Network (DCVSMNet), which is a
+novel architecture characterised by two small upper (group-wise) and lower
+(norm correlation) cost volumes. Each cost volume is processed separately, and
+a coupling module is proposed to fuse the geometry information extracted from
+the upper and lower cost volumes. DCVSMNet is a fast stereo matching network
+with a 67 ms inference time and strong generalization ability which can produce
+competitive results compared to state-of-the-art methods. The results on
+several benchmark datasets show that DCVSMNet achieves better accuracy than
+methods such as CGI-Stereo and BGNet at the cost of greater inference time.
+
+
+
+
+
+
+ + ♻ ☆ Agri-LLaVA: Knowledge-Infused Large Multimodal Assistant on Agricultural + Pests and Diseases + + +
+ In the general domain, large multimodal models (LMMs) have achieved +significant advancements, yet challenges persist in applying them to specific +fields, especially agriculture. As the backbone of the global economy, +agriculture confronts numerous challenges, with pests and diseases being +particularly concerning due to their complexity, variability, rapid spread, and +high resistance. This paper specifically addresses these issues. We construct +the first multimodal instruction-following dataset in the agricultural domain, +covering over 221 types of pests and diseases with approximately 400,000 data +entries. This dataset aims to explore and address the unique challenges in pest +and disease control. Based on this dataset, we propose a knowledge-infused +training method to develop Agri-LLaVA, an agricultural multimodal conversation +system. To accelerate progress in this field and inspire more researchers to +engage, we design a diverse and challenging evaluation benchmark for +agricultural pests and diseases. Experimental results demonstrate that +Agri-LLaVA excels in agricultural multimodal conversation and visual +understanding, providing new insights and approaches to address agricultural +pests and diseases. By open-sourcing our dataset and model, we aim to promote +research and development in LMMs within the agricultural domain and make +significant contributions to tackle the challenges of agricultural pests and +diseases. All resources can be found at https://github.com/Kki2Eve/Agri-LLaVA. + +
+
+
+
+
+ + ♻ ☆ YOLO based Ocean Eddy Localization with AWS SageMaker + + +
+ Ocean eddies play a significant role both on the sea surface and beneath it,
+contributing to the sustainability of marine life dependent on oceanic
+behaviors. Therefore, it is crucial to investigate ocean eddies to monitor
+changes in the Earth, particularly in the oceans, and their impact on climate.
+This study aims to pinpoint ocean eddies using AWS cloud services, specifically
+SageMaker. The primary objective is to detect small-scale (<20km) ocean eddies
+from satellite remote images and assess the feasibility of utilizing SageMaker,
+which offers tools for deploying AI applications. Moreover, this research not
+only explores the deployment of cloud-based services for remote sensing of
+Earth data but also evaluates several YOLO (You Only Look Once) models using
+single and multi-GPU-based services in the cloud. Furthermore, this study
+underscores the potential of these services, their limitations, challenges
+related to deployment and resource management, and their user-friendliness for
+Earth science projects.
+
+
+
+ comment: 9 pages +
+
+
+
+
+ + ♻ ☆ SNOOPI: Supercharged One-step Diffusion Distillation with Proper + Guidance + + +
+ Recent approaches have yielded promising results in distilling multi-step +text-to-image diffusion models into one-step ones. The state-of-the-art +efficient distillation technique, i.e., SwiftBrushv2 (SBv2), even surpasses the +teacher model's performance with limited resources. However, our study reveals +its instability when handling different diffusion model backbones due to using +a fixed guidance scale within the Variational Score Distillation (VSD) loss. +Another weakness of the existing one-step diffusion models is the missing +support for negative prompt guidance, which is crucial in practical image +generation. This paper presents SNOOPI, a novel framework designed to address +these limitations by enhancing the guidance in one-step diffusion models during +both training and inference. First, we effectively enhance training stability +through Proper Guidance-SwiftBrush (PG-SB), which employs a random-scale +classifier-free guidance approach. By varying the guidance scale of both +teacher models, we broaden their output distributions, resulting in a more +robust VSD loss that enables SB to perform effectively across diverse backbones +while maintaining competitive performance. Second, we propose a training-free +method called Negative-Away Steer Attention (NASA), which integrates negative +prompts into one-step diffusion models via cross-attention to suppress +undesired elements in generated images. Our experimental results show that our +proposed methods significantly improve baseline models across various metrics. +Remarkably, we achieve an HPSv2 score of 31.08, setting a new state-of-the-art +benchmark for one-step diffusion models. + +
+
+ comment: 18 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ DragText: Rethinking Text Embedding in Point-based Image Editing WACV 2025 + + +
+ Point-based image editing enables accurate and flexible control through +content dragging. However, the role of text embedding during the editing +process has not been thoroughly investigated. A significant aspect that remains +unexplored is the interaction between text and image embeddings. During the +progressive editing in a diffusion model, the text embedding remains constant. +As the image embedding increasingly diverges from its initial state, the +discrepancy between the image and text embeddings presents a significant +challenge. In this study, we found that the text prompt significantly +influences the dragging process, particularly in maintaining content integrity +and achieving the desired manipulation. Upon these insights, we propose +DragText, which optimizes text embedding in conjunction with the dragging +process to pair with the modified image embedding. Simultaneously, we +regularize the text optimization process to preserve the integrity of the +original text prompt. Our approach can be seamlessly integrated with existing +diffusion-based drag methods, enhancing performance with only a few lines of +code. + +
+
+ comment: Accepted at WACV 2025; Code is released at + https://github.com/MICV-yonsei/DragText +
+
+
+
+
+ + ♻ ☆ Boosting Weakly-Supervised Referring Image Segmentation via Progressive + Comprehension NeurIPS2024 + + +
+ This paper explores the weakly-supervised referring image segmentation (WRIS)
+problem, and focuses on a challenging setup where target localization is
+learned directly from image-text pairs. We note that the input text description
+typically already contains detailed information on how to localize the target
+object, and we also observe that humans often follow a step-by-step
+comprehension process (i.e., progressively utilizing target-related attributes
+and relations as cues) to identify the target object. Hence, we propose a novel
+Progressive Comprehension Network (PCNet) to leverage target-related textual
+cues from the input description for progressively localizing the target object.
+Specifically, we first use a Large Language Model (LLM) to decompose the input
+text description into short phrases. These short phrases are taken as
+target-related cues and fed into a Conditional Referring Module (CRM) in
+multiple stages, to allow updating the referring text embedding and enhance the
+response map for target localization in a multi-stage manner. Based on the CRM,
+we then propose a Region-aware Shrinking (RaS) loss to constrain the visual
+localization to be conducted progressively in a coarse-to-fine manner across
+different stages. Finally, we introduce an Instance-aware Disambiguation (IaD)
+loss to suppress instance localization ambiguity by differentiating overlapping
+response maps generated by different referring texts on the same image.
+Extensive experiments show that our method outperforms SOTA methods on three
+common benchmarks.
+
+
+
+ comment: Accepted to NeurIPS2024 +
+
+
+
+
+ + ♻ ☆ IMWA: Iterative Model Weight Averaging Benefits Class-Imbalanced + Learning Tasks + + +
+ Model Weight Averaging (MWA) is a technique that seeks to enhance model's +performance by averaging the weights of multiple trained models. This paper +first empirically finds that 1) the vanilla MWA can benefit the +class-imbalanced learning, and 2) performing model averaging in the early +epochs of training yields a greater performance improvement than doing that in +later epochs. Inspired by these two observations, in this paper we propose a +novel MWA technique for class-imbalanced learning tasks named Iterative Model +Weight Averaging (IMWA). Specifically, IMWA divides the entire training stage +into multiple episodes. Within each episode, multiple models are concurrently +trained from the same initialized model weight, and subsequently averaged into +a singular model. Then, the weight of this average model serves as a fresh +initialization for the ensuing episode, thus establishing an iterative learning +paradigm. Compared to vanilla MWA, IMWA achieves higher performance +improvements with the same computational cost. Moreover, IMWA can further +enhance the performance of those methods employing EMA strategy, demonstrating +that IMWA and EMA can complement each other. Extensive experiments on various +class-imbalanced learning tasks, i.e., class-imbalanced image classification, +semi-supervised class-imbalanced image classification and semi-supervised +object detection tasks showcase the effectiveness of our IMWA. + +
+
+
+
+
+ + ♻ ☆ Learning Prompt with Distribution-Based Feature Replay for Few-Shot + Class-Incremental Learning + + +
+ Few-shot Class-Incremental Learning (FSCIL) aims to continuously learn new
+classes based on very limited training data without forgetting the old ones
+encountered. Existing studies solely relied on pure visual networks, while in
+this paper we solved FSCIL by leveraging the Vision-Language model (e.g., CLIP)
+and propose a simple yet effective framework, named Learning Prompt with
+Distribution-based Feature Replay (LP-DiF). We observe that simply using CLIP
+for zero-shot evaluation can substantially outperform the most influential
+methods. Then, a prompt tuning technique is involved to further improve its
+adaptation ability, allowing the model to continually capture specific
+knowledge from each session. To prevent the learnable prompt from forgetting
+old knowledge in the new session, we propose a pseudo-feature replay approach.
+Specifically, we preserve the old knowledge of each class by maintaining a
+feature-level Gaussian distribution with a diagonal covariance matrix, which is
+estimated by the image features of training images and synthesized features
+generated from a VAE. When progressing to a new session, pseudo-features are
+sampled from old-class distributions combined with training images of the
+current session to optimize the prompt, thus enabling the model to learn new
+knowledge while retaining old knowledge. Experiments on three prevalent
+benchmarks, i.e., CIFAR100, mini-ImageNet, CUB-200, and two more challenging
+benchmarks, i.e., SUN-397 and CUB-200* proposed in this paper showcase the
+superiority of LP-DiF, achieving new state-of-the-art (SOTA) in FSCIL. Code is
+publicly available at https://github.com/1170300714/LP-DiF.
+
+
+
+
+
+
+ + ♻ ☆ A Point-Neighborhood Learning Framework for Nasal Endoscope Image + Segmentation + + +
+ Lesion segmentation on nasal endoscopic images is challenging due to its +complex lesion features. Fully-supervised deep learning methods achieve +promising performance with pixel-level annotations but impose a significant +annotation burden on experts. Although weakly supervised or semi-supervised +methods can reduce the labelling burden, their performance is still limited. +Some weakly semi-supervised methods employ a novel annotation strategy that +labels weak single-point annotations for the entire training set while +providing pixel-level annotations for a small subset of the data. However, the +relevant weakly semi-supervised methods only mine the limited information of +the point itself, while ignoring its label property and surrounding reliable +information. This paper proposes a simple yet efficient weakly semi-supervised +method called the Point-Neighborhood Learning (PNL) framework. PNL incorporates +the surrounding area of the point, referred to as the point-neighborhood, into +the learning process. In PNL, we propose a point-neighborhood supervision loss +and a pseudo-label scoring mechanism to explicitly guide the model's training. +Meanwhile, we proposed a more reliable data augmentation scheme. The proposed +method significantly improves performance without increasing the parameters of +the segmentation neural network. Extensive experiments on the NPC-LES dataset +demonstrate that PNL outperforms existing methods by a significant margin. +Additional validation on colonoscopic polyp segmentation datasets confirms the +generalizability of the proposed PNL. + +
+
+ comment: 10 pages, 10 figures, +
+
+
+
+
+ + ♻ ☆ Preserve or Modify? Context-Aware Evaluation for Balancing Preservation + and Modification in Text-Guided Image Editing + + +
+ The development of vision-language and generative models has significantly
+advanced text-guided image editing, which seeks the preservation of
+core elements in the source image while implementing modifications
+based on the target text. However, existing metrics have a
+context-blindness problem, indiscriminately applying the same
+evaluation criteria on completely different pairs of source image and target
+text, biasing towards either modification or preservation. Directional CLIP
+similarity, the only metric that considers both source image and target text,
+is also biased towards modification aspects and attends to irrelevant editing
+regions of the image. We propose AugCLIP, a context-aware
+metric that adaptively coordinates preservation and modification aspects,
+depending on the specific context of a given source image and target text. This
+is done by deriving the CLIP representation of an ideally edited image, that
+preserves the source image with necessary modifications to align with target
+text. More specifically, using a multi-modal large language model,
+AugCLIP augments the textual descriptions of the source and target,
+then calculates a modification vector through a hyperplane that separates
+source and target attributes in CLIP space. Extensive experiments on five
+benchmark datasets, encompassing a diverse range of editing scenarios, show
+that AugCLIP aligns remarkably well with human evaluation standards,
+outperforming existing metrics. The code will be open-sourced for community
+use.
+
+
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ SALVE: A 3D Reconstruction Benchmark of Wounds from Consumer-grade + Videos + + +
+ Managing chronic wounds is a global challenge that can be alleviated by the +adoption of automatic systems for clinical wound assessment from consumer-grade +videos. While 2D image analysis approaches are insufficient for handling the 3D +features of wounds, existing approaches utilizing 3D reconstruction methods +have not been thoroughly evaluated. To address this gap, this paper presents a +comprehensive study on 3D wound reconstruction from consumer-grade videos. +Specifically, we introduce the SALVE dataset, comprising video recordings of +realistic wound phantoms captured with different cameras. Using this dataset, +we assess the accuracy and precision of state-of-the-art methods for 3D +reconstruction, ranging from traditional photogrammetry pipelines to advanced +neural rendering approaches. In our experiments, we observe that photogrammetry +approaches do not provide smooth surfaces suitable for precise clinical +measurements of wounds. Neural rendering approaches show promise in addressing +this issue, advancing the use of this technology in wound care practices. + +
+
+
+
+
+ + ♻ ☆ Towards Universal Soccer Video Understanding + + +
+ As a globally celebrated sport, soccer has attracted widespread interest from +fans all over the world. This paper aims to develop a comprehensive multi-modal +framework for soccer video understanding. Specifically, we make the following +contributions in this paper: (i) we introduce SoccerReplay-1988, the largest +multi-modal soccer dataset to date, featuring videos and detailed annotations +from 1,988 complete matches, with an automated annotation pipeline; (ii) we +present the first visual-language foundation model in the soccer domain, +MatchVision, which leverages spatiotemporal information across soccer videos +and excels in various downstream tasks; (iii) we conduct extensive experiments +and ablation studies on event classification, commentary generation, and +multi-view foul recognition. MatchVision demonstrates state-of-the-art +performance on all of them, substantially outperforming existing models, which +highlights the superiority of our proposed data and model. We believe that this +work will offer a standard paradigm for sports understanding research. + +
+
+ comment: Technical Report; Project Page: https://jyrao.github.io/UniSoccer/ +
+
+
+
+
+ + ♻ ☆ Insomnia Identification via Electroencephalography + + +
+ Insomnia is a serious sleep disorder caused by abnormal or excessive neural
+activity in the brain. An estimated 50 million people worldwide are thought to
+be affected by this condition, which is the second most severe neurological
+disease after stroke. In order to ensure a quick recovery, an early and
+accurate diagnosis of insomnia enables more effective drug and treatment
+administration. This study proposes a method that uses deep learning to
+automatically identify patients with insomnia. A set of optimal features are
+extracted from spectral and temporal domains, including the relative power of
+the σ, β and γ bands, the total power, the absolute slow wave
+power, the power ratios of θ, α, γ, β,
+θ/α, θ/β, α/γ and α/β,
+mean, zero crossing rate, mobility, complexity, sleep efficiency and total
+sleep time, to accurately quantify the differences between insomnia patients
+and healthy subjects and develops a 1D CNN model for the classification
+process. In experiments using the Fp2 and C4 EEG channels with 50 insomnia
+patients and 50 healthy subjects, the proposed model achieves 99.34% accuracy
+without sleep stage annotation. Using the features only from a single channel,
+the study proposes a smart solution for insomnia patients which allows machine
+learning to simplify current sleep monitoring hardware and improve
+in-home ambulatory monitoring.
+
+
+
+ comment: This submission was made without all co-authors consent +
+
+
+
+
+ + ♻ ☆ Phased Consistency Models NeurIPS 2024 + + +
+ Consistency Models (CMs) have made significant progress in accelerating the +generation of diffusion models. However, their application to high-resolution, +text-conditioned image generation in the latent space remains unsatisfactory. +In this paper, we identify three key flaws in the current design of Latent +Consistency Models (LCMs). We investigate the reasons behind these limitations +and propose Phased Consistency Models (PCMs), which generalize the design space +and address the identified limitations. Our evaluations demonstrate that PCMs +outperform LCMs across 1--16 step generation settings. While PCMs are +specifically designed for multi-step refinement, they achieve comparable 1-step +generation results to previously state-of-the-art specifically designed 1-step +methods. Furthermore, we show the methodology of PCMs is versatile and +applicable to video generation, enabling us to train the state-of-the-art +few-step text-to-video generator. Our code is available at +https://github.com/G-U-N/Phased-Consistency-Model. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ 4DGen: Grounded 4D Content Generation with Spatial-temporal Consistency + + +
+ Aided by text-to-image and text-to-video diffusion models, existing 4D +content creation pipelines utilize score distillation sampling to optimize the +entire dynamic 3D scene. However, as these pipelines generate 4D content from +text or image inputs directly, they are constrained by limited motion +capabilities and depend on unreliable prompt engineering for desired results. +To address these problems, this work introduces \textbf{4DGen}, a novel +framework for grounded 4D content creation. We identify monocular video +sequences as a key component in constructing the 4D content. Our pipeline +facilitates controllable 4D generation, enabling users to specify the motion +via monocular video or adopt image-to-video generations, thus offering superior +control over content creation. Furthermore, we construct our 4D representation +using dynamic 3D Gaussians, which permits efficient, high-resolution +supervision through rendering during training, thereby facilitating +high-quality 4D generation. Additionally, we employ spatial-temporal pseudo +labels on anchor frames, along with seamless consistency priors implemented +through 3D-aware score distillation sampling and smoothness regularizations. +Compared to existing video-to-4D baselines, our approach yields superior +results in faithfully reconstructing input signals and realistically inferring +renderings from novel viewpoints and timesteps. More importantly, compared to +previous image-to-4D and text-to-4D works, 4DGen supports grounded generation, +offering users enhanced control and improved motion generation capabilities, a +feature difficult to achieve with previous methods. Project page: +https://vita-group.github.io/4DGen/ + +
+
+ comment: Project page: https://vita-group.github.io/4DGen/ +
+
+
+
+
+ + ♻ ☆ Diffusion for Natural Image Matting + + +
+ We aim to leverage diffusion to address the challenging image matting task. +However, the presence of high computational overhead and the inconsistency of +noise sampling between the training and inference processes pose significant +obstacles to achieving this goal. In this paper, we present DiffMatte, a +solution designed to effectively overcome these challenges. First, DiffMatte +decouples the decoder from the intricately coupled matting network design, +involving only one lightweight decoder in the iterations of the diffusion +process. With such a strategy, DiffMatte mitigates the growth of computational +overhead as the number of samples increases. Second, we employ a self-aligned +training strategy with uniform time intervals, ensuring a consistent noise +sampling between training and inference across the entire time domain. Our +DiffMatte is designed with flexibility in mind and can seamlessly integrate +into various modern matting architectures. Extensive experimental results +demonstrate that DiffMatte not only reaches the state-of-the-art level on the +Composition-1k test set, surpassing the best methods in the past by 5% and 15% +in the SAD metric and MSE metric respectively, but also show stronger +generalization ability in other benchmarks. + +
+
+
+
+
+ + ♻ ☆ Defective Edge Detection Using Cascaded Ensemble Canny Operator + + +
+ Edge detection has been one of the most difficult challenges in computer +vision because of the difficulty in identifying the borders and edges from the +real-world images including objects of varying kinds and sizes. Methods based +on ensemble learning, which use a combination of backbones and attention +modules, outperformed more conventional approaches, such as Sobel and Canny +edge detection. Nevertheless, these algorithms are still challenged when faced +with complicated scene photos. In addition, the identified edges utilizing the +current methods are not refined and often include incorrect edges. In this +work, we used a Cascaded Ensemble Canny operator to solve these problems and +detect the object edges. The most difficult Fresh and Rotten and Berkeley +datasets are used to test the suggested approach in Python. In terms of +performance metrics and output picture quality, the acquired results outperform +the specified edge detection networks + +
+
+ comment: 2 Pages and 2 Figures +
+
+
+
+
+
+
+
+ + Artificial Intelligence 137 + +
+
+
+ + ☆ Navigation World Models + + +
+ Navigation is a fundamental skill of agents with visual-motor capabilities. +We introduce a Navigation World Model (NWM), a controllable video generation +model that predicts future visual observations based on past observations and +navigation actions. To capture complex environment dynamics, NWM employs a +Conditional Diffusion Transformer (CDiT), trained on a diverse collection of +egocentric videos of both human and robotic agents, and scaled up to 1 billion +parameters. In familiar environments, NWM can plan navigation trajectories by +simulating them and evaluating whether they achieve the desired goal. Unlike +supervised navigation policies with fixed behavior, NWM can dynamically +incorporate constraints during planning. Experiments demonstrate its +effectiveness in planning trajectories from scratch or by ranking trajectories +sampled from an external policy. Furthermore, NWM leverages its learned visual +priors to imagine trajectories in unfamiliar environments from a single input +image, making it a flexible and powerful tool for next-generation navigation +systems. + +
+
+ comment: project page: https://www.amirbar.net/nwm/ +
+
+
+
+
+ + ☆ The Matrix: Infinite-Horizon World Generation with Real-Time Moving + Control + + +
+ We present The Matrix, the first foundational realistic world simulator +capable of generating continuous 720p high-fidelity real-scene video streams +with real-time, responsive control in both first- and third-person +perspectives, enabling immersive exploration of richly dynamic environments. +Trained on limited supervised data from AAA games like Forza Horizon 5 and +Cyberpunk 2077, complemented by large-scale unsupervised footage from +real-world settings like Tokyo streets, The Matrix allows users to traverse +diverse terrains -- deserts, grasslands, water bodies, and urban landscapes -- +in continuous, uncut hour-long sequences. Operating at 16 FPS, the system +supports real-time interactivity and demonstrates zero-shot generalization, +translating virtual game environments to real-world contexts where collecting +continuous movement data is often infeasible. For example, The Matrix can +simulate a BMW X3 driving through an office setting--an environment present in +neither gaming data nor real-world sources. This approach showcases the +potential of AAA game data to advance robust world models, bridging the gap +between simulations and real-world applications in scenarios with limited data. + +
+
+
+
+
+ + ☆ FLAIR: VLM with Fine-grained Language-informed Image Representations + + +
+ CLIP has shown impressive results in aligning images and texts at scale. +However, its ability to capture detailed visual features remains limited +because CLIP matches images and texts at a global level. To address this issue, +we propose FLAIR, Fine-grained Language-informed Image Representations, an +approach that utilizes long and detailed image descriptions to learn localized +image embeddings. By sampling diverse sub-captions that describe fine-grained +details about an image, we train our vision-language model to produce not only +global embeddings but also text-specific image representations. Our model +introduces text-conditioned attention pooling on top of local image tokens to +produce fine-grained image representations that excel at retrieving detailed +image content. We achieve state-of-the-art performance on both, existing +multimodal retrieval benchmarks, as well as, our newly introduced fine-grained +retrieval task which evaluates vision-language models' ability to retrieve +partial image content. Furthermore, our experiments demonstrate the +effectiveness of FLAIR trained on 30M image-text pairs in capturing +fine-grained visual information, including zero-shot semantic segmentation, +outperforming models trained on billions of pairs. Code is available at +https://github.com/ExplainableML/flair . + +
+
+
+
+
+ + ☆ Best-of-N Jailbreaking + + +
+ We introduce Best-of-N (BoN) Jailbreaking, a simple black-box algorithm that +jailbreaks frontier AI systems across modalities. BoN Jailbreaking works by +repeatedly sampling variations of a prompt with a combination of augmentations +- such as random shuffling or capitalization for textual prompts - until a +harmful response is elicited. We find that BoN Jailbreaking achieves high +attack success rates (ASRs) on closed-source language models, such as 89% on +GPT-4o and 78% on Claude 3.5 Sonnet when sampling 10,000 augmented prompts. +Further, it is similarly effective at circumventing state-of-the-art +open-source defenses like circuit breakers. BoN also seamlessly extends to +other modalities: it jailbreaks vision language models (VLMs) such as GPT-4o +and audio language models (ALMs) like Gemini 1.5 Pro, using modality-specific +augmentations. BoN reliably improves when we sample more augmented prompts. +Across all modalities, ASR, as a function of the number of samples (N), +empirically follows power-law-like behavior for many orders of magnitude. BoN +Jailbreaking can also be composed with other black-box algorithms for even more +effective attacks - combining BoN with an optimized prefix attack achieves up +to a 35% increase in ASR. Overall, our work indicates that, despite their +capability, language models are sensitive to seemingly innocuous changes to +inputs, which attackers can exploit across modalities. + +
+
+
+
+
+ + ☆ Perception Tokens Enhance Visual Reasoning in Multimodal Language Models + + +
+ Multimodal language models (MLMs) still face challenges in fundamental visual +perception tasks where specialized models excel. Tasks requiring reasoning +about 3D structures benefit from depth estimation, and reasoning about 2D +object instances benefits from object detection. Yet, MLMs can not produce +intermediate depth or boxes to reason over. Finetuning MLMs on relevant data +doesn't generalize well and outsourcing computation to specialized vision tools +is too compute-intensive and memory-inefficient. To address this, we introduce +Perception Tokens, intrinsic image representations designed to assist reasoning +tasks where language is insufficient. Perception tokens act as auxiliary +reasoning tokens, akin to chain-of-thought prompts in language models. For +example, in a depth-related task, an MLM augmented with perception tokens can +reason by generating a depth map as tokens, enabling it to solve the problem +effectively. We propose AURORA, a training method that augments MLMs with +perception tokens for improved reasoning over visual inputs. AURORA leverages a +VQVAE to transform intermediate image representations, such as depth maps into +a tokenized format and bounding box tokens, which is then used in a multi-task +training framework. AURORA achieves notable improvements across counting +benchmarks: +10.8% on BLINK, +11.3% on CVBench, and +8.3% on SEED-Bench, +outperforming finetuning approaches in generalization across datasets. It also +improves on relative depth: over +6% on BLINK. With perception tokens, AURORA +expands the scope of MLMs beyond language-based reasoning, paving the way for +more effective visual reasoning capabilities. + +
+
+
+
+
+ + ☆ NODE-AdvGAN: Improving the transferability and perceptual similarity of + adversarial examples by dynamic-system-driven adversarial generative model + + +
+ Understanding adversarial examples is crucial for improving the model's +robustness, as they introduce imperceptible perturbations that deceive models. +Effective adversarial examples, therefore, offer the potential to train more +robust models by removing their singularities. We propose NODE-AdvGAN, a novel +approach that treats adversarial generation as a continuous process and employs +a Neural Ordinary Differential Equation (NODE) for simulating the dynamics of +the generator. By mimicking the iterative nature of traditional gradient-based +methods, NODE-AdvGAN generates smoother and more precise perturbations that +preserve high perceptual similarity when added to benign images. We also +propose a new training strategy, NODE-AdvGAN-T, which enhances transferability +in black-box attacks by effectively tuning noise parameters during training. +Experiments demonstrate that NODE-AdvGAN and NODE-AdvGAN-T generate more +effective adversarial examples that achieve higher attack success rates while +preserving better perceptual quality than traditional GAN-based methods. + +
+
+
+
+
+ + ☆ Evaluating Gender Bias Transfer between Pre-trained and Prompt-Adapted + Language Models + + +
+ Large language models (LLMs) are increasingly being adapted to achieve +task-specificity for deployment in real-world decision systems. Several +previous works have investigated the bias transfer hypothesis (BTH) by studying +the effect of the fine-tuning adaptation strategy on model fairness to find +that fairness in pre-trained masked language models have limited effect on the +fairness of models when adapted using fine-tuning. In this work, we expand the +study of BTH to causal models under prompt adaptations, as prompting is an +accessible, and compute-efficient way to deploy models in real-world systems. +In contrast to previous works, we establish that intrinsic biases in +pre-trained Mistral, Falcon and Llama models are strongly correlated (rho >= +0.94) with biases when the same models are zero- and few-shot prompted, using a +pronoun co-reference resolution task. Further, we find that bias transfer +remains strongly correlated even when LLMs are specifically prompted to exhibit +fair or biased behavior (rho >= 0.92), and few-shot length and stereotypical +composition are varied (rho >= 0.97). Our findings highlight the importance of +ensuring fairness in pre-trained LLMs, especially when they are later used to +perform downstream tasks via prompt adaptation. + +
+
+
+
+
+ + ☆ Feed-Forward Bullet-Time Reconstruction of Dynamic Scenes from Monocular + Videos + + +
+ Recent advancements in static feed-forward scene reconstruction have +demonstrated significant progress in high-quality novel view synthesis. +However, these models often struggle with generalizability across diverse +environments and fail to effectively handle dynamic content. We present BTimer +(short for BulletTimer), the first motion-aware feed-forward model for +real-time reconstruction and novel view synthesis of dynamic scenes. Our +approach reconstructs the full scene in a 3D Gaussian Splatting representation +at a given target ('bullet') timestamp by aggregating information from all the +context frames. Such a formulation allows BTimer to gain scalability and +generalization by leveraging both static and dynamic scene datasets. Given a +casual monocular dynamic video, BTimer reconstructs a bullet-time scene within +150ms while reaching state-of-the-art performance on both static and dynamic +scene datasets, even compared with optimization-based approaches. + +
+
+ comment: Project website: + https://research.nvidia.com/labs/toronto-ai/bullet-timer/ +
+
+
+
+
+ + ☆ You're (Not) My Type -- Can LLMs Generate Feedback of Specific Types for + Introductory Programming Tasks? + + +
+ Background: Feedback as one of the most influential factors for learning has +been subject to a great body of research. It plays a key role in the +development of educational technology systems and is traditionally rooted in +deterministic feedback defined by experts and their experience. However, with +the rise of generative AI and especially Large Language Models (LLMs), we +expect feedback as part of learning systems to transform, especially for the +context of programming. In the past, it was challenging to automate feedback +for learners of programming. LLMs may create new possibilities to provide +richer, and more individual feedback than ever before. + Objectives: This paper aims to generate specific types of feedback for +introductory programming tasks using LLMs. We revisit existing feedback +taxonomies to capture the specifics of the generated feedback, such as +randomness, uncertainty, and degrees of variation. + Methods: We iteratively designed prompts for the generation of specific +feedback types (as part of existing feedback taxonomies) in response to +authentic student programs. We then evaluated the generated output and +determined to what extent it reflected certain feedback types. + Results and Conclusion: The present work provides a better understanding of +different feedback dimensions and characteristics. The results have +implications for future feedback research with regard to, for example, feedback +effects and learners' informational needs. It further provides a basis for the +development of new tools and learning systems for novice programmers including +feedback generated by AI. + +
+
+ comment: Accepted at Journal of Computer Assisted Learning (2024) +
+
+
+
+
+ + ☆ KKLIP: Knowledge Distillation Exploiting K-means Clustering for + Language-Image Pre-Training + + +
+ Recently, CLIP has emerged as a valuable model for aligning image and text +information in multi-modal scenarios. However, researchers have observed +limitations in the ability of CLIP's text and image encoders to extract +detailed knowledge from caption-image pairs. In response, this paper introduces +KKLIP, a novel approach designed to enhance the quality of CLIP by +incorporating a new knowledge distillation (KD) method derived from Llama 2. +Our method comprises three objectives: Text Embedding Distillation, Concept +Learning, and Contrastive Learning. Firstly, Text Embedding Distillation +involves training the KKLIP text encoder to emulate the teacher model, Llama 2. +Secondly, Concept Learning assigns a soft concept label to each caption-image +pair through offline k-means clustering of text information from Llama 2, +allowing KKLIP to learn from these soft concept labels. Finally, Contrastive +Learning harmonizes text and image embeddings. Our experimental results +demonstrate that KKLIP enhances the quality of both text and image encoders. + +
+
+
+
+
+ + ☆ A Bidirectional Siamese Recurrent Neural Network for Accurate Gait + Recognition Using Body Landmarks + + +
+ Gait recognition is a significant biometric technique for person +identification, particularly in scenarios where other physiological biometrics +are impractical or ineffective. In this paper, we address the challenges +associated with gait recognition and present a novel approach to improve its +accuracy and reliability. The proposed method leverages advanced techniques, +including sequential gait landmarks obtained through the Mediapipe pose +estimation model, Procrustes analysis for alignment, and a Siamese +biGRU-dualStack Neural Network architecture for capturing temporal +dependencies. Extensive experiments were conducted on large-scale cross-view +datasets to demonstrate the effectiveness of the approach, achieving high +recognition accuracy compared to other models. The model demonstrated +accuracies of 95.7%, 94.44%, 87.71%, and 86.6% on CASIA-B, SZU RGB-D, OU-MVLP, +and Gait3D datasets respectively. The results highlight the potential +applications of the proposed method in various practical domains, indicating +its significant contribution to the field of gait recognition. + +
+
+
+
+
+ + ☆ Flow Matching with General Discrete Paths: A Kinetic-Optimal Perspective + + +
+ The design space of discrete-space diffusion or flow generative models are +significantly less well-understood than their continuous-space counterparts, +with many works focusing only on a simple masked construction. In this work, we +aim to take a holistic approach to the construction of discrete generative +models based on continuous-time Markov chains, and for the first time, allow +the use of arbitrary discrete probability paths, or colloquially, corruption +processes. Through the lens of optimizing the symmetric kinetic energy, we +propose velocity formulas that can be applied to any given probability path, +completely decoupling the probability and velocity, and giving the user the +freedom to specify any desirable probability path based on expert knowledge +specific to the data domain. Furthermore, we find that a special construction +of mixture probability paths optimizes the symmetric kinetic energy for the +discrete case. We empirically validate the usefulness of this new design space +across multiple modalities: text generation, inorganic material generation, and +image generation. We find that we can outperform the mask construction even in +text with kinetic-optimal mixture paths, while we can make use of +domain-specific constructions of the probability path over the visual domain. + +
+
+
+
+
+ + ☆ Training-Free Mitigation of Language Reasoning Degradation After + Multimodal Instruction Tuning + + +
+ Multimodal models typically combine a powerful large language model (LLM) +with a vision encoder and are then trained on multimodal data via instruction +tuning. While this process adapts LLMs to multimodal settings, it remains +unclear whether this adaptation compromises their original language reasoning +capabilities. In this work, we explore the effects of multimodal instruction +tuning on language reasoning performance. We focus on LLaVA, a leading +multimodal framework that integrates LLMs such as Vicuna or Mistral with the +CLIP vision encoder. We compare the performance of the original LLMs with their +multimodal-adapted counterparts across eight language reasoning tasks. Our +experiments yield several key insights. First, the impact of multimodal +learning varies between Vicuna and Mistral: we observe a degradation in +language reasoning for Mistral but improvements for Vicuna across most tasks. +Second, while multimodal instruction learning consistently degrades performance +on mathematical reasoning tasks (e.g., GSM8K), it enhances performance on +commonsense reasoning tasks (e.g., CommonsenseQA). Finally, we demonstrate that +a training-free model merging technique can effectively mitigate the language +reasoning degradation observed in multimodal-adapted Mistral and even improve +performance on visual tasks. + +
+
+
+
+
+ + ☆ YT-30M: A multi-lingual multi-category dataset of YouTube comments + + +
+ This paper introduces two large-scale multilingual comment datasets, YT-30M +(and YT-100K) from YouTube. The analysis in this paper is performed on a +smaller sample (YT-100K) of YT-30M. Both the datasets: YT-30M (full) and +YT-100K (randomly selected 100K sample from YT-30M) are publicly released for +further research. YT-30M (YT-100K) contains 32236173 (108694) comments posted +by YouTube channel that belong to YouTube categories. Each comment is +associated with a video ID, comment ID, commentor name, commentor channel ID, +comment text, upvotes, original channel ID and category of the YouTube channel +(e.g., 'News & Politics', 'Science & Technology', etc.). + +
+
+
+
+
+ + ☆ From Words to Workflows: Automating Business Processes + + +
+ As businesses increasingly rely on automation to streamline operations, the +limitations of Robotic Process Automation (RPA) have become apparent, +particularly its dependence on expert knowledge and inability to handle complex +decision-making tasks. Recent advancements in Artificial Intelligence (AI), +particularly Generative AI (GenAI) and Large Language Models (LLMs), have paved +the way for Intelligent Automation (IA), which integrates cognitive +capabilities to overcome the shortcomings of RPA. This paper introduces +Text2Workflow, a novel method that automatically generates workflows from +natural language user requests. Unlike traditional automation approaches, +Text2Workflow offers a generalized solution for automating any business +process, translating user inputs into a sequence of executable steps +represented in JavaScript Object Notation (JSON) format. Leveraging the +decision-making and instruction-following capabilities of LLMs, this method +provides a scalable, adaptable framework that enables users to visualize and +execute workflows with minimal manual intervention. This research outlines the +Text2Workflow methodology and its broader implications for automating complex +business processes. + +
+
+ comment: Under review at Elsevier's Engineering Applications of Artificial + Intelligence +
+
+
+
+
+ + ☆ PBP: Post-training Backdoor Purification for Malware Classifiers NDSS 2025 + + +
+ In recent years, the rise of machine learning (ML) in cybersecurity has +brought new challenges, including the increasing threat of backdoor poisoning +attacks on ML malware classifiers. For instance, adversaries could inject +malicious samples into public malware repositories, contaminating the training +data and potentially misclassifying malware by the ML model. Current +countermeasures predominantly focus on detecting poisoned samples by leveraging +disagreements within the outputs of a diverse set of ensemble models on +training data points. However, these methods are not suitable for scenarios +where Machine Learning-as-a-Service (MLaaS) is used or when users aim to remove +backdoors from a model after it has been trained. Addressing this scenario, we +introduce PBP, a post-training defense for malware classifiers that mitigates +various types of backdoor embeddings without assuming any specific backdoor +embedding mechanism. Our method exploits the influence of backdoor attacks on +the activation distribution of neural networks, independent of the +trigger-embedding method. In the presence of a backdoor attack, the activation +distribution of each layer is distorted into a mixture of distributions. By +regulating the statistics of the batch normalization layers, we can guide a +backdoored model to perform similarly to a clean one. Our method demonstrates +substantial advantages over several state-of-the-art methods, as evidenced by +experiments on two datasets, two types of backdoor methods, and various attack +configurations. Notably, our approach requires only a small portion of the +training data -- only 1\% -- to purify the backdoor and reduce the attack +success rate from 100\% to almost 0\%, a 100-fold improvement over the baseline +methods. Our code is available at +\url{https://github.com/judydnguyen/pbp-backdoor-purification-official}. + +
+
+ comment: Accepted at NDSS 2025 +
+
+
+
+
+ + ☆ BIMCaP: BIM-based AI-supported LiDAR-Camera Pose Refinement + + +
+ This paper introduces BIMCaP, a novel method to integrate mobile 3D sparse +LiDAR data and camera measurements with pre-existing building information +models (BIMs), enhancing fast and accurate indoor mapping with affordable +sensors. BIMCaP refines sensor poses by leveraging a 3D BIM and employing a +bundle adjustment technique to align real-world measurements with the model. +Experiments using real-world open-access data show that BIMCaP achieves +superior accuracy, reducing translational error by over 4 cm compared to +current state-of-the-art methods. This advancement enhances the accuracy and +cost-effectiveness of 3D mapping methodologies like SLAM. BIMCaP's improvements +benefit various fields, including construction site management and emergency +response, by providing up-to-date, aligned digital maps for better +decision-making and productivity. Link to the repository: +https://github.com/MigVega/BIMCaP + +
+
+ comment: 10 pages, 24 figures, Conference: EG-ICE: 31st International Workshop + on Intelligent Computing in Engineering +
+
+
+
+
+ + ☆ Genetic Algorithm Based System for Path Planning with Unmanned Aerial + Vehicles Swarms in Cell-Grid Environments + + +
+ Path Planning methods for autonomously controlling swarms of unmanned aerial +vehicles (UAVs) are gaining momentum due to their operational advantages. An +increasing number of scenarios now require autonomous control of multiple UAVs, +as autonomous operation can significantly reduce labor costs. Additionally, +obtaining optimal flight paths can lower energy consumption, thereby extending +battery life for other critical operations. Many of these scenarios, however, +involve obstacles such as power lines and trees, which complicate Path +Planning. This paper presents an evolutionary computation-based system +employing genetic algorithms to address this problem in environments with +obstacles. The proposed approach aims to ensure complete coverage of areas with +fixed obstacles, such as in field exploration tasks, while minimizing flight +time regardless of map size or the number of UAVs in the swarm. No specific +goal points or prior information beyond the provided map is required. The +experiments conducted in this study used five maps of varying sizes and +obstacle densities, as well as a control map without obstacles, with different +numbers of UAVs. The results demonstrate that this method can determine optimal +paths for all UAVs during full map traversal, thus minimizing resource +consumption. A comparative analysis with other state-of-the-art approach is +presented to highlight the advantages and potential limitations of the proposed +method. + +
+
+
+
+
+ + ☆ Tango*: Constrained synthesis planning using chemically informed value + functions + + +
+ Computer-aided synthesis planning (CASP) has made significant strides in +generating retrosynthetic pathways for simple molecules in a non-constrained +fashion. Recent work introduces a specialised bidirectional search algorithm +with forward and retro expansion to address the starting material-constrained +synthesis problem, allowing CASP systems to provide synthesis pathways from +specified starting materials, such as waste products or renewable feed-stocks. +In this work, we introduce a simple guided search which allows solving the +starting material-constrained synthesis planning problem using an existing, +uni-directional search algorithm, Retro*. We show that by optimising a single +hyperparameter, Tango* outperforms existing methods in terms of efficiency and +solve rate. We find the Tango* cost function catalyses strong improvements for +the bidirectional DESP methods. Our method also achieves lower wall clock times +while proposing synthetic routes of similar length, a common metric for route +quality. Finally, we highlight potential reasons for the strong performance of +Tango over neural guided search methods + +
+
+
+
+
+ + ☆ Automated Test-Case Generation for REST APIs Using Model Inference + Search Heuristic + + +
+ The rising popularity of the microservice architectural style has led to a +growing demand for automated testing approaches tailored to these systems. +EvoMaster is a state-of-the-art tool that uses Evolutionary Algorithms (EAs) to +automatically generate test cases for microservices' REST APIs. One limitation +of these EAs is the use of unit-level search heuristics, such as branch +distances, which focus on fine-grained code coverage and may not effectively +capture the complex, interconnected behaviors characteristic of system-level +testing. To address this limitation, we propose a new search heuristic (MISH) +that uses real-time automaton learning to guide the test case generation +process. We capture the sequential call patterns exhibited by a test case by +learning an automaton from the stream of log events outputted by different +microservices within the same system. Therefore, MISH learns a representation +of the systemwide behavior, allowing us to define the fitness of a test case +based on the path it traverses within the inferred automaton. We empirically +evaluate MISH's effectiveness on six real-world benchmark microservice +applications and compare it against a state-of-the-art technique, MOSA, for +testing REST APIs. Our evaluation shows promising results for using MISH to +guide the automated test case generation within EvoMaster. + +
+
+ comment: 12 pages +
+
+
+
+
+ + ☆ Learning Semantic Association Rules from Internet of Things Data + + +
+ Association Rule Mining (ARM) is the task of discovering commonalities in +data in the form of logical implications. ARM is used in the Internet of Things +(IoT) for different tasks including monitoring and decision-making. However, +existing methods give limited consideration to IoT-specific requirements such +as heterogeneity and volume. Furthermore, they do not utilize important static +domain-specific description data about IoT systems, which is increasingly +represented as knowledge graphs. In this paper, we propose a novel ARM pipeline +for IoT data that utilizes both dynamic sensor data and static IoT system +metadata. Furthermore, we propose an Autoencoder-based Neurosymbolic ARM method +(Aerial) as part of the pipeline to address the high volume of IoT data and +reduce the total number of rules that are resource-intensive to process. Aerial +learns a neural representation of a given data and extracts association rules +from this representation by exploiting the reconstruction (decoding) mechanism +of an autoencoder. Extensive evaluations on 3 IoT datasets from 2 domains show +that ARM on both static and dynamic IoT data results in more generically +applicable rules while Aerial can learn a more concise set of high-quality +association rules than the state-of-the-art with full coverage over the +datasets. + +
+
+
+
+
+ + ☆ Benchmarking Pretrained Attention-based Models for Real-Time Recognition + in Robot-Assisted Esophagectomy SP + + +
+ Esophageal cancer is among the most common types of cancer worldwide. It is +traditionally treated using open esophagectomy, but in recent years, +robot-assisted minimally invasive esophagectomy (RAMIE) has emerged as a +promising alternative. However, robot-assisted surgery can be challenging for +novice surgeons, as they often suffer from a loss of spatial orientation. +Computer-aided anatomy recognition holds promise for improving surgical +navigation, but research in this area remains limited. In this study, we +developed a comprehensive dataset for semantic segmentation in RAMIE, featuring +the largest collection of vital anatomical structures and surgical instruments +to date. Handling this diverse set of classes presents challenges, including +class imbalance and the recognition of complex structures such as nerves. This +study aims to understand the challenges and limitations of current +state-of-the-art algorithms on this novel dataset and problem. Therefore, we +benchmarked eight real-time deep learning models using two pretraining +datasets. We assessed both traditional and attention-based networks, +hypothesizing that attention-based networks better capture global patterns and +address challenges such as occlusion caused by blood or other tissues. The +benchmark includes our RAMIE dataset and the publicly available CholecSeg8k +dataset, enabling a thorough assessment of surgical segmentation tasks. Our +findings indicate that pretraining on ADE20k, a dataset for semantic +segmentation, is more effective than pretraining on ImageNet. Furthermore, +attention-based models outperform traditional convolutional neural networks, +with SegNeXt and Mask2Former achieving higher Dice scores, and Mask2Former +additionally excelling in average symmetric surface distance. + +
+
+ comment: Accepted for presentation at the SPIE Medical Imaging Conference, + 2025 +
+
+
+
+
+ + ☆ Enhancing Supply Chain Visibility with Generative AI: An Exploratory + Case Study on Relationship Prediction in Knowledge Graphs + + +
+ A key stumbling block in effective supply chain risk management for companies
+and policymakers is a lack of visibility on interdependent supply network
+relationships. Relationship prediction, also called link prediction, is an
+emergent area of supply chain surveillance research that aims to increase the
+visibility of supply chains using data-driven techniques. Existing methods have
+been successful for predicting relationships but struggle to extract the
+context in which these relationships are embedded - such as the products being
+supplied or locations they are supplied from. Lack of context prevents
+practitioners from distinguishing transactional relations from established
+supply chain relations, hindering accurate estimations of risk. In this work,
+we develop a new Generative Artificial Intelligence (Gen AI) enhanced machine
+learning framework that leverages pre-trained language models as embedding
+models combined with machine learning models to predict supply chain
+relationships within knowledge graphs. By integrating Generative AI techniques,
+our approach captures the nuanced semantic relationships between entities,
+thereby improving supply chain visibility and facilitating more precise risk
+management. Using data from a real case study, we show that GenAI-enhanced link
+prediction surpasses all benchmarks, and demonstrate how GenAI models can be
+explored and effectively used in supply chain risk management.
+
+
+ comment: 18 pages, 5 figures +
+
+
+
+
+ + ☆ DiffStyleTTS: Diffusion-based Hierarchical Prosody Modeling for + Text-to-Speech with Diverse and Controllable Styles COLING 2025 + + +
+ Human speech exhibits rich and flexible prosodic variations. To address the +one-to-many mapping problem from text to prosody in a reasonable and flexible +manner, we propose DiffStyleTTS, a multi-speaker acoustic model based on a +conditional diffusion module and an improved classifier-free guidance, which +hierarchically models speech prosodic features, and controls different prosodic +styles to guide prosody prediction. Experiments show that our method +outperforms all baselines in naturalness and achieves superior synthesis speed +compared to three diffusion-based baselines. Additionally, by adjusting the +guiding scale, DiffStyleTTS effectively controls the guidance intensity of the +synthetic prosody. + +
+
+ comment: COLING 2025 +
+
+
+
+
+ + ☆ WiS Platform: Enhancing Evaluation of LLM-Based Multi-Agent Systems + Through Game-Based Analysis + + +
+ Recent advancements in autonomous multi-agent systems (MAS) based on large
+language models (LLMs) have enhanced the application scenarios and improved the
+capability of LLMs to handle complex tasks. Despite demonstrating
+effectiveness, existing studies still evidently struggle with the evaluation,
+analysis, and reproducibility of LLM-based MAS. In this paper, to facilitate
+the research on LLM-based MAS, we introduce an open, scalable, and real-time
+updated platform for accessing and analyzing the LLM-based MAS based on the
+game "Who is Spy?" (WiS). Our platform is featured with three main strengths:
+(1) a unified model evaluation interface that supports models available on
+Hugging Face; (2) real-time updated leaderboard for model evaluation; (3) a
+comprehensive evaluation covering game-winning rates, attacking, defense
+strategies, and reasoning of LLMs. To rigorously test WiS, we conduct extensive
+experiments covering various open- and closed-source LLMs, and find that
+different agents exhibit distinct and intriguing behaviors in the game. The
+experimental results demonstrate the effectiveness and efficiency of our
+platform in evaluating LLM-based MAS. Our platform and its documentation are
+publicly available at \url{https://whoisspy.ai/}
+
+
+
+
+
+ + ☆ Intuitive Axial Augmentation Using Polar-Sine-Based Piecewise Distortion + for Medical Slice-Wise Segmentation + + +
+ Most data-driven models for medical image analysis rely on universal
+augmentations to improve performance. Experimental evidence has confirmed their
+effectiveness, but the unclear mechanism underlying them poses a barrier to the
+widespread acceptance and trust in such methods within the medical community.
+We revisit and acknowledge the unique characteristics of medical images apart
+from traditional digital images, and consequently, propose a medical-specific
+augmentation algorithm that is more elastic and aligns well with radiology scan
+procedures. The method performs piecewise affine with sinusoidal distorted ray
+according to radius on polar coordinates, thus simulating uncertain postures of
+human lying flat on the scanning table. Our method could generate human
+visceral distribution without affecting the fundamental relative position on
+axial plane. Two non-adaptive algorithms, namely Meta-based Scan Table Removal
+and Similarity-Guided Parameter Search, are introduced to bolster robustness of
+our augmentation method. Experiments show our method improves accuracy across
+multiple famous segmentation frameworks without requiring more data samples.
+Our preview code is available in: https://github.com/MGAMZ/PSBPD.
+
+
+
+
+
+ + ☆ DIVE: Taming DINO for Subject-Driven Video Editing + + +
+ Building on the success of diffusion models in image generation and editing, +video editing has recently gained substantial attention. However, maintaining +temporal consistency and motion alignment still remains challenging. To address +these issues, this paper proposes DINO-guided Video Editing (DIVE), a framework +designed to facilitate subject-driven editing in source videos conditioned on +either target text prompts or reference images with specific identities. The +core of DIVE lies in leveraging the powerful semantic features extracted from a +pretrained DINOv2 model as implicit correspondences to guide the editing +process. Specifically, to ensure temporal motion consistency, DIVE employs DINO +features to align with the motion trajectory of the source video. Extensive +experiments on diverse real-world videos demonstrate that our framework can +achieve high-quality editing results with robust motion consistency, +highlighting the potential of DINO to contribute to video editing. For precise +subject editing, DIVE incorporates the DINO features of reference images into a +pretrained text-to-image model to learn Low-Rank Adaptations (LoRAs), +effectively registering the target subject's identity. Project page: +https://dino-video-editing.github.io + +
+
+
+
+
+ + ☆ Improving Linguistic Diversity of Large Language Models with Possibility + Exploration Fine-Tuning + + +
+ While Large Language Models (LLMs) have made significant strides in +replicating human-like abilities, there are concerns about a reduction in the +linguistic diversity of their outputs. This results in the homogenization of +viewpoints and perspectives, as well as the underrepresentation of specific +demographic groups. Although several fine-tuning and prompting techniques have +been suggested to tackle the issue, they are often tailored to specific tasks +or come with a substantial increase in computational cost and latency. This +makes them challenging to apply to applications that demand very low latency, +such as chatbots and virtual assistants. We propose Possibility Exploration +Fine-Tuning (PEFT), a task-agnostic framework that enhances the text diversity +of LLMs without increasing latency or computational cost. Given the same +prompt, models fine-tuned with PEFT can simultaneously generate multiple +diverse responses, each corresponding with a controllable possibility number. +Experiments on dialogue and story generation tasks demonstrate that PEFT +significantly enhances the diversity of LLM outputs, as evidenced by lower +similarity between candidate responses. Since PEFT emphasizes semantic +diversity over lexical diversity, it can also notably reduce demographic bias +in dialogue systems. The implementations and datasets are available in our +repository: https://github.com/mailong25/peft_diversity + +
+
+
+
+
+ + ☆ AI-Driven Day-to-Day Route Choice + + +
+ Understanding travelers' route choices can help policymakers devise optimal +operational and planning strategies for both normal and abnormal circumstances. +However, existing choice modeling methods often rely on predefined assumptions +and struggle to capture the dynamic and adaptive nature of travel behavior. +Recently, Large Language Models (LLMs) have emerged as a promising alternative, +demonstrating remarkable ability to replicate human-like behaviors across +various fields. Despite this potential, their capacity to accurately simulate +human route choice behavior in transportation contexts remains doubtful. To +satisfy this curiosity, this paper investigates the potential of LLMs for route +choice modeling by introducing an LLM-empowered agent, "LLMTraveler." This +agent integrates an LLM as its core, equipped with a memory system that learns +from past experiences and makes decisions by balancing retrieved data and +personality traits. The study systematically evaluates the LLMTraveler's +ability to replicate human-like decision-making through two stages: (1) +analyzing its route-switching behavior in single origin-destination (OD) pair +congestion game scenarios, where it demonstrates patterns align with laboratory +data but are not fully explained by traditional models, and (2) testing its +capacity to model day-to-day (DTD) adaptive learning behaviors on the Ortuzar +and Willumsen (OW) network, producing results comparable to Multinomial Logit +(MNL) and Reinforcement Learning (RL) models. These experiments demonstrate +that the framework can partially replicate human-like decision-making in route +choice while providing natural language explanations for its decisions. This +capability offers valuable insights for transportation policymaking, such as +simulating traveler responses to new policies or changes in the network. + +
+
+
+
+
+ + ☆ LuxEmbedder: A Cross-Lingual Approach to Enhanced Luxembourgish Sentence + Embeddings COLING 2025 + + +
+ Sentence embedding models play a key role in various Natural Language +Processing tasks, such as in Topic Modeling, Document Clustering and +Recommendation Systems. However, these models rely heavily on parallel data, +which can be scarce for many low-resource languages, including Luxembourgish. +This scarcity results in suboptimal performance of monolingual and +cross-lingual sentence embedding models for these languages. To address this +issue, we compile a relatively small but high-quality human-generated +cross-lingual parallel dataset to train \tool, an enhanced sentence embedding +model for Luxembourgish with strong cross-lingual capabilities. Additionally, +we present evidence suggesting that including low-resource languages in +parallel training datasets can be more advantageous for other low-resource +languages than relying solely on high-resource language pairs. Furthermore, +recognizing the lack of sentence embedding benchmarks for low-resource +languages, we create a paraphrase detection benchmark specifically for +Luxembourgish, aiming to partially fill this gap and promote further research. + +
+
+ comment: Accepted at COLING 2025 +
+
+
+
+
+ + ☆ Path-Guided Particle-based Sampling + + +
+ Particle-based Bayesian inference methods by sampling from a partition-free +target (posterior) distribution, e.g., Stein variational gradient descent +(SVGD), have attracted significant attention. We propose a path-guided +particle-based sampling~(PGPS) method based on a novel Log-weighted Shrinkage +(LwS) density path linking an initial distribution to the target distribution. +We propose to utilize a Neural network to learn a vector field motivated by the +Fokker-Planck equation of the designed density path. Particles, initiated from +the initial distribution, evolve according to the ordinary differential +equation defined by the vector field. The distribution of these particles is +guided along a density path from the initial distribution to the target +distribution. The proposed LwS density path allows for an efficient search of +modes of the target distribution while canonical methods fail. We theoretically +analyze the Wasserstein distance of the distribution of the PGPS-generated +samples and the target distribution due to approximation and discretization +errors. Practically, the proposed PGPS-LwS method demonstrates higher Bayesian +inference accuracy and better calibration ability in experiments conducted on +both synthetic and real-world Bayesian learning tasks, compared to baselines, +such as SVGD and Langevin dynamics, etc. + +
+
+
+
+
+ + ☆ Contextual Data Integration for Bike-sharing Demand Prediction with + Graph Neural Networks in Degraded Weather Conditions + + +
+ Demand for bike sharing is impacted by various factors, such as weather
+conditions, events, and the availability of other transportation modes. This
+impact remains elusive due to the complex interdependence of these factors or
+location-related user behavior variations. It is also not clear which factors
+provide additional information that is not already contained in the historical
+demand. Intermodal dependencies between bike-sharing and other modes are also
+underexplored, and the value of this information has not been studied in
+degraded situations. The proposed study analyzes the impact of adding
+contextual data, such as weather, time embedding, and road traffic flow, to
+predict bike-sharing Origin-Destination (OD) flows in atypical weather
+situations. Our study highlights a mild relationship between prediction quality
+of bike-sharing demand and road traffic flow, while the introduced time
+embedding allows outperforming state-of-the-art results, particularly in the
+case of degraded weather conditions. Including weather data as an additional
+input further improves our model with respect to the basic ST-ED-RMGC
+prediction model by reducing the prediction error by more than 20% in degraded
+weather conditions.
+
+
+
+
+
+ + ☆ Integrating Generative AI into Art Therapy: A Technical Showcase + + +
+ This paper explores the integration of generative AI into the field of art +therapy. Leveraging proven text-to-image models, we introduce a novel technical +design to complement art therapy. The resulting AI-based tools shall enable +patients to refine and customize their creative work, opening up new avenues of +expression and accessibility. Using three illustrative examples, we demonstrate +potential outputs of our solution and evaluate them qualitatively. Furthermore, +we discuss the current limitations and ethical considerations associated with +this integration and provide an outlook into future research efforts. Our +implementations are publicly available at https://github.com/BFH-AMI/sds24. + +
+
+
+
+
+ + ☆ Black-Box Forgery Attacks on Semantic Watermarks for Diffusion Models + + +
+ Integrating watermarking into the generation process of latent diffusion +models (LDMs) simplifies detection and attribution of generated content. +Semantic watermarks, such as Tree-Rings and Gaussian Shading, represent a novel +class of watermarking techniques that are easy to implement and highly robust +against various perturbations. However, our work demonstrates a fundamental +security vulnerability of semantic watermarks. We show that attackers can +leverage unrelated models, even with different latent spaces and architectures +(UNet vs DiT), to perform powerful and realistic forgery attacks. Specifically, +we design two watermark forgery attacks. The first imprints a targeted +watermark into real images by manipulating the latent representation of an +arbitrary image in an unrelated LDM to get closer to the latent representation +of a watermarked image. We also show that this technique can be used for +watermark removal. The second attack generates new images with the target +watermark by inverting a watermarked image and re-generating it with an +arbitrary prompt. Both attacks just need a single reference image with the +target watermark. Overall, our findings question the applicability of semantic +watermarks by revealing that attackers can easily forge or remove these +watermarks under realistic conditions. + +
+
+ comment: 23 pages, 21 figures, 6 tables +
+
+
+
+
+ + ☆ Intent-driven In-context Learning for Few-shot Dialogue State Tracking + + +
+ Dialogue state tracking (DST) plays an essential role in task-oriented +dialogue systems. However, user's input may contain implicit information, +posing significant challenges for DST tasks. Additionally, DST data includes +complex information, which not only contains a large amount of noise unrelated +to the current turn, but also makes constructing DST datasets expensive. To +address these challenges, we introduce Intent-driven In-context Learning for +Few-shot DST (IDIC-DST). By extracting user's intent, we propose an +Intent-driven Dialogue Information Augmentation module to augment the dialogue +information, which can track dialogue states more effectively. Moreover, we +mask noisy information from DST data and rewrite user's input in the +Intent-driven Examples Retrieval module, where we retrieve similar examples. We +then utilize a pre-trained large language model to update the dialogue state +using the augmented dialogue information and examples. Experimental results +demonstrate that IDIC-DST achieves state-of-the-art performance in few-shot +settings on MultiWOZ 2.1 and MultiWOZ 2.4 datasets. + +
+
+
+
+
+ + ☆ Detecting abnormal heart sound using mobile phones and on-device IConNet + + +
+ Given the global prevalence of cardiovascular diseases, there is a pressing +need for easily accessible early screening methods. Typically, this requires +medical practitioners to investigate heart auscultations for irregular sounds, +followed by echocardiography and electrocardiography tests. To democratize +early diagnosis, we present a user-friendly solution for abnormal heart sound +detection, utilizing mobile phones and a lightweight neural network optimized +for on-device inference. Unlike previous approaches reliant on specialized +stethoscopes, our method directly analyzes audio recordings, facilitated by a +novel architecture known as IConNet. IConNet, an Interpretable Convolutional +Neural Network, harnesses insights from audio signal processing, enhancing +efficiency and providing transparency in neural pattern extraction from raw +waveform signals. This is a significant step towards trustworthy AI in +healthcare, aiding in remote health monitoring efforts. + +
+
+ comment: N2Women'24 Workshop, MobiSys 2024, Tokyo, Japan +
+
+
+
+
+ + ☆ AIM: Adaptive Inference of Multi-Modal LLMs via Token Merging and + Pruning + + +
+ Large language models (LLMs) have enabled the creation of multi-modal LLMs +that exhibit strong comprehension of visual data such as images and videos. +However, these models usually rely on extensive visual tokens from visual +encoders, leading to high computational demands, which limits their +applicability in resource-constrained environments and for long-context tasks. +In this work, we propose a training-free adaptive inference method for +multi-modal LLMs that can accommodate a broad range of efficiency requirements +with a minimum performance drop. Our method consists of a) iterative token +merging based on embedding similarity before LLMs, and b) progressive token +pruning within LLM layers based on multi-modal importance. With a minimalist +design, our method can be applied to both video and image LLMs. Extensive +experiments on diverse video and image benchmarks demonstrate that, our method +substantially reduces computation load (e.g., a $\textbf{7-fold}$ reduction in +FLOPs) while preserving the performance of video and image LLMs. Further, under +a similar computational cost, our method outperforms the state-of-the-art +methods in long video understanding (e.g., $\textbf{+4.6}$ on MLVU). +Additionally, our in-depth analysis provides insights into token redundancy and +LLM layer behaviors, offering guidance for future research in designing +efficient multi-modal LLMs. Our code will be available at +https://github.com/LaVi-Lab/AIM. + +
+
+ comment: 12 pages, 2 figures +
+
+
+
+
+ + ☆ Does Safety Training of LLMs Generalize to Semantically Related Natural + Prompts? NeurIPS 2024 + + +
+ Large Language Models (LLMs) are known to be susceptible to crafted
+adversarial attacks or jailbreaks that lead to the generation of objectionable
+content despite being aligned to human preferences using safety fine-tuning
+methods. While the large dimensionality of input token space makes it
+inevitable to find adversarial prompts that can jailbreak these models, we aim
+to evaluate whether safety fine-tuned LLMs are safe against natural prompts
+which are semantically related to toxic seed prompts that elicit safe responses
+after alignment. We surprisingly find that popular aligned LLMs such as GPT-4
+can be compromised using naive prompts that are NOT even crafted with an
+objective of jailbreaking the model. Furthermore, we empirically show that
+given a seed prompt that elicits a toxic response from an unaligned model, one
+can systematically generate several semantically related natural prompts that
+can jailbreak aligned LLMs. Towards this, we propose a method of Response
+Guided Question Augmentation (ReG-QA) to evaluate the generalization of safety
+aligned LLMs to natural prompts, that first generates several toxic answers
+given a seed question using an unaligned LLM (Q to A), and further leverages an
+LLM to generate questions that are likely to produce these answers (A to Q). We
+interestingly find that safety fine-tuned LLMs such as GPT-4o are vulnerable to
+producing natural jailbreak questions from unsafe content (without denial) and
+can thus be used for the latter (A to Q) step. We obtain attack success rates
+that are comparable to/ better than leading adversarial attack methods on the
+JailbreakBench leaderboard, while being significantly more stable against
+defenses such as Smooth-LLM and Synonym Substitution, which are effective
+against all existing attacks on the leaderboard.
+
+
+ comment: Accepted at the Safe Generative AI Workshop @ NeurIPS 2024 +
+
+
+
+
+ + ☆ ClusterKV: Manipulating LLM KV Cache in Semantic Space for Recallable + Compression + + +
+ Large Language Models (LLMs) have been widely deployed in a variety of +applications, and the context length is rapidly increasing to handle tasks such +as long-document QA and complex logical reasoning. However, long context poses +significant challenges for inference efficiency, including high memory costs of +key-value (KV) cache and increased latency due to extensive memory accesses. +Recent works have proposed compressing KV cache to approximate computation, but +these methods either evict tokens permanently, never recalling them for later +inference, or recall previous tokens at the granularity of pages divided by +textual positions. Both approaches degrade the model accuracy and output +quality. To achieve efficient and accurate recallable KV cache compression, we +introduce ClusterKV, which recalls tokens at the granularity of semantic +clusters. We design and implement efficient algorithms and systems for +clustering, selection, indexing and caching. Experiment results show that +ClusterKV attains negligible accuracy loss across various tasks with 32k +context lengths, using only a 1k to 2k KV cache budget, and achieves up to a +2$\times$ speedup in latency and a 2.5$\times$ improvement in decoding +throughput. Compared to SoTA recallable KV compression methods, ClusterKV +demonstrates higher model accuracy and output quality, while maintaining or +exceeding inference efficiency. + +
+
+
+
+
+ + ☆ U-MATH: A University-Level Benchmark for Evaluating Mathematical Skills + in LLMs + + +
+ The current evaluation of mathematical skills in LLMs is limited, as existing +benchmarks are either relatively small, primarily focus on elementary and +high-school problems, or lack diversity in topics. Additionally, the inclusion +of visual elements in tasks remains largely under-explored. + To address these gaps, we introduce U-MATH, a novel benchmark of 1,100 +unpublished open-ended university-level problems sourced from teaching +materials. It is balanced across six core subjects, with 20% of multimodal +problems. Given the open-ended nature of U-MATH problems, we employ an LLM to +judge the correctness of generated solutions. To this end, we release +$\mu$-MATH, a dataset to evaluate the LLMs' capabilities in judging solutions. + The evaluation of general domain, math-specific, and multimodal LLMs +highlights the challenges presented by U-MATH. Our findings reveal that LLMs +achieve a maximum accuracy of only 63% on text-based tasks, with even lower 45% +on visual problems. The solution assessment proves challenging for LLMs, with +the best LLM judge having an F1-score of 80% on $\mu$-MATH. + +
+
+
+
+
+ + ☆ Semi-decentralized Training of Spatio-Temporal Graph Neural Networks for + Traffic Prediction + + +
+ In smart mobility, large networks of geographically distributed sensors +produce vast amounts of high-frequency spatio-temporal data that must be +processed in real time to avoid major disruptions. Traditional centralized +approaches are increasingly unsuitable to this task, as they struggle to scale +with expanding sensor networks, and reliability issues in central components +can easily affect the whole deployment. To address these challenges, we explore +and adapt semi-decentralized training techniques for Spatio-Temporal Graph +Neural Networks (ST-GNNs) in smart mobility domain. We implement a simulation +framework where sensors are grouped by proximity into multiple cloudlets, each +handling a subgraph of the traffic graph, fetching node features from other +cloudlets to train its own local ST-GNN model, and exchanging model updates +with other cloudlets to ensure consistency, enhancing scalability and removing +reliance on a centralized aggregator. We perform extensive comparative +evaluation of four different ST-GNN training setups -- centralized, traditional +FL, server-free FL, and Gossip Learning -- on large-scale traffic datasets, the +METR-LA and PeMS-BAY datasets, for short-, mid-, and long-term vehicle speed +predictions. Experimental results show that semi-decentralized setups are +comparable to centralized approaches in performance metrics, while offering +advantages in terms of scalability and fault tolerance. In addition, we +highlight often overlooked issues in existing literature for distributed +ST-GNNs, such as the variation in model performance across different +geographical areas due to region-specific traffic patterns, and the significant +communication overhead and computational costs that arise from the large +receptive field of GNNs, leading to substantial data transfers and increased +computation of partial embeddings. + +
+
+ comment: 8 pages, 4 figures, 3 tables, conference +
+
+
+
+
+ + ☆ Optimizing Dense Visual Predictions Through Multi-Task Coherence and + Prioritization WACV 2025 + + +
+ Multi-Task Learning (MTL) involves the concurrent training of multiple tasks, +offering notable advantages for dense prediction tasks in computer vision. MTL +not only reduces training and inference time as opposed to having multiple +single-task models, but also enhances task accuracy through the interaction of +multiple tasks. However, existing methods face limitations. They often rely on +suboptimal cross-task interactions, resulting in task-specific predictions with +poor geometric and predictive coherence. In addition, many approaches use +inadequate loss weighting strategies, which do not address the inherent +variability in task evolution during training. To overcome these challenges, we +propose an advanced MTL model specifically designed for dense vision tasks. Our +model leverages state-of-the-art vision transformers with task-specific +decoders. To enhance cross-task coherence, we introduce a trace-back method +that improves both cross-task geometric and predictive features. Furthermore, +we present a novel dynamic task balancing approach that projects task losses +onto a common scale and prioritizes more challenging tasks during training. +Extensive experiments demonstrate the superiority of our method, establishing +new state-of-the-art performance across two benchmark datasets. The code is +available at:https://github.com/Klodivio355/MT-CP + +
+
+ comment: Accepted by WACV 2025 +
+
+
+
+
+ + ☆ Towards Understanding and Quantifying Uncertainty for Text-to-Image + Generation + + +
+ Uncertainty quantification in text-to-image (T2I) generative models is +crucial for understanding model behavior and improving output reliability. In +this paper, we are the first to quantify and evaluate the uncertainty of T2I +models with respect to the prompt. Alongside adapting existing approaches +designed to measure uncertainty in the image space, we also introduce +Prompt-based UNCertainty Estimation for T2I models (PUNC), a novel method +leveraging Large Vision-Language Models (LVLMs) to better address uncertainties +arising from the semantics of the prompt and generated images. PUNC utilizes a +LVLM to caption a generated image, and then compares the caption with the +original prompt in the more semantically meaningful text space. PUNC also +enables the disentanglement of both aleatoric and epistemic uncertainties via +precision and recall, which image-space approaches are unable to do. Extensive +experiments demonstrate that PUNC outperforms state-of-the-art uncertainty +estimation techniques across various settings. Uncertainty quantification in +text-to-image generation models can be used on various applications including +bias detection, copyright protection, and OOD detection. We also introduce a +comprehensive dataset of text prompts and generation pairs to foster further +research in uncertainty quantification for generative models. Our findings +illustrate that PUNC not only achieves competitive performance but also enables +novel applications in evaluating and improving the trustworthiness of +text-to-image models. + +
+
+ comment: 28 pages and 22 figures +
+
+
+
+
+ + ☆ Physics-Informed Deep Inverse Operator Networks for Solving PDE Inverse + Problems + + +
+ Inverse problems involving partial differential equations (PDEs) can be seen +as discovering a mapping from measurement data to unknown quantities, often +framed within an operator learning approach. However, existing methods +typically rely on large amounts of labeled training data, which is impractical +for most real-world applications. Moreover, these supervised models may fail to +capture the underlying physical principles accurately. To address these +limitations, we propose a novel architecture called Physics-Informed Deep +Inverse Operator Networks (PI-DIONs), which can learn the solution operator of +PDE-based inverse problems without labeled training data. We extend the +stability estimates established in the inverse problem literature to the +operator learning framework, thereby providing a robust theoretical foundation +for our method. These estimates guarantee that the proposed model, trained on a +finite sample and grid, generalizes effectively across the entire domain and +function space. Extensive experiments are conducted to demonstrate that +PI-DIONs can effectively and accurately learn the solution operators of the +inverse problems without the need for labeled data. + +
+
+
+
+
+ + ☆ Testing Neural Network Verifiers: A Soundness Benchmark with Hidden + Counterexamples + + +
+ In recent years, many neural network (NN) verifiers have been developed to +formally verify certain properties of neural networks such as robustness. +Although many benchmarks have been constructed to evaluate the performance of +NN verifiers, they typically lack a ground-truth for hard instances where no +current verifier can verify and no counterexample can be found, which makes it +difficult to check the soundness of a new verifier if it claims to verify hard +instances which no other verifier can do. We propose to develop a soundness +benchmark for NN verification. Our benchmark contains instances with +deliberately inserted counterexamples while we also try to hide the +counterexamples from regular adversarial attacks which can be used for finding +counterexamples. We design a training method to produce neural networks with +such hidden counterexamples. Our benchmark aims to be used for testing the +soundness of NN verifiers and identifying falsely claimed verifiability when it +is known that hidden counterexamples exist. We systematically construct our +benchmark and generate instances across diverse model architectures, activation +functions, input sizes, and perturbation radii. We demonstrate that our +benchmark successfully identifies bugs in state-of-the-art NN verifiers, as +well as synthetic bugs, providing a crucial step toward enhancing the +reliability of testing NN verifiers. Our code is available at +https://github.com/MVP-Harry/SoundnessBench and our benchmark is available at +https://huggingface.co/datasets/SoundnessBench/SoundnessBench. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ A Measure of the System Dependence of Automated Metrics + + +
+ Automated metrics for Machine Translation have made significant progress, +with the goal of replacing expensive and time-consuming human evaluations. +These metrics are typically assessed by their correlation with human judgments, +which captures the monotonic relationship between human and metric scores. +However, we argue that it is equally important to ensure that metrics treat all +systems fairly and consistently. In this paper, we introduce a method to +evaluate this aspect. + +
+
+
+
+
+ + ☆ Large Language Models show both individual and collective creativity + comparable to humans + + +
+ Artificial intelligence has, so far, largely automated routine tasks, but +what does it mean for the future of work if Large Language Models (LLMs) show +creativity comparable to humans? To measure the creativity of LLMs +holistically, the current study uses 13 creative tasks spanning three domains. +We benchmark the LLMs against individual humans, and also take a novel approach +by comparing them to the collective creativity of groups of humans. We find +that the best LLMs (Claude and GPT-4) rank in the 52nd percentile against +humans, and overall LLMs excel in divergent thinking and problem solving but +lag in creative writing. When questioned 10 times, an LLM's collective +creativity is equivalent to 8-10 humans. When more responses are requested, two +additional responses of LLMs equal one extra human. Ultimately, LLMs, when +optimally applied, may compete with a small group of humans in the future of +work. + +
+
+
+
+
+ + ☆ Fine-Grained Behavior Simulation with Role-Playing Large Language Model + on Social Media + + +
+ Large language models (LLMs) have demonstrated impressive capabilities in
+role-playing tasks. However, there is limited research on whether LLMs can
+accurately simulate user behavior in real-world scenarios, such as social
+media. This requires models to effectively analyze a user's history and
+simulate their role. In this paper, we introduce \textbf{FineRob}, a novel
+fine-grained behavior simulation dataset. We collect the complete behavioral
+history of 1,866 distinct users across three social media platforms. Each
+behavior is decomposed into three fine-grained elements: object, type, and
+content, resulting in 78.6k QA records. Based on FineRob, we identify two
+dominant reasoning patterns in LLMs' behavior simulation processes and propose
+the \textbf{OM-CoT} fine-tuning method to enhance the capability. Through
+comprehensive experiments, we conduct an in-depth analysis of key factors of
+behavior simulation and also demonstrate the effectiveness of the OM-CoT
+approach\footnote{Code and dataset are available at
+\url{https://github.com/linkseed18612254945/FineRob}}.
+
+
+
+
+
+ + ☆ Robust Multi-bit Text Watermark with LLM-based Paraphrasers + + +
+ We propose an imperceptible multi-bit text watermark embedded by paraphrasing +with LLMs. We fine-tune a pair of LLM paraphrasers that are designed to behave +differently so that their paraphrasing difference reflected in the text +semantics can be identified by a trained decoder. To embed our multi-bit +watermark, we use two paraphrasers alternatively to encode the pre-defined +binary code at the sentence level. Then we use a text classifier as the decoder +to decode each bit of the watermark. Through extensive experiments, we show +that our watermarks can achieve over 99.99\% detection AUC with small (1.1B) +text paraphrasers while keeping the semantic information of the original +sentence. More importantly, our pipeline is robust under word substitution and +sentence paraphrasing perturbations and generalizes well to +out-of-distributional data. We also show the stealthiness of our watermark with +LLM-based evaluation. We open-source the code: +https://github.com/xiaojunxu/multi-bit-text-watermark. + +
+
+
+
+
+ + ☆ Experience-driven discovery of planning strategies + + +
+ One explanation for how people can plan efficiently despite limited cognitive +resources is that we possess a set of adaptive planning strategies and know +when and how to use them. But how are these strategies acquired? While previous +research has studied how individuals learn to choose among existing strategies, +little is known about the process of forming new planning strategies. In this +work, we propose that new planning strategies are discovered through +metacognitive reinforcement learning. To test this, we designed a novel +experiment to investigate the discovery of new planning strategies. We then +present metacognitive reinforcement learning models and demonstrate their +capability for strategy discovery as well as show that they provide a better +explanation of human strategy discovery than alternative learning mechanisms. +However, when fitted to human data, these models exhibit a slower discovery +rate than humans, leaving room for improvement. + +
+
+
+
+
+ + ☆ CredID: Credible Multi-Bit Watermark for Large Language Models + Identification + + +
+ Large Language Models (LLMs) are widely used in complex natural language +processing tasks but raise privacy and security concerns due to the lack of +identity recognition. This paper proposes a multi-party credible watermarking +framework (CredID) involving a trusted third party (TTP) and multiple LLM +vendors to address these issues. In the watermark embedding stage, vendors +request a seed from the TTP to generate watermarked text without sending the +user's prompt. In the extraction stage, the TTP coordinates each vendor to +extract and verify the watermark from the text. This provides a credible +watermarking scheme while preserving vendor privacy. Furthermore, current +watermarking algorithms struggle with text quality, information capacity, and +robustness, making it challenging to meet the diverse identification needs of +LLMs. Thus, we propose a novel multi-bit watermarking algorithm and an +open-source toolkit to facilitate research. Experiments show our CredID +enhances watermark credibility and efficiency without compromising text +quality. Additionally, we successfully utilized this framework to achieve +highly accurate identification among multiple LLM vendors. + +
+
+ comment: v1 +
+
+
+
+
+ + ☆ ChatTS: Aligning Time Series with LLMs via Synthetic Data for Enhanced + Understanding and Reasoning + + +
+ Understanding time series is crucial for its application in real-world +scenarios. Recently, large language models (LLMs) have been increasingly +applied to time series tasks, leveraging their strong language capabilities to +enhance various applications. However, research on multimodal LLMs (MLLMs) for +time series understanding and reasoning remains limited, primarily due to the +scarcity of high-quality datasets that align time series with textual +information. This paper introduces ChatTS, a novel MLLM designed for time +series analysis. ChatTS treats time series as a modality, similar to how vision +MLLMs process images, enabling it to perform both understanding and reasoning +with time series. To address the scarcity of training data, we propose an +attribute-based method for generating synthetic time series with detailed +attribute descriptions. We further introduce Time Series Evol-Instruct, a novel +approach that generates diverse time series Q&As, enhancing the model's +reasoning capabilities. To the best of our knowledge, ChatTS is the first MLLM +that takes multivariate time series as input, which is fine-tuned exclusively +on synthetic datasets. We evaluate its performance using benchmark datasets +with real-world data, including six alignment tasks and four reasoning tasks. +Our results show that ChatTS significantly outperforms existing vision-based +MLLMs (e.g., GPT-4o) and text/agent-based LLMs, achieving a 46.0% improvement +in alignment tasks and a 25.8% improvement in reasoning tasks. + +
+
+ comment: 14 pages, 14 figures +
+
+
+
+
+ + ☆ Revolve: Optimizing AI Systems by Tracking Response Evolution in Textual + Optimization + + +
+ Recent advancements in large language models (LLMs) have significantly +enhanced the ability of LLM-based systems to perform complex tasks through +natural language processing and tool interaction. However, optimizing these +LLM-based systems for specific tasks remains challenging, often requiring +manual interventions like prompt engineering and hyperparameter tuning. +Existing automatic optimization methods, such as textual feedback-based +techniques (e.g., TextGrad), tend to focus on immediate feedback, analogous to +using immediate derivatives in traditional numerical gradient descent. However, +relying solely on such feedback can be limited when the adjustments made in +response to this feedback are either too small or fluctuate irregularly, +potentially slowing down or even stalling the optimization process. To overcome +these challenges, more adaptive methods are needed, especially in situations +where the system's response is evolving slowly or unpredictably. In this paper, +we introduce REVOLVE, an optimization method that tracks how "R"esponses +"EVOLVE" across iterations in LLM systems. By focusing on the evolution of +responses over time, REVOLVE enables more stable and effective optimization by +making thoughtful, progressive adjustments at each step. Experimental results +demonstrate that REVOLVE outperforms competitive baselines, achieving a 7.8% +improvement in prompt optimization, a 20.72% gain in solution refinement, and a +29.17% increase in code optimization. Additionally, REVOLVE converges in fewer +iterations, resulting in significant computational savings. These advantages +highlight its adaptability and efficiency, positioning REVOLVE as a valuable +tool for optimizing LLM-based systems and accelerating the development of +next-generation AI technologies. Code is available at: +https://github.com/Peiyance/REVOLVE. + +
+
+ comment: 20 pages, 2 figures +
+
+
+
+
+ + ☆ Coordinated Multi-Armed Bandits for Improved Spatial Reuse in Wi-Fi + + +
+ Multi-Access Point Coordination (MAPC) and Artificial Intelligence and +Machine Learning (AI/ML) are expected to be key features in future Wi-Fi, such +as the forthcoming IEEE 802.11bn (Wi-Fi 8) and beyond. In this paper, we +explore a coordinated solution based on online learning to drive the +optimization of Spatial Reuse (SR), a method that allows multiple devices to +perform simultaneous transmissions by controlling interference through Packet +Detect (PD) adjustment and transmit power control. In particular, we focus on a +Multi-Agent Multi-Armed Bandit (MA-MAB) setting, where multiple decision-making +agents concurrently configure SR parameters from coexisting networks by +leveraging the MAPC framework, and study various algorithms and reward-sharing +mechanisms. We evaluate different MA-MAB implementations using Komondor, a +well-adopted Wi-Fi simulator, and demonstrate that AI-native SR enabled by +coordinated MABs can improve the network performance over current Wi-Fi +operation: mean throughput increases by 15%, fairness is improved by increasing +the minimum throughput across the network by 210%, while the maximum access +delay is kept below 3 ms. + +
+
+
+
+
+ + ☆ Preference-based opponent shaping in differentiable games + + +
+ Strategy learning in game environments with multi-agent is a challenging +problem. Since each agent's reward is determined by the joint strategy, a +greedy learning strategy that aims to maximize its own reward may fall into a +local optimum. Recent studies have proposed the opponent modeling and shaping +methods for game environments. These methods enhance the efficiency of strategy +learning by modeling the strategies and updating processes of other agents. +However, these methods often rely on simple predictions of opponent strategy +changes. Due to the lack of modeling behavioral preferences such as cooperation +and competition, they are usually applicable only to predefined scenarios and +lack generalization capabilities. In this paper, we propose a novel +Preference-based Opponent Shaping (PBOS) method to enhance the strategy +learning process by shaping agents' preferences towards cooperation. We +introduce the preference parameter, which is incorporated into the agent's loss +function, thus allowing the agent to directly consider the opponent's loss +function when updating the strategy. We update the preference parameters +concurrently with strategy learning to ensure that agents can adapt to any +cooperative or competitive game environment. Through a series of experiments, +we verify the performance of PBOS algorithm in a variety of differentiable +games. The experimental results show that the PBOS algorithm can guide the +agent to learn the appropriate preference parameters, so as to achieve better +reward distribution in multiple game environments. + +
+
+
+
+
+ + ☆ TokenFlow: Unified Image Tokenizer for Multimodal Understanding and + Generation + + +
+ We present TokenFlow, a novel unified image tokenizer that bridges the +long-standing gap between multimodal understanding and generation. Prior +research attempt to employ a single reconstruction-targeted Vector Quantization +(VQ) encoder for unifying these two tasks. We observe that understanding and +generation require fundamentally different granularities of visual information. +This leads to a critical trade-off, particularly compromising performance in +multimodal understanding tasks. TokenFlow addresses this challenge through an +innovative dual-codebook architecture that decouples semantic and pixel-level +feature learning while maintaining their alignment via a shared mapping +mechanism. This design enables direct access to both high-level semantic +representations crucial for understanding tasks and fine-grained visual +features essential for generation through shared indices. Our extensive +experiments demonstrate TokenFlow's superiority across multiple dimensions. +Leveraging TokenFlow, we demonstrate for the first time that discrete visual +input can surpass LLaVA-1.5 13B in understanding performance, achieving a 7.2\% +average improvement. For image reconstruction, we achieve a strong FID score of +0.63 at 384*384 resolution. Moreover, TokenFlow establishes state-of-the-art +performance in autoregressive image generation with a GenEval score of 0.55 at +256*256 resolution, achieving comparable results to SDXL. + +
+
+ comment: https://byteflow-ai.github.io/TokenFlow/ +
+
+
+
+
+ + ☆ UTSD: Unified Time Series Diffusion Model + + +
+ Transformer-based architectures have achieved unprecedented success in time +series analysis. However, facing the challenge of across-domain modeling, +existing studies utilize statistical prior as prompt engineering fails under +the huge distribution shift among various domains. In this paper, a Unified +Time Series Diffusion (UTSD) model is established for the first time to model +the multi-domain probability distribution, utilizing the powerful probability +distribution modeling ability of Diffusion. Unlike the autoregressive models +that capture the conditional probabilities of the prediction horizon to the +historical sequence, we use a diffusion denoising process to model the mixture +distribution of the cross-domain data and generate the prediction sequence for +the target domain directly utilizing conditional sampling. The proposed UTSD +contains three pivotal designs: (1) The condition network captures the +multi-scale fluctuation patterns from the observation sequence, which are +utilized as context representations to guide the denoising network to generate +the prediction sequence; (2) Adapter-based fine-tuning strategy, the +multi-domain universal representation learned in the pretraining stage is +utilized for downstream tasks in target domains; (3) The diffusion and +denoising process on the actual sequence space, combined with the improved +classifier free guidance as the conditional generation strategy, greatly +improves the stability and accuracy of the downstream task. We conduct +extensive experiments on mainstream benchmarks, and the pre-trained UTSD +outperforms existing foundation models on all data domains, exhibiting superior +zero-shot generalization ability. After training from scratch, UTSD achieves +comparable performance against domain-specific proprietary models. The +empirical results validate the potential of UTSD as a time series foundational +model. + +
+
+
+
+
+ + ☆ Point-GN: A Non-Parametric Network Using Gaussian Positional Encoding + for Point Cloud Classification WACV + + +
+ This paper introduces Point-GN, a novel non-parametric network for efficient +and accurate 3D point cloud classification. Unlike conventional deep learning +models that rely on a large number of trainable parameters, Point-GN leverages +non-learnable components-specifically, Farthest Point Sampling (FPS), k-Nearest +Neighbors (k-NN), and Gaussian Positional Encoding (GPE)-to extract both local +and global geometric features. This design eliminates the need for additional +training while maintaining high performance, making Point-GN particularly +suited for real-time, resource-constrained applications. We evaluate Point-GN +on two benchmark datasets, ModelNet40 and ScanObjectNN, achieving +classification accuracies of 85.29% and 85.89%, respectively, while +significantly reducing computational complexity. Point-GN outperforms existing +non-parametric methods and matches the performance of fully trained models, all +with zero learnable parameters. Our results demonstrate that Point-GN is a +promising solution for 3D point cloud classification in practical, real-time +environments. + +
+
+ comment: This paper has been accepted for presentation at the IEEE Winter + Conference on Applications of Computer Vision (WACV) 2025 +
+
+
+
+
+ + ☆ Less is More: A Stealthy and Efficient Adversarial Attack Method for + DRL-based Autonomous Driving Policies + + +
+ Despite significant advancements in deep reinforcement learning (DRL)-based +autonomous driving policies, these policies still exhibit vulnerability to +adversarial attacks. This vulnerability poses a formidable challenge to the +practical deployment of these policies in autonomous driving. Designing +effective adversarial attacks is an indispensable prerequisite for enhancing +the robustness of these policies. In view of this, we present a novel stealthy +and efficient adversarial attack method for DRL-based autonomous driving +policies. Specifically, we introduce a DRL-based adversary designed to trigger +safety violations (e.g., collisions) by injecting adversarial samples at +critical moments. We model the attack as a mixed-integer optimization problem +and formulate it as a Markov decision process. Then, we train the adversary to +learn the optimal policy for attacking at critical moments without domain +knowledge. Furthermore, we introduce attack-related information and a +trajectory clipping method to enhance the learning capability of the adversary. +Finally, we validate our method in an unprotected left-turn scenario across +different traffic densities. The experimental results show that our method +achieves more than 90% collision rate within three attacks in most cases. +Furthermore, our method achieves more than 130% improvement in attack +efficiency compared to the unlimited attack method. + +
+
+
+
+
+ + ☆ MRNet: Multifaceted Resilient Networks for Medical Image-to-Image + Translation + + +
+ We propose a Multifaceted Resilient Network(MRNet), a novel architecture +developed for medical image-to-image translation that outperforms +state-of-the-art methods in MRI-to-CT and MRI-to-MRI conversion. MRNet +leverages the Segment Anything Model (SAM) to exploit frequency-based features +to build a powerful method for advanced medical image transformation. The +architecture extracts comprehensive multiscale features from diverse datasets +using a powerful SAM image encoder and performs resolution-aware feature fusion +that consistently integrates U-Net encoder outputs with SAM-derived features. +This fusion optimizes the traditional U-Net skip connection while leveraging +transformer-based contextual analysis. The translation is complemented by an +innovative dual-mask configuration incorporating dynamic attention patterns and +a specialized loss function designed to address regional mapping mismatches, +preserving both the gross anatomy and tissue details. Extensive validation +studies have shown that MRNet outperforms state-of-the-art architectures, +particularly in maintaining anatomical fidelity and minimizing translation +artifacts. + +
+
+ comment: This work has been submitted to the IEEE for possible publication +
+
+
+
+
+ + ☆ MILLION: A General Multi-Objective Framework with Controllable Risk for + Portfolio Management VLDB 2025 + + +
+ Portfolio management is an important yet challenging task in AI for FinTech, +which aims to allocate investors' budgets among different assets to balance the +risk and return of an investment. In this study, we propose a general +Multi-objectIve framework with controLLable rIsk for pOrtfolio maNagement +(MILLION), which consists of two main phases, i.e., return-related maximization +and risk control. Specifically, in the return-related maximization phase, we +introduce two auxiliary objectives, i.e., return rate prediction, and return +rate ranking, combined with portfolio optimization to remit the overfitting +problem and improve the generalization of the trained model to future markets. +Subsequently, in the risk control phase, we propose two methods, i.e., +portfolio interpolation and portfolio improvement, to achieve fine-grained risk +control and fast risk adaption to a user-specified risk level. For the +portfolio interpolation method, we theoretically prove that the risk can be +perfectly controlled if the to-be-set risk level is in a proper interval. In +addition, we also show that the return rate of the adjusted portfolio after +portfolio interpolation is no less than that of the min-variance optimization, +as long as the model in the reward maximization phase is effective. +Furthermore, the portfolio improvement method can achieve greater return rates +while keeping the same risk level compared to portfolio interpolation. +Extensive experiments are conducted on three real-world datasets. The results +demonstrate the effectiveness and efficiency of the proposed framework. + +
+
+ comment: accepted by VLDB 2025 +
+
+
+
+
+ + ☆ Specification Generation for Neural Networks in Systems + + +
+ Specifications - precise mathematical representations of correct +domain-specific behaviors - are crucial to guarantee the trustworthiness of +computer systems. With the increasing development of neural networks as +computer system components, specifications gain more importance as they can be +used to regulate the behaviors of these black-box models. Traditionally, +specifications are designed by domain experts based on their intuition of +correct behavior. However, this is labor-intensive and hence not a scalable +approach as computer system applications diversify. We hypothesize that the +traditional (aka reference) algorithms that neural networks replace for higher +performance can act as effective proxies for correct behaviors of the models, +when available. This is because they have been used and tested for long enough +to encode several aspects of the trustworthy/correct behaviors in the +underlying domain. Driven by our hypothesis, we develop a novel automated +framework, SpecTRA to generate specifications for neural networks using +references. We formulate specification generation as an optimization problem +and solve it with observations of reference behaviors. SpecTRA clusters similar +observations into compact specifications. We present specifications generated +by SpecTRA for neural networks in adaptive bit rate and congestion control +algorithms. Our specifications show evidence of being correct and matching +intuition. Moreover, we use our specifications to show several unknown +vulnerabilities of the SOTA models for computer systems. + +
+
+
+
+
+ + PEMF-VVTO: Point-Enhanced Video Virtual Try-on via Mask-free Paradigm + + +
+ Video Virtual Try-on aims to fluently transfer the garment image to a
+semantically aligned try-on area in the source person video. Previous methods
+leveraged the inpainting mask to remove the original garment in the source
+video, thus achieving accurate garment transfer on simple model videos.
+However, when these methods are applied to realistic video data with more
+complex scene changes and posture movements, the overly large and incoherent
+agnostic masks will destroy the essential spatial-temporal information of the
+original video, thereby inhibiting the fidelity and coherence of the try-on
+video. To alleviate this problem, we propose a novel point-enhanced mask-free
+video virtual try-on framework (PEMF-VVTO). Specifically, we first leverage the
+pre-trained mask-based try-on model to construct large-scale paired training
+data (pseudo-person samples). Training on these mask-free data enables our
+model to perceive the original spatial-temporal information while realizing
+accurate garment transfer. Then, based on the pre-acquired sparse frame-cloth
+and frame-frame point alignments, we design the point-enhanced spatial
+attention (PSA) and point-enhanced temporal attention (PTA) to further improve
+the try-on accuracy and video coherence of the mask-free model. Concretely, PSA
+explicitly guides the garment transfer to desirable locations through the
+sparse semantic alignments of video frames and cloth. PTA exploits the temporal
+attention on sparse point correspondences to enhance the smoothness of
+generated videos. Extensive qualitative and quantitative experiments clearly
+illustrate that our PEMF-VVTO can generate more natural and coherent try-on
+videos than existing state-of-the-art methods.
+
+
+
+
+
+ + ☆ Human Multi-View Synthesis from a Single-View Model:Transferred Body and + Face Representations + + +
+ Generating multi-view human images from a single view is a complex and +significant challenge. Although recent advancements in multi-view object +generation have shown impressive results with diffusion models, novel view +synthesis for humans remains constrained by the limited availability of 3D +human datasets. Consequently, many existing models struggle to produce +realistic human body shapes or capture fine-grained facial details accurately. +To address these issues, we propose an innovative framework that leverages +transferred body and facial representations for multi-view human synthesis. +Specifically, we use a single-view model pretrained on a large-scale human +dataset to develop a multi-view body representation, aiming to extend the 2D +knowledge of the single-view model to a multi-view diffusion model. +Additionally, to enhance the model's detail restoration capability, we +integrate transferred multimodal facial features into our trained human +diffusion model. Experimental evaluations on benchmark datasets demonstrate +that our approach outperforms the current state-of-the-art methods, achieving +superior performance in multi-view human synthesis. + +
+
+
+
+
+ + ☆ Surveying the Effects of Quality, Diversity, and Complexity in Synthetic + Data From Large Language Models + + +
+ Synthetic data generation with Large Language Models is a promising paradigm +for augmenting natural data over a nearly infinite range of tasks. Given this +variety, direct comparisons among synthetic data generation algorithms are +scarce, making it difficult to understand where improvement comes from and what +bottlenecks exist. We propose to evaluate algorithms via the makeup of +synthetic data generated by each algorithm in terms of data quality, diversity, +and complexity. We choose these three characteristics for their significance in +open-ended processes and the impact each has on the capabilities of downstream +models. We find quality to be essential for in-distribution model +generalization, diversity to be essential for out-of-distribution +generalization, and complexity to be beneficial for both. Further, we emphasize +the existence of Quality-Diversity trade-offs in training data and the +downstream effects on model performance. We then examine the effect of various +components in the synthetic data pipeline on each data characteristic. This +examination allows us to taxonomize and compare synthetic data generation +algorithms through the components they utilize and the resulting effects on +data QDC composition. This analysis extends into a discussion on the importance +of balancing QDC in synthetic data for efficient reinforcement learning and +self-improvement algorithms. Analogous to the QD trade-offs in training data, +often there exist trade-offs between model output quality and output diversity +which impact the composition of synthetic data. We observe that many models are +currently evaluated and optimized only for output quality, thereby limiting +output diversity and the potential for self-improvement. We argue that +balancing these trade-offs is essential to the development of future +self-improvement algorithms and highlight a number of works making progress in +this direction. + +
+
+
+
+
+ + ☆ Theoretical limitations of multi-layer Transformer + + +
+ Transformers, especially the decoder-only variants, are the backbone of most +modern large language models; yet we do not have much understanding of their +expressive power except for the simple $1$-layer case. + Due to the difficulty of analyzing multi-layer models, all previous work +relies on unproven complexity conjectures to show limitations for multi-layer +Transformers. In this work, we prove the first $\textit{unconditional}$ lower +bound against multi-layer decoder-only transformers. For any constant $L$, we +prove that any $L$-layer decoder-only transformer needs a polynomial model +dimension ($n^{\Omega(1)}$) to perform sequential composition of $L$ functions +over an input of $n$ tokens. + As a consequence, our results give: (1) the first depth-width trade-off for +multi-layer transformers, exhibiting that the $L$-step composition task is +exponentially harder for $L$-layer models compared to $(L+1)$-layer ones; (2) +an unconditional separation between encoder and decoder, exhibiting a hard task +for decoders that can be solved by an exponentially shallower and smaller +encoder; (3) a provable advantage of chain-of-thought, exhibiting a task that +becomes exponentially easier with chain-of-thought. + On the technical side, we propose the multi-party $\textit{autoregressive}$ +$\textit{communication}$ $\textit{model}$ that captures the computation of a +decoder-only Transformer. We also introduce a new proof technique that finds a +certain $\textit{indistinguishable}$ $\textit{decomposition}$ of all possible +inputs iteratively for proving lower bounds in this model. We believe our new +communication model and proof technique will be helpful to further understand +the computational power of transformers. + +
+
+
+
+
+ + ☆ 3D Interaction Geometric Pre-training for Molecular Relational Learning + + +
+ Molecular Relational Learning (MRL) is a rapidly growing field that focuses +on understanding the interaction dynamics between molecules, which is crucial +for applications ranging from catalyst engineering to drug discovery. Despite +recent progress, earlier MRL approaches are limited to using only the 2D +topological structure of molecules, as obtaining the 3D interaction geometry +remains prohibitively expensive. This paper introduces a novel 3D geometric +pre-training strategy for MRL (3DMRL) that incorporates a 3D virtual +interaction environment, overcoming the limitations of costly traditional +quantum mechanical calculation methods. With the constructed 3D virtual +interaction environment, 3DMRL trains 2D MRL model to learn the overall 3D +geometric information of molecular interaction through contrastive learning. +Moreover, fine-grained interaction between molecules is learned through force +prediction loss, which is crucial in understanding the wide range of molecular +interaction processes. Extensive experiments on various tasks using real-world +datasets, including out-of-distribution and extrapolation scenarios, +demonstrate the effectiveness of 3DMRL, showing up to a 24.93\% improvement in +performance across 40 tasks. + +
+
+
+
+
+ + Who Brings the Frisbee: Probing Hidden Hallucination Factors in Large + Vision-Language Model via Causality Analysis WACV2025 + + +
+ Recent advancements in large vision-language models (LVLM) have significantly +enhanced their ability to comprehend visual inputs alongside natural language. +However, a major challenge in their real-world application is hallucination, +where LVLMs generate non-existent visual elements, eroding user trust. The +underlying mechanism driving this multimodal hallucination is poorly +understood. Minimal research has illuminated whether contexts such as sky, +tree, or grass field involve the LVLM in hallucinating a frisbee. We +hypothesize that hidden factors, such as objects, contexts, and semantic +foreground-background structures, induce hallucination. This study proposes a +novel causal approach: a hallucination probing system to identify these hidden +factors. By analyzing the causality between images, text prompts, and network +saliency, we systematically explore interventions to block these factors. Our +experimental findings show that a straightforward technique based on our +analysis can significantly reduce hallucinations. Additionally, our analyses +indicate the potential to edit network internals to minimize hallucinated +outputs. + +
+
+ comment: Accepted by WACV2025 +
+
+
+
+
+ + ☆ STDCformer: A Transformer-Based Model with a Spatial-Temporal Causal + De-Confounding Strategy for Crowd Flow Prediction + + +
+ Existing works typically treat spatial-temporal prediction as the task of
+learning a function $F$ to transform historical observations to future
+observations. We further decompose this cross-time transformation into three
+processes: (1) Encoding ($E$): learning the intrinsic representation of
+observations, (2) Cross-Time Mapping ($M$): transforming past representations
+into future representations, and (3) Decoding ($D$): reconstructing future
+observations from the future representations. From this perspective,
+spatial-temporal prediction can be viewed as learning $F = E \cdot M \cdot D$,
+which includes learning the space transformations $\left\{{E},{D}\right\}$
+between the observation space and the hidden representation space, as well as
+the spatial-temporal mapping $M$ from past states to future states within the
+representation space. This leads to two key questions: \textbf{Q1: What kind of
+representation space allows for mapping the past to the future? Q2: How to
+map the past to the future within the representation space?} To address
+Q1, we propose a Spatial-Temporal Backdoor Adjustment strategy, which learns a
+Spatial-Temporal De-Confounded (STDC) representation space and estimates the
+de-confounding causal effect of historical data on future data. This causal
+relationship we captured serves as the foundation for subsequent
+spatial-temporal mapping. To address Q2, we design a Spatial-Temporal Embedding
+(STE) that fuses the information of temporal and spatial confounders, capturing
+the intrinsic spatial-temporal characteristics of the representations.
+Additionally, we introduce a Cross-Time Attention mechanism, which queries the
+attention between the future and the past to guide spatial-temporal mapping.
+
+
+
+
+
+ + ☆ SAVER: A Toolbox for Sampling-Based, Probabilistic Verification of + Neural Networks + + +
+ We present a neural network verification toolbox to 1) assess the probability +of satisfaction of a constraint, and 2) synthesize a set expansion factor to +achieve the probability of satisfaction. Specifically, the tool box establishes +with a user-specified level of confidence whether the output of the neural +network for a given input distribution is likely to be contained within a given +set. Should the tool determine that the given set cannot satisfy the likelihood +constraint, the tool also implements an approach outlined in this paper to +alter the constraint set to ensure that the user-defined satisfaction +probability is achieved. The toolbox is comprised of sampling-based approaches +which exploit the properties of signed distance function to define set +containment. + +
+
+ comment: 7 pages, 8 figures, submitted to the 28th ACM International + Conference on Hybrid Systems: Computation and Control +
+
+
+
+
+ + ☆ Inverse Delayed Reinforcement Learning + + +
+ Inverse Reinforcement Learning (IRL) has demonstrated effectiveness in a +variety of imitation tasks. In this paper, we introduce an IRL framework +designed to extract rewarding features from expert trajectories affected by +delayed disturbances. Instead of relying on direct observations, our approach +employs an efficient off-policy adversarial training framework to derive expert +features and recover optimal policies from augmented delayed observations. +Empirical evaluations in the MuJoCo environment under diverse delay settings +validate the effectiveness of our method. Furthermore, we provide a theoretical +analysis showing that recovering expert policies from augmented delayed +observations outperforms using direct delayed observations. + +
+
+
+
+
+ + ☆ Panoptic Diffusion Models: co-generation of images and segmentation maps + + +
+ Recently, diffusion models have demonstrated impressive capabilities in +text-guided and image-conditioned image generation. However, existing diffusion +models cannot simultaneously generate a segmentation map of objects and a +corresponding image from the prompt. Previous attempts either generate +segmentation maps based on the images or provide maps as input conditions to +control image generation, limiting their functionality to given inputs. +Incorporating an inherent understanding of the scene layouts can improve the +creativity and realism of diffusion models. To address this limitation, we +present Panoptic Diffusion Model (PDM), the first model designed to generate +both images and panoptic segmentation maps concurrently. PDM bridges the gap +between image and text by constructing segmentation layouts that provide +detailed, built-in guidance throughout the generation process. This ensures the +inclusion of categories mentioned in text prompts and enriches the diversity of +segments within the background. We demonstrate the effectiveness of PDM across +two architectures: a unified diffusion transformer and a two-stream transformer +with a pretrained backbone. To facilitate co-generation with fewer sampling +steps, we incorporate a fast diffusion solver into PDM. Additionally, when +ground-truth maps are available, PDM can function as a text-guided +image-to-image generation model. Finally, we propose a novel metric for +evaluating the quality of generated maps and show that PDM achieves +state-of-the-art results in image generation with implicit scene control. + +
+
+
+
+
+ + ☆ Higher Order Transformers: Efficient Attention Mechanism for Tensor + Structured Data + + +
+ Transformers are now ubiquitous for sequence modeling tasks, but their +extension to multi-dimensional data remains a challenge due to the quadratic +cost of the attention mechanism. In this paper, we propose Higher-Order +Transformers (HOT), a novel architecture designed to efficiently process data +with more than two axes, i.e. higher-order tensors. To address the +computational challenges associated with high-order tensor attention, we +introduce a novel Kronecker factorized attention mechanism that reduces the +attention cost to quadratic in each axis' dimension, rather than quadratic in +the total size of the input tensor. To further enhance efficiency, HOT +leverages kernelized attention, reducing the complexity to linear. This +strategy maintains the model's expressiveness while enabling scalable attention +computation. We validate the effectiveness of HOT on two high-dimensional +tasks, including multivariate time series forecasting, and 3D medical image +classification. Experimental results demonstrate that HOT achieves competitive +performance while significantly improving computational efficiency, showcasing +its potential for tackling a wide range of complex, multi-dimensional data. + +
+
+
+
+
+ + ♻ ☆ Marconi: Prefix Caching for the Era of Hybrid LLMs + + +
+ Hybrid models that combine the language modeling capabilities of Attention +layers with the efficiency of Recurrent layers (e.g., State Space Models) have +gained traction in practically supporting long contexts in Large Language Model +serving. Yet, the unique properties of these models complicate the usage of +complementary efficiency optimizations such as prefix caching that skip +redundant computations across requests. Most notably, their use of in-place +state updates for recurrent layers precludes rolling back cache entries for +partial sequence overlaps, and instead mandates only exact-match cache hits; +the effect is a deluge of (large) cache entries per sequence, most of which +yield minimal reuse opportunities. We present Marconi, the first system that +supports efficient prefix caching with Hybrid LLMs. Key to Marconi are its +novel admission and eviction policies that more judiciously assess potential +cache entries based not only on recency, but also on (1) forecasts of their +reuse likelihood across a taxonomy of different hit scenarios, and (2) the +compute savings that hits deliver relative to memory footprints. Across diverse +workloads and Hybrid models, Marconi achieves up to 34.4$\times$ higher token +hit rates (71.1% or 617 ms lower TTFT) compared to state-of-the-art prefix +caching systems. + +
+
+
+
+
+ + ♻ ☆ StarVector: Generating Scalable Vector Graphics Code from Images and + Text + + +
+ Scalable Vector Graphics (SVGs) are vital for modern image rendering due to +their scalability and versatility. Previous SVG generation methods have focused +on curve-based vectorization, lacking semantic understanding, often producing +artifacts, and struggling with SVG primitives beyond path curves. To address +these issues, we introduce StarVector, a multimodal large language model for +SVG generation. It performs image vectorization by understanding image +semantics and using SVG primitives for compact, precise outputs. Unlike +traditional methods, StarVector works directly in the SVG code space, +leveraging visual understanding to apply accurate SVG primitives. To train +StarVector, we create SVG-Stack, a diverse dataset of 2M samples that enables +generalization across vectorization tasks and precise use of primitives like +ellipses, polygons, and text. We address challenges in SVG evaluation, showing +that pixel-based metrics like MSE fail to capture the unique qualities of +vector graphics. We introduce SVG-Bench, a benchmark across 10 datasets, and 3 +tasks: Image-to-SVG, Text-to-SVG generation, and diagram generation. Using this +setup, StarVector achieves state-of-the-art performance, producing more compact +and semantically rich SVGs. + +
+
+
+
+
+ + ♻ ☆ Privacy-Preserving Data Deduplication for Enhancing Federated Learning + of Language Models (Extended Version) NDSS + + +
+ Deduplication is a vital preprocessing step that enhances machine learning +model performance and saves training time and energy. However, enhancing +federated learning through deduplication poses challenges, especially regarding +scalability and potential privacy violations if deduplication involves sharing +all clients' data. In this paper, we address the problem of deduplication in a +federated setup by introducing a pioneering protocol, Efficient +Privacy-Preserving Multi-Party Deduplication (EP-MPD). It efficiently removes +duplicates from multiple clients' datasets without compromising data privacy. +EP-MPD is constructed in a modular fashion, utilizing two novel variants of the +Private Set Intersection protocol. Our extensive experiments demonstrate the +significant benefits of deduplication in federated learning of large language +models. For instance, we observe up to 19.62\% improvement in perplexity and up +to 27.95\% reduction in running time while varying the duplication level +between 10\% and 30\%. EP-MPD effectively balances privacy and performance in +federated learning, making it a valuable solution for large-scale applications. + +
+
+ comment: Accepted at the Network and Distributed Systems Security (NDSS) + Symposium, 2025 +
+
+
+
+
+ + ♻ ☆ FCL-ViT: Task-Aware Attention Tuning for Continual Learning + + +
+ Continual Learning (CL) involves adapting the prior Deep Neural Network (DNN) +knowledge to new tasks, without forgetting the old ones. However, modern CL +techniques focus on provisioning memory capabilities to existing DNN models +rather than designing new ones that are able to adapt according to the task at +hand. This paper presents the novel Feedback Continual Learning Vision +Transformer (FCL-ViT) that uses a feedback mechanism to generate real-time +dynamic attention features tailored to the current task. The FCL-ViT operates +in two Phases. In phase 1, the generic image features are produced and +determine where the Transformer should attend on the current image. In phase 2, +task-specific image features are generated that leverage dynamic attention. To +this end, Tunable self-Attention Blocks (TABs) and Task Specific Blocks (TSBs) +are introduced that operate in both phases and are responsible for tuning the +TABs attention, respectively. The FCL-ViT surpasses state-of-the-art +performance on Continual Learning compared to benchmark methods, while +retaining a small number of trainable DNN parameters. + +
+
+
+
+
+ + ♻ ☆ Enhancing Biomedical Knowledge Discovery for Diseases: An Open-Source + Framework Applied on Rett Syndrome and Alzheimer's Disease + + +
+ The ever-growing volume of biomedical publications creates a critical need +for efficient knowledge discovery. In this context, we introduce an open-source +end-to-end framework designed to construct knowledge around specific diseases +directly from raw text. To facilitate research in disease-related knowledge +discovery, we create two annotated datasets focused on Rett syndrome and +Alzheimer's disease, enabling the identification of semantic relations between +biomedical entities. Extensive benchmarking explores various ways to represent +relations and entity representations, offering insights into optimal modeling +strategies for semantic relation detection and highlighting language models' +competence in knowledge discovery. We also conduct probing experiments using +different layer representations and attention scores to explore transformers' +ability to capture semantic relations. + +
+
+ comment: Published in IEEE Access, doi: 10.1109/ACCESS.2024.3509714 +
+
+
+
+
+ + ♻ ☆ Challenges in Guardrailing Large Language Models for Science + + +
+ The rapid development in large language models (LLMs) has transformed the +landscape of natural language processing and understanding (NLP/NLU), offering +significant benefits across various domains. However, when applied to +scientific research, these powerful models exhibit critical failure modes +related to scientific integrity and trustworthiness. Existing general-purpose +LLM guardrails are insufficient to address these unique challenges in the +scientific domain. We provide comprehensive guidelines for deploying LLM +guardrails in the scientific domain. We identify specific challenges -- +including time sensitivity, knowledge contextualization, conflict resolution, +and intellectual property concerns -- and propose a guideline framework for the +guardrails that can align with scientific needs. These guardrail dimensions +include trustworthiness, ethics & bias, safety, and legal aspects. We also +outline in detail the implementation strategies that employ white-box, +black-box, and gray-box methodologies that can be enforced within scientific +contexts. + +
+
+
+
+
+ + ♻ ☆ Data quality dimensions for fair AI + + +
+ Artificial Intelligence (AI) systems are not intrinsically neutral and biases +trickle in any type of technological tool. In particular when dealing with +people, the impact of AI algorithms' technical errors originating with +mislabeled data is undeniable. As they feed wrong and discriminatory +classifications, these systems are not systematically guarded against bias. In +this article we consider the problem of bias in AI systems from the point of +view of data quality dimensions. We highlight the limited model construction of +bias mitigation tools based on accuracy strategy, illustrating potential +improvements of a specific tool in gender classification errors occurring in +two typically difficult contexts: the classification of non-binary individuals, +for which the label set becomes incomplete with respect to the dataset; and the +classification of transgender individuals, for which the dataset becomes +inconsistent with respect to the label set. Using formal methods for reasoning +about the behavior of the classification system in presence of a changing +world, we propose to reconsider the fairness of the classification task in +terms of completeness, consistency, timeliness and reliability, and offer some +theoretical results. + +
+
+
+
+
+ + ♻ ☆ Affordance-based Robot Manipulation with Flow Matching + + +
+ We present a framework for assistive robot manipulation, which focuses on two +fundamental challenges: first, efficiently adapting large-scale models to +downstream scene affordance understanding tasks, especially in daily living +scenarios where gathering multi-task data involving humans requires strenuous +effort; second, effectively learning robot trajectories by grounding the visual +affordance model. We tackle the first challenge by employing a +parameter-efficient prompt tuning method that prepends learnable text prompts +to the frozen vision model to predict manipulation affordances in multi-task +scenarios. Then we propose to learn robot trajectories guided by affordances in +a supervised Flow Matching method. Flow matching represents a robot visuomotor +policy as a conditional process of flowing random waypoints to desired robot +trajectories. Finally, we introduce a real-world dataset with 10 tasks across +Activities of Daily Living to test our framework. Our extensive evaluation +highlights that the proposed prompt tuning method for learning manipulation +affordance with language prompter achieves competitive performance and even +outperforms other finetuning protocols across data scales, while satisfying +parameter efficiency. Learning multi-task robot trajectories with flow matching +policy also leads to consistently better results than alternative behavior +cloning methods, including marginally better generalization performance and +prominently faster inference than diffusion policy with DDPM. Our framework +seamlessly unifies affordance model learning and trajectory generation with +flow matching for robot manipulation. + +
+
+
+
+
+ + ♻ ☆ Number Cookbook: Number Understanding of Language Models and How to + Improve It + + +
+ Large language models (LLMs) can solve an increasing number of complex +reasoning tasks while making surprising mistakes in basic numerical +understanding and processing (such as 9.11 > 9.9). The latter ability is +essential for tackling complex arithmetic and mathematical problems and serves +as a foundation for most reasoning tasks, but previous work paid little +attention to it or only discussed several restricted tasks (like integer +addition). In this paper, we comprehensively investigate the numerical +understanding and processing ability (NUPA) of LLMs. Firstly, we introduce a +benchmark covering four common numerical representations and 17 distinct +numerical tasks in four major categories, resulting in 41 meaningful +combinations in total. These tasks are derived from primary and secondary +education curricula, encompassing nearly all everyday numerical understanding +and processing scenarios, and the rules of these tasks are very simple and +clear. Through the benchmark, we find that current LLMs fail frequently in many +of the tasks. To study the problem, we train small models with existing and +potential techniques for enhancing NUPA (such as tokenizers, PEs, and number +formats), comprehensively evaluating their effectiveness using our testbed. We +also finetune practical-scale LLMs on our proposed NUPA tasks and find that 1) +naive finetuning can improve NUPA a lot on many but not all tasks, and 2) +surprisingly, techniques designed to enhance NUPA prove ineffective for +finetuning pretrained models. We further explore the impact of chain-of-thought +techniques on NUPA. Our work provides a more detailed and comprehensive +understanding of NUPA in LLMs. Our benchmark and code are released at +https://github.com/GraphPKU/number_cookbook. + +
+
+
+
+
+ + ♻ ☆ DataLab: A Unified Platform for LLM-Powered Business Intelligence + + +
+ Business intelligence (BI) transforms large volumes of data within modern
+organizations into actionable insights for informed decision-making. Recently,
+large language model (LLM)-based agents have streamlined the BI workflow by
+automatically performing task planning, reasoning, and actions in executable
+environments based on natural language (NL) queries. However, existing
+approaches primarily focus on individual BI tasks such as NL2SQL and NL2VIS.
+The fragmentation of tasks across different data roles and tools leads to
+inefficiencies and potential errors due to the iterative and collaborative
+nature of BI. In this paper, we introduce DataLab, a unified BI platform that
+integrates a one-stop LLM-based agent framework with an augmented computational
+notebook interface. DataLab supports a wide range of BI tasks for different
+data roles by seamlessly combining LLM assistance with user customization
+within a single environment. To achieve this unification, we design a domain
+knowledge incorporation module tailored for enterprise-specific BI tasks, an
+inter-agent communication mechanism to facilitate information sharing across
+the BI workflow, and a cell-based context management strategy to enhance
+context utilization efficiency in BI notebooks. Extensive experiments
+demonstrate that DataLab achieves state-of-the-art performance on various BI
+tasks across popular research benchmarks. Moreover, DataLab maintains high
+effectiveness and efficiency on real-world datasets from Tencent, achieving up
+to a 58.58% increase in accuracy and a 61.65% reduction in token cost on
+enterprise-specific BI tasks.
+
+
+
+
+
+ + ♻ ☆ Prediction-Powered Ranking of Large Language Models NeurIPS 2024 + + +
+ Large language models are often ranked according to their level of alignment +with human preferences -- a model is better than other models if its outputs +are more frequently preferred by humans. One of the popular ways to elicit +human preferences utilizes pairwise comparisons between the outputs provided by +different models to the same inputs. However, since gathering pairwise +comparisons by humans is costly and time-consuming, it has become a common +practice to gather pairwise comparisons by a strong large language model -- a +model strongly aligned with human preferences. Surprisingly, practitioners +cannot currently measure the uncertainty that any mismatch between human and +model preferences may introduce in the constructed rankings. In this work, we +develop a statistical framework to bridge this gap. Given a (small) set of +pairwise comparisons by humans and a large set of pairwise comparisons by a +model, our framework provides a rank-set -- a set of possible ranking positions +-- for each of the models under comparison. Moreover, it guarantees that, with +a probability greater than or equal to a user-specified value, the rank-sets +cover the true ranking consistent with the distribution of human pairwise +preferences asymptotically. Using pairwise comparisons made by humans in the +LMSYS Chatbot Arena platform and pairwise comparisons made by three strong +large language models, we empirically demonstrate the effectivity of our +framework and show that the rank-sets constructed using only pairwise +comparisons by the strong large language models are often inconsistent with +(the distribution of) human pairwise preferences. + +
+
+ comment: Published at NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Can In-context Learning Really Generalize to Out-of-distribution Tasks? + + +
+ In this work, we explore the mechanism of in-context learning (ICL) on +out-of-distribution (OOD) tasks that were not encountered during training. To +achieve this, we conduct synthetic experiments where the objective is to learn +OOD mathematical functions through ICL using a GPT-2 model. We reveal that +Transformers may struggle to learn OOD task functions through ICL. +Specifically, ICL performance resembles implementing a function within the +pretraining hypothesis space and optimizing it with gradient descent based on +the in-context examples. Additionally, we investigate ICL's well-documented +ability to learn unseen abstract labels in context. We demonstrate that such +ability only manifests in the scenarios without distributional shifts and, +therefore, may not serve as evidence of new-task-learning ability. Furthermore, +we assess ICL's performance on OOD tasks when the model is pretrained on +multiple tasks. Both empirical and theoretical analyses demonstrate the +existence of the \textbf{low-test-error preference} of ICL, where it tends to +implement the pretraining function that yields low test error in the testing +context. We validate this through numerical experiments. This new theoretical +result, combined with our empirical findings, elucidates the mechanism of ICL +in addressing OOD tasks. + +
+
+ comment: Preprint, under review +
+
+
+
+
+ + ♻ ☆ Towards a Robust Soft Baby Robot With Rich Interaction Ability for + Advanced Machine Learning Algorithms + + +
+ Advanced machine learning algorithms require platforms that are extremely +robust and equipped with rich sensory feedback to handle extensive +trial-and-error learning without relying on strong inductive biases. +Traditional robotic designs, while well-suited for their specific use cases, +are often fragile when used with these algorithms. To address this gap -- and +inspired by the vision of enabling curiosity-driven baby robots -- we present a +novel robotic limb designed from scratch. Our design has a hybrid soft-hard +structure, high redundancy with rich non-contact sensors (exclusively cameras), +and easily replaceable failure points. Proof-of-concept experiments using two +contemporary reinforcement learning algorithms on a physical prototype +demonstrate that our design is able to succeed in a simple target-finding task +even under simulated sensor failures, all with minimal human oversight during +extended learning periods. We believe this design represents a concrete step +toward more tailored robotic designs for achieving general-purpose, generally +intelligent robots. + +
+
+ comment: 6 pages in main text + 2 pages of references, 8 figures in main text, + 1 table in main text; source code available at + https://github.com/dylanashley/robot-limb-testai +
+
+
+
+
+ + ♻ ☆ tcrLM: a lightweight protein language model for predicting T cell + receptor and epitope binding specificity + + +
+ The anti-cancer immune response relies on the bindings between T-cell +receptors (TCRs) and antigens, which elicits adaptive immunity to eliminate +tumor cells. This ability of the immune system to respond to novel various +neoantigens arises from the immense diversity of TCR repository. However, TCR +diversity poses a significant challenge on accurately predicting antigen-TCR +bindings. In this study, we introduce a lightweight masked language model, +termed tcrLM, to address this challenge. Our approach involves randomly masking +segments of TCR sequences and training tcrLM to infer the masked segments, +thereby enabling the extraction of expressive features from TCR sequences. To +further enhance robustness, we incorporate virtual adversarial training into +tcrLM. We construct the largest TCR CDR3 sequence set with more than 100 +million distinct sequences, and pretrain tcrLM on these sequences. The +pre-trained encoder is subsequently applied to predict TCR-antigen binding +specificity. We evaluate model performance on three test datasets: independent, +external, and COVID-19 test set. The results demonstrate that tcrLM not only +surpasses existing TCR-antigen binding prediction methods, but also outperforms +other mainstream protein language models. More interestingly, tcrLM effectively +captures the biochemical properties and positional preference of amino acids +within TCR sequences. Additionally, the predicted TCR-neoantigen binding scores +indicates the immunotherapy responses and clinical outcomes in a melanoma +cohort. These findings demonstrate the potential of tcrLM in predicting +TCR-antigen binding specificity, with significant implications for advancing +immunotherapy and personalized medicine. + +
+
+
+
+
+ + ♻ ☆ When LLMs Meet Cybersecurity: A Systematic Literature Review + + +
+ The rapid development of large language models (LLMs) has opened new avenues +across various fields, including cybersecurity, which faces an evolving threat +landscape and demand for innovative technologies. Despite initial explorations +into the application of LLMs in cybersecurity, there is a lack of a +comprehensive overview of this research area. This paper addresses this gap by +providing a systematic literature review, covering the analysis of over 300 +works, encompassing 25 LLMs and more than 10 downstream scenarios. Our +comprehensive overview addresses three key research questions: the construction +of cybersecurity-oriented LLMs, the application of LLMs to various +cybersecurity tasks, the challenges and further research in this area. This +study aims to shed light on the extensive potential of LLMs in enhancing +cybersecurity practices and serve as a valuable resource for applying LLMs in +this field. We also maintain and regularly update a list of practical guides on +LLMs for cybersecurity at https://github.com/tmylla/Awesome-LLM4Cybersecurity. + +
+
+ comment: We have updated the related papers up to Aug 31st, with 50+ new + papers added +
+
+
+
+
+ + ♻ ☆ Self-Improvement in Language Models: The Sharpening Mechanism + + +
+ Recent work in language modeling has raised the possibility of
+self-improvement, where a language model evaluates and refines its own
+generations to achieve higher performance without external feedback. It is
+impossible for this self-improvement to create information that is not already
+in the model, so why should we expect that this will lead to improved
+capabilities? We offer a new perspective on the capabilities of
+self-improvement through a lens we refer to as sharpening. Motivated by the
+observation that language models are often better at verifying response quality
+than they are at generating correct responses, we formalize self-improvement as
+using the model itself as a verifier during post-training in order to
+``sharpen'' the model to one placing large mass on high-quality sequences,
+thereby amortizing the expensive inference-time computation of generating good
+sequences. We begin by introducing a new statistical framework for sharpening
+in which the learner aims to sharpen a pre-trained base policy via sample
+access, and establish fundamental limits. Then we analyze two natural families
+of self-improvement algorithms based on SFT and RLHF. We find that (i) the
+SFT-based approach is minimax optimal whenever the initial model has sufficient
+coverage, but (ii) the RLHF-based approach can improve over SFT-based
+self-improvement by leveraging online exploration, bypassing the need for
+coverage. Finally, we empirically validate the sharpening mechanism via
+inference-time and amortization experiments. We view these findings as a
+starting point toward a foundational understanding that can guide the design
+and evaluation of self-improvement algorithms.
+
+
+
+
+
+ + ♻ ☆ OpenDriver: An Open-Road Driver State Detection Dataset + + +
+ Among numerous studies for driver state detection, wearable physiological
+measurements offer a practical method for real-time monitoring. However, there
+are few driver physiological datasets in open-road scenarios, and the existing
+datasets suffer from issues such as poor signal quality, small sample sizes,
+and short data collection periods. Therefore, in this paper, a large-scale
+multimodal driving dataset, OpenDriver, for driver state detection is
+developed. The OpenDriver encompasses a total of 3,278 driving trips, with a
+signal collection duration spanning approximately 4,600 hours. Two modalities
+of driving signals are enrolled in OpenDriver: electrocardiogram (ECG) signals
+and six-axis motion data of the steering wheel from an inertial measurement unit
+(IMU), which were recorded from 81 drivers and their vehicles. Furthermore,
+three challenging tasks are involved in our work, namely ECG signal quality
+assessment, individual biometric identification based on ECG signals, and
+physiological signal analysis in complex driving environments. To facilitate
+research in these tasks, corresponding benchmarks have also been introduced.
+First, a noisy augmentation strategy is applied to generate a larger-scale ECG
+signal dataset with realistic noise simulation for quality assessment. Second,
+an end-to-end contrastive learning framework is employed for individual
+biometric identification. Finally, a comprehensive analysis of drivers' HRV
+features under different driving conditions is conducted. Each benchmark
+provides evaluation metrics and reference results. The OpenDriver dataset will
+be publicly available at https://github.com/bdne/OpenDriver.
+
+
+
+ comment: Considering that there are flaws in the statistical data of the + dataset, all the authors agreed to withdraw the manuscript +
+
+
+
+
+ + ♻ ☆ Optimizing Container Loading and Unloading through Dual-Cycling and + Dockyard Rehandle Reduction Using a Hybrid Genetic Algorithm + + +
+ This paper addresses the optimization of container unloading and loading
+operations at ports, integrating quay-crane dual-cycling with dockyard rehandle
+minimization. We present a unified model encompassing both operations: one is
+ship container unloading and loading by quay crane, and the other is reducing
+dockyard rehandles while loading the ship. We recognize that optimizing one
+aspect in isolation can lead to suboptimal outcomes due to interdependencies.
+Specifically, optimizing unloading sequences for minimal operation time may
+inadvertently increase dockyard rehandles during loading and vice versa. To
+address this NP-hard problem, we propose a hybrid genetic algorithm (GA)
+QCDC-DR-GA comprising one-dimensional and two-dimensional GA components. Our
+model, QCDC-DR-GA, consistently outperforms four state-of-the-art methods in
+maximizing dual cycles and minimizing dockyard rehandles. Compared to those
+methods, it reduced 15-20% of total operation time for large vessels.
+Statistical validation through a two-tailed paired t-test confirms the
+superiority of QCDC-DR-GA at a 5% significance level. The approach effectively
+combines QCDC optimization with dockyard rehandle minimization, optimizing the
+total unloading-loading time. Results underscore the inefficiency of separately
+optimizing QCDC and dockyard rehandles. Fragmented approaches, such as QCDC
+Scheduling Optimized by bi-level GA and GA-ILSRS (Scenario 2), show limited
+improvement compared to QCDC-DR-GA. As in GA-ILSRS (Scenario 1), neglecting
+dual-cycle optimization leads to inferior performance than QCDC-DR-GA. This
+emphasizes the necessity of simultaneously considering both aspects for optimal
+resource utilization and overall operational efficiency.
+
+
+
+
+
+
+ + ♻ ☆ How to Segment in 3D Using 2D Models: Automated 3D Segmentation of + Prostate Cancer Metastatic Lesions on PET Volumes Using Multi-angle Maximum + Intensity Projections and Diffusion Models MICCAI + + +
+ Prostate specific membrane antigen (PSMA) positron emission +tomography/computed tomography (PET/CT) imaging provides a tremendously +exciting frontier in visualization of prostate cancer (PCa) metastatic lesions. +However, accurate segmentation of metastatic lesions is challenging due to low +signal-to-noise ratios and variable sizes, shapes, and locations of the +lesions. This study proposes a novel approach for automated segmentation of +metastatic lesions in PSMA PET/CT 3D volumetric images using 2D denoising +diffusion probabilistic models (DDPMs). Instead of 2D trans-axial slices or 3D +volumes, the proposed approach segments the lesions on generated multi-angle +maximum intensity projections (MA-MIPs) of the PSMA PET images, then obtains +the final 3D segmentation masks from 3D ordered subset expectation maximization +(OSEM) reconstruction of 2D MA-MIPs segmentations. Our proposed method achieved +superior performance compared to state-of-the-art 3D segmentation approaches in +terms of accuracy and robustness in detecting and segmenting small metastatic +PCa lesions. The proposed method has significant potential as a tool for +quantitative analysis of metastatic burden in PCa patients. + +
+
+ comment: 11 pages, 2 figures, accepted in the DGM4MICCAI workshop, MICCAI, + 2024 +
+
+
+
+
+ + ♻ ☆ Segmentation-Free Outcome Prediction from Head and Neck Cancer PET/CT + Images: Deep Learning-Based Feature Extraction from Multi-Angle Maximum + Intensity Projections (MA-MIPs) + + +
+ We introduce an innovative, simple, effective segmentation-free approach for
+outcome prediction in head \& neck cancer (HNC) patients. By harnessing deep
+learning-based feature extraction techniques and multi-angle maximum intensity
+projections (MA-MIPs) applied to Fluorodeoxyglucose Positron Emission
+Tomography (FDG-PET) volumes, our proposed method eliminates the need for
+manual segmentations of regions-of-interest (ROIs) such as primary tumors and
+involved lymph nodes. Instead, a state-of-the-art object detection model is
+trained to perform automatic cropping of the head and neck region on the PET
+volumes. A pre-trained deep convolutional neural network backbone is then
+utilized to extract deep features from MA-MIPs obtained from 72 multi-angle
+axial rotations of the cropped PET volumes. These deep features extracted from
+multiple projection views of the PET volumes are then aggregated and fused, and
+employed to perform recurrence-free survival analysis on a cohort of 489 HNC
+patients. The proposed approach outperforms the best performing method on the
+target dataset for the task of recurrence-free survival analysis. By
+circumventing the manual delineation of the malignancies on the FDG PET-CT
+images, our approach eliminates the dependency on subjective interpretations
+and highly enhances the reproducibility of the proposed survival analysis
+method.
+
+
+
+ comment: 15 pages, 4 tables, 4 figures. Published in Cancers 2024, Volume 16, + Issue 14, page 2538 +
+
+
+
+
+ + ♻ ☆ The Reality of AI and Biorisk + + +
+ To accurately and confidently answer the question 'could an AI model or +system increase biorisk', it is necessary to have both a sound theoretical +threat model for how AI models or systems could increase biorisk and a robust +method for testing that threat model. This paper provides an analysis of +existing available research surrounding two AI and biorisk threat models: 1) +access to information and planning via large language models (LLMs), and 2) the +use of AI-enabled biological tools (BTs) in synthesizing novel biological +artifacts. We find that existing studies around AI-related biorisk are nascent, +often speculative in nature, or limited in terms of their methodological +maturity and transparency. The available literature suggests that current LLMs +and BTs do not pose an immediate risk, and more work is needed to develop +rigorous approaches to understanding how future models could increase biorisks. +We end with recommendations about how empirical work can be expanded to more +precisely target biorisk and ensure rigor and validity of findings. + +
+
+ comment: Updated to correct author affiliations +
+
+
+
+
+ + ♻ ☆ D-Wave's Nonlinear-Program Hybrid Solver: Description and Performance + Analysis + + +
+ The development of advanced quantum-classical algorithms is among the most +prominent strategies in quantum computing. Numerous hybrid solvers have been +introduced recently. Many of these methods are created ad hoc to address +specific use cases. However, several well-established schemes are frequently +utilized to address optimization problems. In this context, D-Wave launched the +Hybrid Solver Service in 2020, offering a portfolio of methods designed to +accelerate time-to-solution for users aiming to optimize performance and +operational processes. Recently, a new technique has been added to this +portfolio: the Nonlinear-Program Hybrid Solver. This paper describes this +solver and evaluates its performance through a benchmark of 45 instances across +three combinatorial optimization problems: the Traveling Salesman Problem, the +Knapsack Problem, and the Maximum Cut Problem. To facilitate the use of this +relatively unexplored solver, we provide details of the implementation used to +solve these three optimization problems. + +
+
+ comment: 13 pages, 9 figures and 7 tables +
+
+
+
+
+ + ♻ ☆ Explainable fault and severity classification for rolling element + bearings using Kolmogorov-Arnold networks + + +
+ Rolling element bearings are critical components of rotating machinery, with +their performance directly influencing the efficiency and reliability of +industrial systems. At the same time, bearing faults are a leading cause of +machinery failures, often resulting in costly downtime, reduced productivity, +and, in extreme cases, catastrophic damage. This study presents a methodology +that utilizes Kolmogorov-Arnold Networks to address these challenges through +automatic feature selection, hyperparameter tuning and interpretable fault +analysis within a unified framework. By training shallow network architectures +and minimizing the number of selected features, the framework produces +lightweight models that deliver explainable results through feature attribution +and symbolic representations of their activation functions. Validated on two +widely recognized datasets for bearing fault diagnosis, the framework achieved +perfect F1-Scores for fault detection and high performance in fault and +severity classification tasks, including 100% F1-Scores in most cases. Notably, +it demonstrated adaptability by handling diverse fault types, such as imbalance +and misalignment, within the same dataset. The symbolic representations +enhanced model interpretability, while feature attribution offered insights +into the optimal feature types or signals for each studied task. These results +highlight the framework's potential for practical applications, such as +real-time machinery monitoring, and for scientific research requiring efficient +and explainable models. + +
+
+
+
+
+ + ♻ ☆ Time-Reversal Provides Unsupervised Feedback to LLMs NeurIPS 2024 + + +
+ Large Language Models (LLMs) are typically trained to predict in the forward +direction of time. However, recent works have shown that prompting these models +to look back and critique their own generations can produce useful feedback. +Motivated by this, we explore the question of whether LLMs can be empowered to +think (predict and score) backwards to provide unsupervised feedback that +complements forward LLMs. Towards this, we introduce Time Reversed Language +Models (TRLMs), which can score and generate queries when conditioned on +responses, effectively functioning in the reverse direction of time. Further, +to effectively infer in the response to query direction, we pre-train and +fine-tune a language model (TRLM-Ba) in the reverse token order from scratch. +We show empirically (and theoretically in a stylized setting) that +time-reversed models can indeed complement forward model predictions when used +to score the query given response for re-ranking multiple forward generations. +We obtain up to 5\% improvement on the widely used AlpacaEval Leaderboard over +the competent baseline of best-of-N re-ranking using self log-perplexity +scores. We further show that TRLM scoring outperforms conventional forward +scoring of response given query, resulting in significant gains in applications +such as citation generation and passage retrieval. We next leverage the +generative ability of TRLM to augment or provide unsupervised feedback to input +safety filters of LLMs, demonstrating a drastic reduction in false negative +rate with negligible impact on false positive rates against several attacks +published on the popular JailbreakBench leaderboard. + +
+
+ comment: Accepted as a spotlight in NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Multi-Sensor Diffusion-Driven Optical Image Translation for Large-Scale + Applications + + +
+ Comparing images captured by disparate sensors is a common challenge in +remote sensing. This requires image translation -- converting imagery from one +sensor domain to another while preserving the original content. Denoising +Diffusion Implicit Models (DDIM) are potential state-of-the-art solutions for +such domain translation due to their proven superiority in multiple +image-to-image translation tasks in computer vision. However, these models +struggle with reproducing radiometric features of large-scale multi-patch +imagery, resulting in inconsistencies across the full image. This renders +downstream tasks like Heterogeneous Change Detection impractical. To overcome +these limitations, we propose a method that leverages denoising diffusion for +effective multi-sensor optical image translation over large areas. Our approach +super-resolves large-scale low spatial resolution images into high-resolution +equivalents from disparate optical sensors, ensuring uniformity across hundreds +of patches. Our contributions lie in new forward and reverse diffusion +processes that address the challenges of large-scale image translation. +Extensive experiments using paired Sentinel-II (10m) and Planet Dove (3m) +images demonstrate that our approach provides precise domain adaptation, +preserving image content while improving radiometric accuracy and feature +representation. A thorough image quality assessment and comparisons with the +standard DDIM framework and five other leading methods are presented. We reach +a mean Learned Perceptual Image Patch Similarity (mLPIPS) of 0.1884 and a +Fr\'echet Inception Distance (FID) of 45.64, expressively outperforming all +compared methods, including DDIM, ShuffleMixer, and SwinIR. The usefulness of +our approach is further demonstrated in two Heterogeneous Change Detection +tasks. + +
+
+ comment: This is the accepted version of the manuscript published in IEEE + Journal of Selected Topics in Applied Earth Observations and Remote Sensing + (JSTARS). Please access the final version at IEEEXplore (Open Access). DOI + 10.1109/JSTARS.2024.3506032. This technology is protected by a patent filed + on 23 december 2023 at Office Luxembourgeois de la propri\'et\'e + intellectuelle (LU505861) +
+
+
+
+
+ + ♻ ☆ The Cooperative Network Architecture: Learning Structured Networks as + Representation of Sensory Patterns + + +
+ Nets, cooperative networks of neurons, have been proposed as format for the +representation of sensory signals, as physical implementation of the Gestalt +phenomenon and as solution to the neural binding problem, while the direct +interaction between nets by structure-sensitive matching has been proposed as +basis for object-global operations such as object detection. The nets are +flexibly composed of overlapping net fragments, which are learned from +statistical regularities of sensory input. We here present the cooperative +network architecture (CNA), a concrete model that learns such net structure to +represent input patterns and deals robustly with noise, deformation, and +out-of-distribution data, thus laying the groundwork for a novel neural +architecture. + +
+
+
+
+
+ + ♻ ☆ Local Lesion Generation is Effective for Capsule Endoscopy Image Data + Augmentation in a Limited Data Setting + + +
+ Limited medical imaging datasets challenge deep learning models by increasing +risks of overfitting and reduced generalization, particularly in Generative +Adversarial Networks (GANs), where discriminators may overfit, leading to +training divergence. This constraint also impairs classification models trained +on small datasets. Generative Data Augmentation (GDA) addresses this by +expanding training datasets with synthetic data, although it requires training +a generative model. We propose and evaluate two local lesion generation +approaches to address the challenge of augmenting small medical image datasets. +The first approach employs the Poisson Image Editing algorithm, a classical +image processing technique, to create realistic image composites that +outperform current state-of-the-art methods. The second approach introduces a +novel generative method, leveraging a fine-tuned Image Inpainting GAN to +synthesize realistic lesions within specified regions of real training images. +A comprehensive comparison of the two proposed methods demonstrates that +effective local lesion generation in a data-constrained setting allows for +reaching new state-of-the-art results in capsule endoscopy lesion +classification. Combination of our techniques achieves a macro F1-score of +33.07%, surpassing the previous best result by 7.84 percentage points (p.p.) on +the highly imbalanced Kvasir Capsule Dataset, a benchmark for capsule +endoscopy. To the best of our knowledge, this work is the first to apply a +fine-tuned Image Inpainting GAN for GDA in medical imaging, demonstrating that +an image-conditional GAN can be adapted effectively to limited datasets to +generate high-quality examples, facilitating effective data augmentation. +Additionally, we show that combining this GAN-based approach with classical +image processing techniques further improves the results. + +
+
+ comment: 54 pages, 35 figures +
+
+
+
+
+ + ♻ GWQ: Gradient-Aware Weight Quantization for Large Language Models + + +
+ Large language models (LLMs) show impressive performance in solving complex
+language tasks. However, their large number of parameters presents significant
+challenges for the deployment and application of the model on edge devices.
+Compressing large language models to low bits can enable them to run on
+resource-constrained devices, often leading to performance degradation. To
+address this problem, we propose gradient-aware weight quantization (GWQ), the
+first quantization approach for low-bit weight quantization that leverages
+gradients to localize outliers, requiring only a minimal amount of calibration
+data for outlier detection. GWQ retains the weights corresponding to the top 1%
+outliers preferentially at FP16 precision, while the remaining non-outlier
+weights are stored in a low-bit format. GWQ found experimentally that utilizing
+the sensitive weights in the gradient localization model is more scientific
+compared to utilizing the sensitive weights in the Hessian matrix localization
+model. Compared to current quantization methods, GWQ can be applied to multiple
+language models and achieves lower PPL on the WikiText2 and C4 dataset. In the
+zero-shot task, GWQ quantized models have higher accuracy compared to other
+quantization methods. GWQ is also suitable for multimodal model quantization,
+and the quantized Qwen-VL family model is more accurate than other methods. On
+the zero-shot target detection task dataset RefCOCO, GWQ outperforms the
+current state-of-the-art method SPQR. GWQ achieves 1.2 times inference speedup
+in comparison to the original model, and effectively reduces the inference
+memory.
+
+
+
+
+
+
+ + ♻ ☆ DualAD: Dual-Layer Planning for Reasoning in Autonomous Driving + + +
+ We present a novel autonomous driving framework, DualAD, designed to imitate +human reasoning during driving. DualAD comprises two layers: a rule-based +motion planner at the bottom layer that handles routine driving tasks requiring +minimal reasoning, and an upper layer featuring a rule-based text encoder that +converts driving scenarios from absolute states into text description. This +text is then processed by a large language model (LLM) to make driving +decisions. The upper layer intervenes in the bottom layer's decisions when +potential danger is detected, mimicking human reasoning in critical situations. +Closed-loop experiments demonstrate that DualAD, using a zero-shot pre-trained +model, significantly outperforms rule-based motion planners that lack reasoning +abilities. Our experiments also highlight the effectiveness of the text +encoder, which considerably enhances the model's scenario understanding. +Additionally, the integrated DualAD model improves with stronger LLMs, +indicating the framework's potential for further enhancement. Code and +benchmarks are available at github.com/TUM-AVS/DualAD. + +
+
+ comment: Autonomous Driving, Large Language Models (LLMs), Human Reasoning, + Critical Scenario +
+
+
+
+
+ + ♻ ☆ Leveraging LLMs for On-the-Fly Instruction Guided Image Editing + + +
+ The combination of language processing and image processing keeps attracting +increased interest given recent impressive advances that leverage the combined +strengths of both domains of research. Among these advances, the task of +editing an image on the basis solely of a natural language instruction stands +out as a most challenging endeavour. While recent approaches for this task +resort, in one way or other, to some form of preliminary preparation, training +or fine-tuning, this paper explores a novel approach: We propose a +preparation-free method that permits instruction-guided image editing on the +fly. This approach is organized along three steps properly orchestrated that +resort to image captioning and DDIM inversion, followed by obtaining the edit +direction embedding, followed by image editing proper. While dispensing with +preliminary preparation, our approach demonstrates to be effective and +competitive, outperforming recent, state of the art models for this task when +evaluated on the MAGICBRUSH dataset. + +
+
+
+
+
+ + ♻ ☆ Elephants Never Forget: Memorization and Learning of Tabular Data in + Large Language Models + + +
+ While many have shown how Large Language Models (LLMs) can be applied to a +diverse set of tasks, the critical issues of data contamination and +memorization are often glossed over. In this work, we address this concern for +tabular data. Specifically, we introduce a variety of different techniques to +assess whether a language model has seen a tabular dataset during training. +This investigation reveals that LLMs have memorized many popular tabular +datasets verbatim. We then compare the few-shot learning performance of LLMs on +datasets that were seen during training to the performance on datasets released +after training. We find that LLMs perform better on datasets seen during +training, indicating that memorization leads to overfitting. At the same time, +LLMs show non-trivial performance on novel datasets and are surprisingly robust +to data transformations. We then investigate the in-context statistical +learning abilities of LLMs. While LLMs are significantly better than random at +solving statistical classification problems, the sample efficiency of few-shot +learning lags behind traditional statistical learning algorithms, especially as +the dimension of the problem increases. This suggests that much of the observed +few-shot performance on novel real-world datasets is due to the LLM's world +knowledge. Overall, our results highlight the importance of testing whether an +LLM has seen an evaluation dataset during pre-training. We release the +https://github.com/interpretml/LLM-Tabular-Memorization-Checker Python package +to test LLMs for memorization of tabular datasets. + +
+
+ comment: COLM camera ready, fix typo +
+
+
+
+
+ + ♻ ☆ Knowledge Mechanisms in Large Language Models: A Survey and Perspective EMNLP 2024 + + +
+ Understanding knowledge mechanisms in Large Language Models (LLMs) is crucial +for advancing towards trustworthy AGI. This paper reviews knowledge mechanism +analysis from a novel taxonomy including knowledge utilization and evolution. +Knowledge utilization delves into the mechanism of memorization, comprehension +and application, and creation. Knowledge evolution focuses on the dynamic +progression of knowledge within individual and group LLMs. Moreover, we discuss +what knowledge LLMs have learned, the reasons for the fragility of parametric +knowledge, and the potential dark knowledge (hypothesis) that will be +challenging to address. We hope this work can help understand knowledge in LLMs +and provide insights for future research. + +
+
+ comment: EMNLP 2024 Findings; 39 pages (v4) +
+
+
+
+
+ + ♻ ☆ Enhancing Perception Capabilities of Multimodal LLMs with Training-Free + Fusion + + +
+ Multimodal LLMs (MLLMs) equip language models with visual capabilities by +aligning vision encoders with language models. Existing methods to enhance the +visual perception of MLLMs often involve designing more powerful vision +encoders, which requires exploring a vast design space and re-aligning each +potential encoder with the language model, resulting in prohibitively high +training costs. In this paper, we introduce VisionFuse, a novel integration +framework that efficiently utilizes multiple vision encoders from off-the-shelf +MLLMs to enhance visual perception without requiring additional training. Our +approach is motivated by the observation that different MLLMs tend to focus on +distinct regions given the same query and image. Moreover, we find that the +feature distributions of vision encoders within an MLLM family, a group of +MLLMs sharing the same pretrained LLM, are highly aligned. Building on these +insights, VisionFuse enriches the visual context by concatenating the tokens +generated by the vision encoders of selected MLLMs within a family. By merging +the parameters of language models from these MLLMs, VisionFuse allows a single +language model to align with various vision encoders, significantly reducing +deployment overhead. We conduct comprehensive evaluations across multiple +multimodal benchmarks using various MLLM combinations, demonstrating +substantial improvements in multimodal tasks. Notably, when integrating +MiniGemini-8B and SLIME-8B, VisionFuse achieves an average performance increase +of over 4%. + +
+
+
+
+
+ + ♻ ☆ Facility Location Games with Scaling Effects AAMAS-24 + + +
+ We take the classic facility location problem and consider a variation, in +which each agent's individual cost function is equal to their distance from the +facility multiplied by a scaling factor which is determined by the facility +placement. In addition to the general class of continuous scaling functions, we +also provide results for piecewise linear scaling functions which can +effectively approximate or model the scaling of many real world scenarios. We +focus on the objectives of total and maximum cost, describing the computation +of the optimal solution. We then move to the approximate mechanism design +setting, observing that the agents' preferences may no longer be single-peaked. +Consequently, we characterize the conditions on scaling functions which ensure +that agents have single-peaked preferences. Under these conditions, we find a +characterization of continuous, strategyproof, and anonymous mechanisms, and +compute the total and maximum cost approximation ratios achievable by these +mechanisms. + +
+
+ comment: This is an updated version of the paper which appeared at the 23rd + International Conference on Autonomous Agents and Multi-Agent Systems + (AAMAS-24) +
+
+
+
+
+ + ♻ ☆ FLOAT: Generative Motion Latent Flow Matching for Audio-driven Talking + Portrait + + +
+ With the rapid advancement of diffusion-based generative models, portrait +image animation has achieved remarkable results. However, it still faces +challenges in temporally consistent video generation and fast sampling due to +its iterative sampling nature. This paper presents FLOAT, an audio-driven +talking portrait video generation method based on flow matching generative +model. We shift the generative modeling from the pixel-based latent space to a +learned motion latent space, enabling efficient design of temporally consistent +motion. To achieve this, we introduce a transformer-based vector field +predictor with a simple yet effective frame-wise conditioning mechanism. +Additionally, our method supports speech-driven emotion enhancement, enabling a +natural incorporation of expressive motions. Extensive experiments demonstrate +that our method outperforms state-of-the-art audio-driven talking portrait +methods in terms of visual quality, motion fidelity, and efficiency. + +
+
+ comment: Project page: https://deepbrainai-research.github.io/float/ +
+
+
+
+
+ + ♻ ☆ Adaptive Dense Reward: Understanding the Gap Between Action and Reward + Space in Alignment + + +
+ Reinforcement Learning from Human Feedback (RLHF) has proven highly effective +in aligning Large Language Models (LLMs) with human preferences. However, the +original RLHF typically optimizes under an overall reward, which can lead to a +suboptimal learning process. This limitation stems from RLHF's lack of +awareness regarding which specific tokens should be reinforced or suppressed. +Moreover, conflicts in supervision can arise, for instance, when a chosen +response includes erroneous tokens, while a rejected response contains accurate +elements. To rectify these shortcomings, increasing dense reward methods, such +as step-wise and token-wise RLHF, have been proposed. However, these existing +methods are limited to specific tasks (like mathematics). In this paper, we +propose the ``Adaptive Message-wise RLHF'' method, which robustly applies to +various tasks. By defining pivot tokens as key indicators, our approach +adaptively identifies essential information and converts sequence-level +supervision into fine-grained, subsequence-level supervision. This aligns the +density of rewards and action spaces more closely with the information density +of the input. Experiments demonstrate that our method can be integrated into +various training methods, significantly mitigating hallucinations and +catastrophic forgetting problems, while outperforming other methods on multiple +evaluation metrics. Our method improves the success rate on adversarial samples +by 10\% compared to the sample-wise approach, and achieves a 1.3\% improvement +on evaluation benchmarks such as MMLU, GSM8K, HumanEval, etc. + +
+
+
+
+
+ + ♻ ☆ "Moralized" Multi-Step Jailbreak Prompts: Black-Box Testing of + Guardrails in Large Language Models for Verbal Attacks + + +
+ As the application of large language models continues to expand in various
+fields, it poses higher challenges to the effectiveness of identifying harmful
+content generation and guardrail mechanisms. This research aims to evaluate the
+guardrail effectiveness of GPT-4o, Grok-2 Beta, Llama 3.1 (405B), Gemini 1.5,
+and Claude 3.5 Sonnet through black-box testing of seemingly ethical multi-step
+jailbreak prompts. It conducts ethical attacks by designing identical
+multi-step prompts that simulate the scenario of "corporate middle managers
+competing for promotions." The data results show that the guardrails of the
+above-mentioned LLMs were bypassed and the content of verbal attacks was
+generated. Claude 3.5 Sonnet's resistance to multi-step jailbreak prompts is
+more obvious. To ensure objectivity, the experimental process, black box test
+code, and enhanced guardrail code are uploaded to the GitHub repository:
+https://github.com/brucewang123456789/GeniusTrail.git.
+
+
+
+ comment: This paper has been submitted to Nature Machine Intelligence and + OpenReview preprints. It has 7 pages of text, 3 figures, and 3 tables +
+
+
+
+
+ + ♻ ☆ Provably Mitigating Overoptimization in RLHF: Your SFT Loss is + Implicitly an Adversarial Regularizer + + +
+ Aligning generative models with human preference via RLHF typically suffers +from overoptimization, where an imperfectly learned reward model can misguide +the generative model to output undesired responses. We investigate this problem +in a principled manner by identifying the source of the misalignment as a form +of distributional shift and uncertainty in learning human preferences. To +mitigate overoptimization, we first propose a theoretical algorithm that +chooses the best policy for an adversarially chosen reward model; one that +simultaneously minimizes the maximum likelihood estimation of the loss and a +reward penalty term. Here, the reward penalty term is introduced to prevent the +policy from choosing actions with spurious high proxy rewards, resulting in +provable sample efficiency of the algorithm under a partial coverage style +condition. Moving from theory to practice, the proposed algorithm further +enjoys an equivalent but surprisingly easy-to-implement reformulation. Using +the equivalence between reward models and the corresponding optimal policy, the +algorithm features a simple objective that combines: (i) a preference +optimization loss that directly aligns the policy with human preference, and +(ii) a supervised learning loss that explicitly imitates the policy with a +(suitable) baseline distribution. In the context of aligning large language +models (LLM), this objective fuses the direct preference optimization (DPO) +loss with the supervised fine-tuning (SFT) loss to help mitigate the +overoptimization towards undesired responses, for which we name the algorithm +Regularized Preference Optimization (RPO). Experiments of aligning LLMs +demonstrate the improved performance of RPO compared with DPO baselines. Our +work sheds light on the interplay between preference optimization and SFT in +tuning LLMs with both theoretical guarantees and empirical evidence. + +
+
+ comment: Accepted by The Thirty-Eighth Annual Conference on Neural Information + Processing Systems. 31 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ DEL-Ranking: Ranking-Correction Denoising Framework for Elucidating + Molecular Affinities in DNA-Encoded Libraries + + +
+ DNA-encoded library (DEL) screening has revolutionized the detection of +protein-ligand interactions through read counts, enabling rapid exploration of +vast chemical spaces. However, noise in read counts, stemming from nonspecific +interactions, can mislead this exploration process. We present DEL-Ranking, a +novel distribution-correction denoising framework that addresses these +challenges. Our approach introduces two key innovations: (1) a novel ranking +loss that rectifies relative magnitude relationships between read counts, +enabling the learning of causal features determining activity levels, and (2) +an iterative algorithm employing self-training and consistency loss to +establish model coherence between activity label and read count predictions. +Furthermore, we contribute three new DEL screening datasets, the first to +comprehensively include multi-dimensional molecular representations, +protein-ligand enrichment values, and their activity labels. These datasets +mitigate data scarcity issues in AI-driven DEL screening research. Rigorous +evaluation on diverse DEL datasets demonstrates DEL-Ranking's superior +performance across multiple correlation metrics, with significant improvements +in binding affinity prediction accuracy. Our model exhibits zero-shot +generalization ability across different protein targets and successfully +identifies potential motifs determining compound binding affinity. This work +advances DEL screening analysis and provides valuable resources for future +research in this area. + +
+
+
+
+
+ + ♻ ☆ DragText: Rethinking Text Embedding in Point-based Image Editing WACV 2025 + + +
+ Point-based image editing enables accurate and flexible control through +content dragging. However, the role of text embedding during the editing +process has not been thoroughly investigated. A significant aspect that remains +unexplored is the interaction between text and image embeddings. During the +progressive editing in a diffusion model, the text embedding remains constant. +As the image embedding increasingly diverges from its initial state, the +discrepancy between the image and text embeddings presents a significant +challenge. In this study, we found that the text prompt significantly +influences the dragging process, particularly in maintaining content integrity +and achieving the desired manipulation. Upon these insights, we propose +DragText, which optimizes text embedding in conjunction with the dragging +process to pair with the modified image embedding. Simultaneously, we +regularize the text optimization process to preserve the integrity of the +original text prompt. Our approach can be seamlessly integrated with existing +diffusion-based drag methods, enhancing performance with only a few lines of +code. + +
+
+ comment: Accepted at WACV 2025; Code is released at + https://github.com/MICV-yonsei/DragText +
+
+
+
+
+ + ♻ ☆ IMWA: Iterative Model Weight Averaging Benefits Class-Imbalanced + Learning Tasks + + +
+ Model Weight Averaging (MWA) is a technique that seeks to enhance model's +performance by averaging the weights of multiple trained models. This paper +first empirically finds that 1) the vanilla MWA can benefit the +class-imbalanced learning, and 2) performing model averaging in the early +epochs of training yields a greater performance improvement than doing that in +later epochs. Inspired by these two observations, in this paper we propose a +novel MWA technique for class-imbalanced learning tasks named Iterative Model +Weight Averaging (IMWA). Specifically, IMWA divides the entire training stage +into multiple episodes. Within each episode, multiple models are concurrently +trained from the same initialized model weight, and subsequently averaged into +a singular model. Then, the weight of this average model serves as a fresh +initialization for the ensuing episode, thus establishing an iterative learning +paradigm. Compared to vanilla MWA, IMWA achieves higher performance +improvements with the same computational cost. Moreover, IMWA can further +enhance the performance of those methods employing EMA strategy, demonstrating +that IMWA and EMA can complement each other. Extensive experiments on various +class-imbalanced learning tasks, i.e., class-imbalanced image classification, +semi-supervised class-imbalanced image classification and semi-supervised +object detection tasks showcase the effectiveness of our IMWA. + +
+
+
+
+
+ + ♻ ☆ Preserve or Modify? Context-Aware Evaluation for Balancing Preservation + and Modification in Text-Guided Image Editing + + +
+ The development of vision-language and generative models has significantly +advanced text-guided image editing, which seeks the \textit{preservation} of +core elements in the source image while implementing \textit{modifications} +based on the target text. However, existing metrics have a +\textbf{context-blindness} problem, indiscriminately applying the same +evaluation criteria on completely different pairs of source image and target +text, biasing towards either modification or preservation. Directional CLIP +similarity, the only metric that considers both source image and target text, +is also biased towards modification aspects and attends to irrelevant editing +regions of the image. We propose \texttt{AugCLIP}, a \textbf{context-aware} +metric that adaptively coordinates preservation and modification aspects, +depending on the specific context of a given source image and target text. This +is done by deriving the CLIP representation of an ideally edited image, that +preserves the source image with necessary modifications to align with target +text. More specifically, using a multi-modal large language model, +\texttt{AugCLIP} augments the textual descriptions of the source and target, +then calculates a modification vector through a hyperplane that separates +source and target attributes in CLIP space. Extensive experiments on five +benchmark datasets, encompassing a diverse range of editing scenarios, show +that \texttt{AugCLIP} aligns remarkably well with human evaluation standards, +outperforming existing metrics. The code will be open-sourced for community +use. + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ One Initialization to Rule them All: Fine-tuning via Explained Variance + Adaptation + + +
+ Foundation models (FMs) are pre-trained on large-scale datasets and then +fine-tuned on a downstream task for a specific application. The most successful +and most commonly used fine-tuning method is to update the pre-trained weights +via a low-rank adaptation (LoRA). LoRA introduces new weight matrices that are +usually initialized at random with a uniform rank distribution across the model +weights. Recent works focus on different initialization schemes or the learning +of adaptive ranks during fine-tuning. Both approaches have only been +investigated in isolation, resulting in slow convergence or a uniform rank +distribution, in turn leading to suboptimal performance. We propose to improve +LoRA by initializing the new weights in a data-driven manner by computing +singular value decomposition (SVD) on minibatches of activation vectors. Then, +we initialize the LoRA matrices with the obtained right-singular vectors and +redistribute ranks among all weight matrices to provably store the maximum +amount of information of the downstream data in the newly introduced weights. +In this way, only what information to maintain or neglect during the +fine-tuning process needs to be learned. We call our new method Explained +Variance Adaptation (EVA). We apply EVA to a variety of fine-tuning tasks +ranging from language generation and understanding to image classification and +reinforcement learning. EVA exhibits faster convergence than competitors and +achieves the highest average score across a multitude of tasks per domain while +reducing the number of trainable parameters through rank redistribution. + +
+
+ comment: 11 pages + references and appendix, code available at + https://github.com/ml-jku/EVA +
+
+
+
+
+ + ♻ ☆ CryoFM: A Flow-based Foundation Model for Cryo-EM Densities + + +
+ Cryo-electron microscopy (cryo-EM) is a powerful technique in structural
+biology and drug discovery, enabling the study of biomolecules at high
+resolution. Significant advancements by structural biologists using cryo-EM
+have led to the production of over 38,626 protein density maps at various
+resolutions. However, cryo-EM data processing algorithms have yet to fully
+benefit from our knowledge of biomolecular density maps, with only a few recent
+models being data-driven but limited to specific tasks. In this study, we
+present CryoFM, a foundation model designed as a generative model, learning the
+distribution of high-quality density maps and generalizing effectively to
+downstream tasks. Built on flow matching, CryoFM is trained to accurately
+capture the prior distribution of biomolecular density maps. Furthermore, we
+introduce a flow posterior sampling method that leverages CryoFM as a flexible
+prior for several downstream tasks in cryo-EM and cryo-electron tomography
+(cryo-ET) without the need for fine-tuning, achieving state-of-the-art
+performance on most tasks and demonstrating its potential as a foundational
+model for broader applications in these fields.
+
+
+
+
+
+ + ♻ ☆ A Comparative Study of LLM-based ASR and Whisper in Low Resource and + Code Switching Scenario + + +
+ Large Language Models (LLMs) have showcased exceptional performance across +diverse NLP tasks, and their integration with speech encoder is rapidly +emerging as a dominant trend in the Automatic Speech Recognition (ASR) field. +Previous works mainly concentrated on leveraging LLMs for speech recognition in +English and Chinese. However, their potential for addressing speech recognition +challenges in low resource settings remains underexplored. Hence, in this work, +we aim to explore the capability of LLMs in low resource ASR and +Mandarin-English code switching ASR. We also evaluate and compare the +recognition performance of LLM-based ASR systems against Whisper model. +Extensive experiments demonstrate that LLM-based ASR yields a relative gain of +12.8\% over the Whisper model in low resource ASR while Whisper performs better +in Mandarin-English code switching ASR. We hope that this study could shed +light on ASR for low resource scenarios. + +
+
+ comment: This work hasn't been finished yet +
+
+
+
+
+ + ♻ ☆ LLMs Do Not Think Step-by-step In Implicit Reasoning + + +
+ It has been well-known that Chain-of-Thought can remarkably enhance LLMs'
+performance on complex tasks. However, because it also introduces slower
+inference speeds and higher computational costs, many studies have attempted
+to use implicit CoT, which does not need LLMs to explicitly generate the
+intermediate steps. But there is still a gap between their efficacy and typical
+explicit CoT methods. This raises a question: is implicit CoT really
+equivalent to explicit CoT? Therefore, in this study, we address this question
+through experiments. We probe the information of intermediate steps from the
+model's hidden states when it is performing implicit CoT. The results
+surprisingly indicate that LLMs hardly think about intermediate steps,
+suggesting they may just rely on experience rather than strict step-by-step
+reasoning. Moreover, we find LLMs' implicit reasoning capabilities are
+susceptible and unstable, reaffirming the necessity of explicit CoT to
+effectively support complex tasks.
+
+
+
+
+
+ + ♻ ☆ Scaling Image Tokenizers with Grouped Spherical Quantization + + +
+ Vision tokenizers have gained a lot of attention due to their scalability
+and compactness; previous works depend on old-school GAN-based hyperparameters,
+biased comparisons, and a lack of comprehensive analysis of the scaling
+behaviours. To tackle those issues, we introduce Grouped Spherical Quantization
+(GSQ), featuring spherical codebook initialization and lookup regularization to
+constrain codebook latent to a spherical surface. Our empirical analysis of
+image tokenizer training strategies demonstrates that GSQ-GAN achieves superior
+reconstruction quality over state-of-the-art methods with fewer training
+iterations, providing a solid foundation for scaling studies. Building on this,
+we systematically examine the scaling behaviours of GSQ, specifically in latent
+dimensionality, codebook size, and compression ratios, and their impact on
+model performance. Our findings reveal distinct behaviours at high and low
+spatial compression levels, underscoring challenges in representing
+high-dimensional latent spaces. We show that GSQ can restructure
+high-dimensional latent into compact, low-dimensional spaces, thus enabling
+efficient scaling with improved quality. As a result, GSQ-GAN achieves a 16x
+down-sampling with a reconstruction FID (rFID) of 0.50.
+
+
+
+
+
+ + ♻ ☆ Intelligent Spark Agents: A Modular LangGraph Framework for Scalable, + Visualized, and Enhanced Big Data Machine Learning Workflows + + +
+ This paper introduces a visual process modeling tool for AI and machine +learning in big data contexts, utilizing the LangGraph framework to construct +intelligent Spark agents. The tool represents key machine learning stages - +data preprocessing, feature engineering, model training, and evaluation - as +modular components. Analysts can visually design workflows, which are then +automatically translated into optimized Spark code for execution. This approach +simplifies the complexity of Apache Spark, reduces the learning curve +associated with Scala, and enhances code reusability. The paper discusses the +theoretical foundations, key technologies, and evaluates the effectiveness of +the proposed solution. + +
+
+
+
+
+ + ♻ ☆ Rethinking Spectral Augmentation for Contrast-based Graph + Self-Supervised Learning + + +
+ The recent surge in contrast-based graph self-supervised learning has +prominently featured an intensified exploration of spectral cues. Spectral +augmentation, which involves modifying a graph's spectral properties such as +eigenvalues or eigenvectors, is widely believed to enhance model performance. +However, an intriguing paradox emerges, as methods grounded in seemingly +conflicting assumptions regarding the spectral domain demonstrate notable +enhancements in learning performance. Through extensive empirical studies, we +find that simple edge perturbations - random edge dropping for node-level and +random edge adding for graph-level self-supervised learning - consistently +yield comparable or superior performance while being significantly more +computationally efficient. This suggests that the computational overhead of +sophisticated spectral augmentations may not justify their practical benefits. +Our theoretical analysis of the InfoNCE loss bounds for shallow GNNs further +supports this observation. The proposed insights represent a significant leap +forward in the field, potentially refining the understanding and implementation +of graph self-supervised learning. + +
+
+
+
+
+ + ♻ ☆ Chain-of-Restoration: Multi-Task Image Restoration Models are Zero-Shot + Step-by-Step Universal Image Restorers + + +
+ Although previous image restoration (IR) methods have often concentrated on
+isolated degradations, recent research has increasingly focused on addressing
+composite degradations involving a complex combination of multiple isolated
+degradations. However, current IR methods for composite degradations require
+building training data that contain an exponential number of possible
+degradation combinations, which brings in a significant burden. To alleviate
+this issue, this paper proposes a new task setting, i.e. Universal Image
+Restoration (UIR). Specifically, UIR doesn't require training on all the
+degradation combinations but only on a set of degradation bases and then
+removing any degradation that these bases can potentially compose in a
+zero-shot manner. Inspired by the Chain-of-Thought that prompts large language
+models (LLMs) to address problems step-by-step, we propose the
+Chain-of-Restoration (CoR) mechanism, which instructs models to remove unknown
+composite degradations step-by-step. By integrating a simple Degradation
+Discriminator into pre-trained multi-task models, CoR facilitates the process
+where models remove one degradation basis per step, continuing this process
+until the image is fully restored from the unknown composite degradation.
+Extensive experiments show that CoR can significantly improve model performance
+in removing composite degradations, achieving comparable or better results than
+those state-of-the-art (SoTA) methods trained on all degradations.
+
+
+ comment: code: https://github.com/toummHus/Chain-of-Restoration +
+
+
+
+
+ + ♻ ☆ Exploring Machine Learning Models for Lung Cancer Level Classification: + A comparative ML Approach + + +
+ This paper explores machine learning (ML) models for classifying lung cancer +levels to improve diagnostic accuracy and prognosis. Through parameter tuning +and rigorous evaluation, we assess various ML algorithms. Techniques like +minimum child weight and learning rate monitoring were used to reduce +overfitting and optimize performance. Our findings highlight the robust +performance of Deep Neural Network (DNN) models across all phases. Ensemble +methods, including voting and bagging, also showed promise in enhancing +predictive accuracy and robustness. However, Support Vector Machine (SVM) +models with the Sigmoid kernel faced challenges, indicating a need for further +refinement. Overall, our study provides insights into ML-based lung cancer +classification, emphasizing the importance of parameter tuning to optimize +model performance and improve diagnostic accuracy in oncological care. + +
+
+
+
+
+ + ♻ ☆ Exploring Homogeneous and Heterogeneous Consistent Label Associations + for Unsupervised Visible-Infrared Person ReID + + +
+ Unsupervised visible-infrared person re-identification (USL-VI-ReID) +endeavors to retrieve pedestrian images of the same identity from different +modalities without annotations. While prior work focuses on establishing +cross-modality pseudo-label associations to bridge the modality-gap, they +ignore maintaining the instance-level homogeneous and heterogeneous consistency +between the feature space and the pseudo-label space, resulting in coarse +associations. In response, we introduce a Modality-Unified Label Transfer +(MULT) module that simultaneously accounts for both homogeneous and +heterogeneous fine-grained instance-level structures, yielding high-quality +cross-modality label associations. It models both homogeneous and heterogeneous +affinities, leveraging them to quantify the inconsistency between the +pseudo-label space and the feature space, subsequently minimizing it. The +proposed MULT ensures that the generated pseudo-labels maintain alignment +across modalities while upholding structural consistency within intra-modality. +Additionally, a straightforward plug-and-play Online Cross-memory Label +Refinement (OCLR) module is proposed to further mitigate the side effects of +noisy pseudo-labels while simultaneously aligning different modalities, coupled +with an Alternative Modality-Invariant Representation Learning (AMIRL) +framework. Experiments demonstrate that our proposed method outperforms +existing state-of-the-art USL-VI-ReID methods, highlighting the superiority of +our MULT in comparison to other cross-modality association methods. Code is +available at https://github.com/FranklinLingfeng/code_for_MULT. + +
+
+ comment: Accepted by IJCV2024 +
+
+
+
+
+ + ♻ ☆ PolarBEVDet: Exploring Polar Representation for Multi-View 3D Object + Detection in Bird's-Eye-View + + +
+ Recently, LSS-based multi-view 3D object detection provides an economical and +deployment-friendly solution for autonomous driving. However, all the existing +LSS-based methods transform multi-view image features into a Cartesian +Bird's-Eye-View(BEV) representation, which does not take into account the +non-uniform image information distribution and hardly exploits the view +symmetry. In this paper, in order to adapt the image information distribution +and preserve the view symmetry by regular convolution, we propose to employ the +polar BEV representation to substitute the Cartesian BEV representation. To +achieve this, we elaborately tailor three modules: a polar view transformer to +generate the polar BEV representation, a polar temporal fusion module for +fusing historical polar BEV features and a polar detection head to predict the +polar-parameterized representation of the object. In addition, we design a 2D +auxiliary detection head and a spatial attention enhancement module to improve +the quality of feature extraction in perspective view and BEV, respectively. +Finally, we integrate the above improvements into a novel multi-view 3D object +detector, PolarBEVDet. Experiments on nuScenes show that PolarBEVDet achieves +the superior performance. The code is available at +https://github.com/Yzichen/PolarBEVDet.git.(This work has been submitted to the +IEEE for possible publication. Copyright may be transferred without notice, +after which this version may no longer be accessible) + +
+
+ comment: This work has been submitted to the IEEE for possible publication +
+
+
+
+
+ + ♻ ☆ SurvMamba: State Space Model with Multi-grained Multi-modal Interaction + for Survival Prediction + + +
+ Multi-modal learning that combines pathological images with genomic data has +significantly enhanced the accuracy of survival prediction. Nevertheless, +existing methods have not fully utilized the inherent hierarchical structure +within both whole slide images (WSIs) and transcriptomic data, from which +better intra-modal representations and inter-modal integration could be +derived. Moreover, many existing studies attempt to improve multi-modal +representations through attention mechanisms, which inevitably lead to high +complexity when processing high-dimensional WSIs and transcriptomic data. +Recently, a structured state space model named Mamba emerged as a promising +approach for its superior performance in modeling long sequences with low +complexity. In this study, we propose Mamba with multi-grained multi-modal +interaction (SurvMamba) for survival prediction. SurvMamba is implemented with +a Hierarchical Interaction Mamba (HIM) module that facilitates efficient +intra-modal interactions at different granularities, thereby capturing more +detailed local features as well as rich global representations. In addition, an +Interaction Fusion Mamba (IFM) module is used for cascaded inter-modal +interactive fusion, yielding more comprehensive features for survival +prediction. Comprehensive evaluations on five TCGA datasets demonstrate that +SurvMamba outperforms other existing methods in terms of performance and +computational cost. + +
+
+
+
+
+ + ♻ ☆ Nl2Hltl2Plan: Scaling Up Natural Language Understanding for Multi-Robots + Through Hierarchical Temporal Logic Task Representation + + +
+ To enable non-experts to specify long-horizon, multi-robot collaborative +tasks, language models are increasingly used to translate natural language +commands into formal specifications. However, because translation can occur in +multiple ways, such translations may lack accuracy or lead to inefficient +multi-robot planning. Our key insight is that concise hierarchical +specifications can simplify planning while remaining straightforward to derive +from human instructions. We propose Nl2Hltl2Plan, a framework that translates +natural language commands into hierarchical Linear Temporal Logic (LTL) and +solves the corresponding planning problem. The translation involves two steps +leveraging Large Language Models (LLMs). First, an LLM transforms instructions +into a Hierarchical Task Tree, capturing logical and temporal relations. Next, +a fine-tuned LLM converts sub-tasks into flat LTL formulas, which are +aggregated into hierarchical specifications, with the lowest level +corresponding to ordered robot actions. These specifications are then used with +off-the-shelf planners. Our Nl2Hltl2Plan demonstrates the potential of LLMs in +hierarchical reasoning for multi-robot task planning. Evaluations in simulation +and real-world experiments with human participants show that Nl2Hltl2Plan +outperforms existing methods, handling more complex instructions while +achieving higher success rates and lower costs in task allocation and planning. +Additional details are available at https://nl2hltl2plan.github.io . + +
+
+
+
+
+ + ♻ ☆ Inductive Meta-path Learning for Schema-complex Heterogeneous + Information Networks + + +
+ Heterogeneous Information Networks (HINs) are information networks with +multiple types of nodes and edges. The concept of meta-path, i.e., a sequence +of entity types and relation types connecting two entities, is proposed to +provide the meta-level explainable semantics for various HIN tasks. +Traditionally, meta-paths are primarily used for schema-simple HINs, e.g., +bibliographic networks with only a few entity types, where meta-paths are often +enumerated with domain knowledge. However, the adoption of meta-paths for +schema-complex HINs, such as knowledge bases (KBs) with hundreds of entity and +relation types, has been limited due to the computational complexity associated +with meta-path enumeration. Additionally, effectively assessing meta-paths +requires enumerating relevant path instances, which adds further complexity to +the meta-path learning process. To address these challenges, we propose +SchemaWalk, an inductive meta-path learning framework for schema-complex HINs. +We represent meta-paths with schema-level representations to support the +learning of the scores of meta-paths for varying relations, mitigating the need +of exhaustive path instance enumeration for each relation. Further, we design a +reinforcement-learning based path-finding agent, which directly navigates the +network schema (i.e., schema graph) to learn policies for establishing +meta-paths with high coverage and confidence for multiple relations. Extensive +experiments on real data sets demonstrate the effectiveness of our proposed +paradigm. + +
+
+
+
+
+ + ♻ ☆ The use of large language models to enhance cancer clinical trial + educational materials + + +
+ Cancer clinical trials often face challenges in recruitment and engagement +due to a lack of participant-facing informational and educational resources. +This study investigated the potential of Large Language Models (LLMs), +specifically GPT4, in generating patient-friendly educational content from +clinical trial informed consent forms. Using data from ClinicalTrials.gov, we +employed zero-shot learning for creating trial summaries and one-shot learning +for developing multiple-choice questions, evaluating their effectiveness +through patient surveys and crowdsourced annotation. Results showed that +GPT4-generated summaries were both readable and comprehensive, and may improve +patients' understanding and interest in clinical trials. The multiple-choice +questions demonstrated high accuracy and agreement with crowdsourced +annotators. For both resource types, hallucinations were identified that +require ongoing human oversight. The findings demonstrate the potential of LLMs +"out-of-the-box" to support the generation of clinical trial education +materials with minimal trial-specific engineering, but implementation with a +human-in-the-loop is still needed to avoid misinformation risks. + +
+
+
+
+
+ + ♻ ☆ EMOVOME: A Dataset for Emotion Recognition in Spontaneous Real-Life + Speech + + +
+ Spontaneous datasets for Speech Emotion Recognition (SER) are scarce and +frequently derived from laboratory environments or staged scenarios, such as TV +shows, limiting their application in real-world contexts. We developed and +publicly released the Emotional Voice Messages (EMOVOME) dataset, including 999 +voice messages from real conversations of 100 Spanish speakers on a messaging +app, labeled in continuous and discrete emotions by expert and non-expert +annotators. We evaluated speaker-independent SER models using acoustic features +as baseline and transformer-based models. We compared the results with +reference datasets including acted and elicited speech, and analyzed the +influence of annotators and gender fairness. The pre-trained +UniSpeech-SAT-Large model achieved the highest results, 61.64% and 55.57% +Unweighted Accuracy (UA) for 3-class valence and arousal prediction +respectively on EMOVOME, a 10% improvement over baseline models. For the +emotion categories, 42.58% UA was obtained. EMOVOME performed lower than the +acted RAVDESS dataset. The elicited IEMOCAP dataset also outperformed EMOVOME +in predicting emotion categories, while similar results were obtained in +valence and arousal. EMOVOME outcomes varied with annotator labels, showing +better results and fairness when combining expert and non-expert annotations. +This study highlights the gap between controlled and real-life scenarios, +supporting further advancements in recognizing genuine emotions. + +
+
+ comment: This article is a merged version of the description of the EMOVOME + database in arXiv:2402.17496v1 and the speech emotion recognition models in + arXiv:2403.02167v1. This work has been submitted to the IEEE for possible + publication +
+
+
+
+
+ + ♻ ☆ RelCon: Relative Contrastive Learning for a Motion Foundation Model for + Wearable Data + + +
+ We present RelCon, a novel self-supervised *Rel*ative *Con*trastive learning +approach that uses a learnable distance measure in combination with a softened +contrastive loss for training a motion foundation model from wearable sensors. +The learnable distance measure captures motif similarity and domain-specific +semantic information such as rotation invariance. The learned distance provides +a measurement of semantic similarity between a pair of accelerometer +time-series segments, which is used to measure the distance between an anchor +and various other sampled candidate segments. The self-supervised model is +trained on 1 billion segments from 87,376 participants from a large wearables +dataset. The model achieves strong performance across multiple downstream +tasks, encompassing both classification and regression. To our knowledge, we +are the first to show the generalizability of a self-supervised learning model +with motion data from wearables across distinct evaluation tasks. + +</p>
+
+
+
+
+ + ♻ ☆ Synesthesia of Machines (SoM)-Enhanced ISAC Precoding for Vehicular + Networks with Double Dynamics + + +
+ Integrated sensing and communication (ISAC) technology is vital for vehicular +networks, yet the time-varying communication channels and rapid movement of +targets present significant challenges for real-time precoding design. +Traditional optimization-based methods are computationally complex and depend +on perfect prior information, which is often unavailable in double-dynamic +scenarios. In this paper, we propose a synesthesia of machine (SoM)-enhanced +precoding paradigm that leverages modalities such as positioning and channel +information to adapt to these dynamics. Utilizing a deep reinforcement learning +(DRL) framework, our approach pushes ISAC performance boundaries. We also +introduce a parameter-shared actor-critic architecture to accelerate training +in complex state and action spaces. Extensive experiments validate the +superiority of our method over existing approaches. + +
+
+ comment: Submitted to IEEE for possible publication +
+
+
+
+
+ + ♻ ☆ FocDepthFormer: Transformer with latent LSTM for Depth Estimation from + Focal Stack + + +
+ Most existing methods for depth estimation from a focal stack of images +employ convolutional neural networks (CNNs) using 2D or 3D convolutions over a +fixed set of images. However, their effectiveness is constrained by the local +properties of CNN kernels, which restricts them to process only focal stacks of +a fixed number of images during both training and inference. This limitation +hampers their ability to generalize to stacks of arbitrary lengths. To overcome +these limitations, we present a novel Transformer-based network, +FocDepthFormer, which integrates a Transformer with an LSTM module and a CNN +decoder. The Transformer's self-attention mechanism allows for the learning of +more informative spatial features by implicitly performing non-local +cross-referencing. The LSTM module is designed to integrate representations +across image stacks of varying lengths. Additionally, we employ multi-scale +convolutional kernels in an early-stage encoder to capture low-level features +at different degrees of focus/defocus. By incorporating the LSTM, +FocDepthFormer can be pre-trained on large-scale monocular RGB depth estimation +datasets, improving visual pattern learning and reducing reliance on +difficult-to-obtain focal stack data. Extensive experiments on diverse focal +stack benchmark datasets demonstrate that our model outperforms +state-of-the-art approaches across multiple evaluation metrics. + +</p>
+
+ comment: 30 pages, 20 figures, Conference paper +
+
+
+
+
+ + ♻ ☆ COVID-19 Probability Prediction Using Machine Learning: An Infectious + Approach + + +
+ The ongoing COVID-19 pandemic continues to pose significant challenges to +global public health, despite the widespread availability of vaccines. Early +detection of the disease remains paramount in curbing its transmission and +mitigating its impact on public health systems. In response, this study delves +into the application of advanced machine learning (ML) techniques for +predicting COVID-19 infection probability. We conducted a rigorous +investigation into the efficacy of various ML models, including XGBoost, LGBM, +AdaBoost, Logistic Regression, Decision Tree, RandomForest, CatBoost, KNN, and +Deep Neural Networks (DNN). Leveraging a dataset comprising 4000 samples, with +3200 allocated for training and 800 for testing, our experiment offers +comprehensive insights into the performance of these models in COVID-19 +prediction. Our findings reveal that Deep Neural Networks (DNN) emerge as the +top-performing model, exhibiting superior accuracy and recall metrics. With an +impressive accuracy rate of 89%, DNN demonstrates remarkable potential in early +COVID-19 detection. This underscores the efficacy of deep learning approaches +in leveraging complex data patterns to identify COVID-19 infections accurately. +This study underscores the critical role of machine learning, particularly deep +learning methodologies, in augmenting early detection efforts amidst the +ongoing pandemic. The success of DNN in accurately predicting COVID-19 +infection probability highlights the importance of continued research and +development in leveraging advanced technologies to combat infectious diseases. + +
+
+
+
+
+ + ♻ ☆ Wonderful Team: Zero-Shot Physical Task Planning with Visual LLMs + + +
+ We introduce Wonderful Team, a multi-agent Vision Large Language Model (VLLM) +framework for executing high-level robotic planning in a zero-shot regime. In +our context, zero-shot high-level planning means that for a novel environment, +we provide a VLLM with an image of the robot's surroundings and a task +description, and the VLLM outputs the sequence of actions necessary for the +robot to complete the task. Unlike previous methods for high-level visual +planning for robotic manipulation, our method uses VLLMs for the entire +planning process, enabling a more tightly integrated loop between perception, +control, and planning. As a result, Wonderful Team's performance on +real-world semantic and physical planning tasks often exceeds methods that rely +on separate vision systems. For example, we see an average 40% success-rate +improvement on VimaBench over prior methods such as NLaP, an average 30% +improvement over Trajectory Generators on tasks from the Trajectory Generator +paper including drawing and wiping a plate, and an average 70% improvement over +Trajectory Generators on a new set of semantic reasoning tasks including +environment re-arrangement with implicit linguistic constraints. We hope these +results highlight the rapid improvements of VLLMs in the past year, and +motivate the community to consider VLLMs as an option for some high-level +robotic planning problems in the future. + +</p>
+
+ comment: aka Wonderful Team +
+
+
+
+
+ + ♻ ☆ Scorecards for Synthetic Medical Data Evaluation and Reporting + + +
+ Although interest in synthetic medical data (SMD) for training and testing AI +methods is growing, the absence of a standardized framework to evaluate its +quality and applicability hinders its wider adoption. Here, we outline an +evaluation framework designed to meet the unique requirements of medical +applications, and introduce SMD Card, which can serve as comprehensive reports +that accompany artificially generated datasets. This card provides a +transparent and standardized framework for evaluating and reporting the quality +of synthetic data, which can benefit SMD developers, users, and regulators, +particularly for AI models using SMD in regulatory submissions. + +
+
+ comment: 7 pages, 2 figures +
+
+
+
+
+
+
+
+ + Genomics 3 + +
+
+
+ + ☆ Timestamp calibration for time-series single cell RNA-seq expression + data + + +
+ Timestamp automatic annotation (TAA) is a crucial procedure for analyzing +time-series ScRNA-seq data, as they unveil dynamic biological developments and +cell regeneration process. However, current TAA methods heavily rely on manual +timestamps, often overlooking their reliability. This oversight can +significantly degrade the performance of timestamp automatic annotation due to +noisy timestamps. Nevertheless, the current approach for addressing this issue +tends to select less critical cleaned samples for timestamp calibration. To +tackle this challenge, we have developed a novel timestamp calibration model +called ScPace for handling noisy labeled time-series ScRNA-seq data. This +approach incorporates a latent variable indicator within a base classifier +instead of probability sampling to detect noisy samples effectively. To +validate our proposed method, we conducted experiments on both simulated and +real time-series ScRNA-seq datasets. Cross-validation experiments with +different artificial mislabeling rates demonstrate that ScPace outperforms +previous approaches. Furthermore, after calibrating the timestamps of the +original time-series ScRNA-seq data using our method, we performed supervised +pseudotime analysis, revealing that ScPace enhances its performance +significantly. These findings suggest that ScPace is an effective tool for +timestamp calibration by enabling reclassification and deletion of detected +noisy labeled samples while maintaining robustness across diverse ranges of +time-series ScRNA-seq datasets. The source code is available at +https://github.com/OPUS-Lightphenexx/ScPace. + +
+
+
+
+
+ + ☆ gghic: A Versatile R Package for Exploring and Visualizing 3D Genome + Organization + + +
+ Motivation: The three-dimensional (3D) organization of the genome plays a +critical role in regulating gene expression and maintaining cellular +homeostasis. Disruptions in this spatial organization can result in abnormal +chromatin interactions, contributing to the development of various diseases +including cancer. Advances in chromosome conformation capture technologies, +such as Hi-C, have enabled researchers to study genome architecture at high +resolution. However, the efficient visualization and interpretation of these +complex datasets remain a major challenge, particularly when integrating +genomic annotations and inter-chromosomal interactions. + Results: We present gghic, an R package that extends the ggplot2 framework to +enable intuitive and customizable visualization of genomic interaction data. +gghic introduces novel layers for generating triangular heatmaps of chromatin +interactions and annotating them with features such as chromatin loops, +topologically associated domains (TADs), gene/transcript models, and data +tracks (e.g., ChIP-seq signals). The package supports data from multiple +chromosomes, facilitating the exploration of inter-chromosomal interactions. +Built to integrate seamlessly with the R/Bioconductor ecosystem, gghic is +compatible with widely used genomic data formats, including HiCExperiment and +GInteractions objects. We demonstrate the utility of gghic by replicating a +published figure showing a translocation event in T-cell acute lymphoblastic +leukemia (T-ALL), highlighting its ability to integrate genomic annotations and +generate publication-quality figures. + Availability and implementation: The R package can be accessed at +https://github.com/jasonwong-lab/gghic and is distributed under the GNU General +Public License version 3.0. + +
+
+
+
+
+ + ☆ Deep Learning in Single-Cell and Spatial Transcriptomics Data Analysis: + Advances and Challenges from a Data Science Perspective + + +
+ The development of single-cell and spatial transcriptomics has revolutionized +our capacity to investigate cellular properties, functions, and interactions in +both cellular and spatial contexts. However, the analysis of single-cell and +spatial omics data remains challenging. First, single-cell sequencing data are +high-dimensional and sparse, often contaminated by noise and uncertainty, +obscuring the underlying biological signals. Second, these data often encompass +multiple modalities, including gene expression, epigenetic modifications, and +spatial locations. Integrating these diverse data modalities is crucial for +enhancing prediction accuracy and biological interpretability. Third, while the +scale of single-cell sequencing has expanded to millions of cells, high-quality +annotated datasets are still limited. Fourth, the complex correlations of +biological tissues make it difficult to accurately reconstruct cellular states +and spatial contexts. Traditional feature engineering-based analysis methods +struggle to deal with the various challenges presented by intricate biological +networks. Deep learning has emerged as a powerful tool capable of handling +high-dimensional complex data and automatically identifying meaningful +patterns, offering significant promise in addressing these challenges. This +review systematically analyzes these challenges and discusses related deep +learning approaches. Moreover, we have curated 21 datasets from 9 benchmarks, +encompassing 58 computational methods, and evaluated their performance on the +respective modeling tasks. Finally, we highlight three areas for future +development from a technical, dataset, and application perspective. This work +will serve as a valuable resource for understanding how deep learning can be +effectively utilized in single-cell and spatial transcriptomics analyses, while +inspiring novel approaches to address emerging challenges. + +
+
+
+
+
+
+
+
+ + Machine Learning 148 + +
+
+
+ + ☆ Navigation World Models + + +
+ Navigation is a fundamental skill of agents with visual-motor capabilities. +We introduce a Navigation World Model (NWM), a controllable video generation +model that predicts future visual observations based on past observations and +navigation actions. To capture complex environment dynamics, NWM employs a +Conditional Diffusion Transformer (CDiT), trained on a diverse collection of +egocentric videos of both human and robotic agents, and scaled up to 1 billion +parameters. In familiar environments, NWM can plan navigation trajectories by +simulating them and evaluating whether they achieve the desired goal. Unlike +supervised navigation policies with fixed behavior, NWM can dynamically +incorporate constraints during planning. Experiments demonstrate its +effectiveness in planning trajectories from scratch or by ranking trajectories +sampled from an external policy. Furthermore, NWM leverages its learned visual +priors to imagine trajectories in unfamiliar environments from a single input +image, making it a flexible and powerful tool for next-generation navigation +systems. + +
+
+ comment: project page: https://www.amirbar.net/nwm/ +
+
+
+
+
+ + ☆ Best-of-N Jailbreaking + + +
+ We introduce Best-of-N (BoN) Jailbreaking, a simple black-box algorithm that +jailbreaks frontier AI systems across modalities. BoN Jailbreaking works by +repeatedly sampling variations of a prompt with a combination of augmentations +- such as random shuffling or capitalization for textual prompts - until a +harmful response is elicited. We find that BoN Jailbreaking achieves high +attack success rates (ASRs) on closed-source language models, such as 89% on +GPT-4o and 78% on Claude 3.5 Sonnet when sampling 10,000 augmented prompts. +Further, it is similarly effective at circumventing state-of-the-art +open-source defenses like circuit breakers. BoN also seamlessly extends to +other modalities: it jailbreaks vision language models (VLMs) such as GPT-4o +and audio language models (ALMs) like Gemini 1.5 Pro, using modality-specific +augmentations. BoN reliably improves when we sample more augmented prompts. +Across all modalities, ASR, as a function of the number of samples (N), +empirically follows power-law-like behavior for many orders of magnitude. BoN +Jailbreaking can also be composed with other black-box algorithms for even more +effective attacks - combining BoN with an optimized prefix attack achieves up +to a 35% increase in ASR. Overall, our work indicates that, despite their +capability, language models are sensitive to seemingly innocuous changes to +inputs, which attackers can exploit across modalities. + +
+
+
+
+
+ + ☆ Perception Tokens Enhance Visual Reasoning in Multimodal Language Models + + +
+ Multimodal language models (MLMs) still face challenges in fundamental visual +perception tasks where specialized models excel. Tasks requiring reasoning +about 3D structures benefit from depth estimation, and reasoning about 2D +object instances benefits from object detection. Yet, MLMs can not produce +intermediate depth or boxes to reason over. Finetuning MLMs on relevant data +doesn't generalize well and outsourcing computation to specialized vision tools +is too compute-intensive and memory-inefficient. To address this, we introduce +Perception Tokens, intrinsic image representations designed to assist reasoning +tasks where language is insufficient. Perception tokens act as auxiliary +reasoning tokens, akin to chain-of-thought prompts in language models. For +example, in a depth-related task, an MLM augmented with perception tokens can +reason by generating a depth map as tokens, enabling it to solve the problem +effectively. We propose AURORA, a training method that augments MLMs with +perception tokens for improved reasoning over visual inputs. AURORA leverages a +VQVAE to transform intermediate image representations, such as depth maps into +a tokenized format and bounding box tokens, which is then used in a multi-task +training framework. AURORA achieves notable improvements across counting +benchmarks: +10.8% on BLINK, +11.3% on CVBench, and +8.3% on SEED-Bench, +outperforming finetuning approaches in generalization across datasets. It also +improves on relative depth: over +6% on BLINK. With perception tokens, AURORA +expands the scope of MLMs beyond language-based reasoning, paving the way for +more effective visual reasoning capabilities. + +
+
+
+
+
+ + ☆ NODE-AdvGAN: Improving the transferability and perceptual similarity of + adversarial examples by dynamic-system-driven adversarial generative model + + +
+ Understanding adversarial examples is crucial for improving the model's +robustness, as they introduce imperceptible perturbations that deceive models. +Effective adversarial examples, therefore, offer the potential to train more +robust models by removing their singularities. We propose NODE-AdvGAN, a novel +approach that treats adversarial generation as a continuous process and employs +a Neural Ordinary Differential Equation (NODE) for simulating the dynamics of +the generator. By mimicking the iterative nature of traditional gradient-based +methods, NODE-AdvGAN generates smoother and more precise perturbations that +preserve high perceptual similarity when added to benign images. We also +propose a new training strategy, NODE-AdvGAN-T, which enhances transferability +in black-box attacks by effectively tuning noise parameters during training. +Experiments demonstrate that NODE-AdvGAN and NODE-AdvGAN-T generate more +effective adversarial examples that achieve higher attack success rates while +preserving better perceptual quality than traditional GAN-based methods. + +
+
+
+
+
+ + ☆ Evaluating Gender Bias Transfer between Pre-trained and Prompt-Adapted + Language Models + + +
+ Large language models (LLMs) are increasingly being adapted to achieve +task-specificity for deployment in real-world decision systems. Several +previous works have investigated the bias transfer hypothesis (BTH) by studying +the effect of the fine-tuning adaptation strategy on model fairness to find +that fairness in pre-trained masked language models have limited effect on the +fairness of models when adapted using fine-tuning. In this work, we expand the +study of BTH to causal models under prompt adaptations, as prompting is an +accessible, and compute-efficient way to deploy models in real-world systems. +In contrast to previous works, we establish that intrinsic biases in +pre-trained Mistral, Falcon and Llama models are strongly correlated (rho >= +0.94) with biases when the same models are zero- and few-shot prompted, using a +pronoun co-reference resolution task. Further, we find that bias transfer +remains strongly correlated even when LLMs are specifically prompted to exhibit +fair or biased behavior (rho >= 0.92), and few-shot length and stereotypical +composition are varied (rho >= 0.97). Our findings highlight the importance of +ensuring fairness in pre-trained LLMs, especially when they are later used to +perform downstream tasks via prompt adaptation. + +
+
+
+
+
+ + ☆ A Review on Scientific Knowledge Extraction using Large Language Models + in Biomedical Sciences + + +
+ The rapid advancement of large language models (LLMs) has opened new +boundaries in the extraction and synthesis of medical knowledge, particularly +within evidence synthesis. This paper reviews the state-of-the-art applications +of LLMs in the biomedical domain, exploring their effectiveness in automating +complex tasks such as evidence synthesis and data extraction from a biomedical +corpus of documents. While LLMs demonstrate remarkable potential, significant +challenges remain, including issues related to hallucinations, contextual +understanding, and the ability to generalize across diverse medical tasks. We +highlight critical gaps in the current research literature, particularly the +need for unified benchmarks to standardize evaluations and ensure reliability +in real-world applications. In addition, we propose directions for future +research, emphasizing the integration of state-of-the-art techniques such as +retrieval-augmented generation (RAG) to enhance LLM performance in evidence +synthesis. By addressing these challenges and utilizing the strengths of LLMs, +we aim to improve access to medical literature and facilitate meaningful +discoveries in healthcare. + +
+
+ comment: 9 pages, 1 table, 1 figure, conference paper +
+
+
+
+
+ + ☆ FANAL -- Financial Activity News Alerting Language Modeling Framework + + +
+ In the rapidly evolving financial sector, the accurate and timely +interpretation of market news is essential for stakeholders needing to navigate +unpredictable events. This paper introduces FANAL (Financial Activity News +Alerting Language Modeling Framework), a specialized BERT-based framework +engineered for real-time financial event detection and analysis, categorizing +news into twelve distinct financial categories. FANAL leverages silver-labeled +data processed through XGBoost and employs advanced fine-tuning techniques, +alongside ORBERT (Odds Ratio BERT), a novel variant of BERT fine-tuned with +ORPO (Odds Ratio Preference Optimization) for superior class-wise probability +calibration and alignment with financial event relevance. We evaluate FANAL's +performance against leading large language models, including GPT-4o, Llama-3.1 +8B, and Phi-3, demonstrating its superior accuracy and cost efficiency. This +framework sets a new standard for financial intelligence and responsiveness, +significantly outstripping existing models in both performance and +affordability. + +
+
+ comment: Accepted for the IEEE International Workshop on Large Language Models + for Finance, 2024. This is a preprint version +
+
+
+
+
+ + ☆ KKLIP: Knowledge Distillation Exploiting K-means Clustering for + Language-Image Pre-Training + + +
+ Recently, CLIP has emerged as a valuable model for aligning image and text +information in multi-modal scenarios. However, researchers have observed +limitations in the ability of CLIP's text and image encoders to extract +detailed knowledge from caption-image pairs. In response, this paper introduces +KKLIP, a novel approach designed to enhance the quality of CLIP by +incorporating a new knowledge distillation (KD) method derived from Llama 2. +Our method comprises three objectives: Text Embedding Distillation, Concept +Learning, and Contrastive Learning. Firstly, Text Embedding Distillation +involves training the KKLIP text encoder to emulate the teacher model, Llama 2. +Secondly, Concept Learning assigns a soft concept label to each caption-image +pair through offline k-means clustering of text information from Llama 2, +allowing KKLIP to learn from these soft concept labels. Finally, Contrastive +Learning harmonizes text and image embeddings. Our experimental results +demonstrate that KKLIP enhances the quality of both text and image encoders. + +
+
+
+
+
+ + ☆ Self-test loss functions for learning weak-form operators and gradient + flows + + +
+ The construction of loss functions presents a major challenge in data-driven +modeling involving weak-form operators in PDEs and gradient flows, particularly +due to the need to select test functions appropriately. We address this +challenge by introducing self-test loss functions, which employ test functions +that depend on the unknown parameters, specifically for cases where the +operator depends linearly on the unknowns. The proposed self-test loss function +conserves energy for gradient flows and coincides with the expected +log-likelihood ratio for stochastic differential equations. Importantly, it is +quadratic, facilitating theoretical analysis of identifiability and +well-posedness of the inverse problem, while also leading to efficient +parametric or nonparametric regression algorithms. It is computationally +simple, requiring only low-order derivatives or even being entirely +derivative-free, and numerical experiments demonstrate its robustness against +noisy and discrete data. + +
+
+
+
+
+ + ☆ A Bidirectional Siamese Recurrent Neural Network for Accurate Gait + Recognition Using Body Landmarks + + +
+ Gait recognition is a significant biometric technique for person +identification, particularly in scenarios where other physiological biometrics +are impractical or ineffective. In this paper, we address the challenges +associated with gait recognition and present a novel approach to improve its +accuracy and reliability. The proposed method leverages advanced techniques, +including sequential gait landmarks obtained through the Mediapipe pose +estimation model, Procrustes analysis for alignment, and a Siamese +biGRU-dualStack Neural Network architecture for capturing temporal +dependencies. Extensive experiments were conducted on large-scale cross-view +datasets to demonstrate the effectiveness of the approach, achieving high +recognition accuracy compared to other models. The model demonstrated +accuracies of 95.7%, 94.44%, 87.71%, and 86.6% on CASIA-B, SZU RGB-D, OU-MVLP, +and Gait3D datasets respectively. The results highlight the potential +applications of the proposed method in various practical domains, indicating +its significant contribution to the field of gait recognition. + +
+
+
+
+
+ + ☆ Soft Checksums to Flag Untrustworthy Machine Learning Surrogate + Predictions and Application to Atomic Physics Simulations + + +
+ Trained neural networks (NN) are attractive as surrogate models to replace +costly calculations in physical simulations, but are often unknowingly applied +to states not adequately represented in the training dataset. We present the +novel technique of soft checksums for scientific machine learning, a +general-purpose method to differentiate between trustworthy predictions with +small errors on in-distribution (ID) data points, and untrustworthy predictions +with large errors on out-of-distribution (OOD) data points. By adding a check +node to the existing output layer, we train the model to learn the chosen +checksum function encoded within the NN predictions and show that violations of +this function correlate with high prediction errors. As the checksum function +depends only on the NN predictions, we can calculate the checksum error for any +prediction with a single forward pass, incurring negligible time and memory +costs. Additionally, we find that incorporating the checksum function into the +loss function and exposing the NN to OOD data points during the training +process improves separation between ID and OOD predictions. By applying soft +checksums to a physically complex and high-dimensional non-local thermodynamic +equilibrium atomic physics dataset, we show that a well-chosen threshold +checksum error can effectively separate ID and OOD predictions. + +
+
+ comment: 8 pages, 3 figures +
+
+
+
+
+ + ☆ TRENDy: Temporal Regression of Effective Non-linear Dynamics + + +
+ Spatiotemporal dynamics pervade the natural sciences, from the morphogen +dynamics underlying patterning in animal pigmentation to the protein waves +controlling cell division. A central challenge lies in understanding how +controllable parameters induce qualitative changes in system behavior called +bifurcations. This endeavor is made particularly difficult in realistic +settings where governing partial differential equations (PDEs) are unknown and +data is limited and noisy. To address this challenge, we propose TRENDy +(Temporal Regression of Effective Nonlinear Dynamics), an equation-free +approach to learning low-dimensional, predictive models of spatiotemporal +dynamics. Following classical work in spatial coarse-graining, TRENDy first +maps input data to a low-dimensional space of effective dynamics via a cascade +of multiscale filtering operations. Our key insight is the recognition that +these effective dynamics can be fit by a neural ordinary differential equation +(NODE) having the same parameter space as the input PDE. The preceding +filtering operations strongly regularize the phase space of the NODE, making +TRENDy significantly more robust to noise compared to existing methods. We +train TRENDy to predict the effective dynamics of synthetic and real data +representing dynamics from across the physical and life sciences. We then +demonstrate how our framework can automatically locate both Turing and Hopf +bifurcations in unseen regions of parameter space. We finally apply our method +to the analysis of spatial patterning of the ocellated lizard through +development. We found that TRENDy's effective state not only accurately +predicts spatial changes over time but also identifies distinct pattern +features unique to different anatomical regions, highlighting the potential +influence of surface geometry on reaction-diffusion mechanisms and their role +in driving spatially varying pattern dynamics. + +
+
+ comment: 10 pages, 14 appendix pages, 5 figures, 7 appendix figures +
+
+
+
+
+ + ☆ Beyond algorithm hyperparameters: on preprocessing hyperparameters and + associated pitfalls in machine learning applications + + +
+ Adequately generating and evaluating prediction models based on supervised +machine learning (ML) is often challenging, especially for less experienced +users in applied research areas. Special attention is required in settings +where the model generation process involves hyperparameter tuning, i.e. +data-driven optimization of different types of hyperparameters to improve the +predictive performance of the resulting model. Discussions about tuning +typically focus on the hyperparameters of the ML algorithm (e.g., the minimum +number of observations in each terminal node for a tree-based algorithm). In +this context, it is often neglected that hyperparameters also exist for the +preprocessing steps that are applied to the data before it is provided to the +algorithm (e.g., how to handle missing feature values in the data). As a +consequence, users experimenting with different preprocessing options to +improve model performance may be unaware that this constitutes a form of +hyperparameter tuning - albeit informal and unsystematic - and thus may fail to +report or account for this optimization. To illuminate this issue, this paper +reviews and empirically illustrates different procedures for generating and +evaluating prediction models, explicitly addressing the different ways +algorithm and preprocessing hyperparameters are typically handled by applied ML +users. By highlighting potential pitfalls, especially those that may lead to +exaggerated performance claims, this review aims to further improve the quality +of predictive modeling in ML applications. + +
+
+
+
+
+ + ☆ Flow Matching with General Discrete Paths: A Kinetic-Optimal Perspective + + +
+ The design space of discrete-space diffusion or flow generative models are +significantly less well-understood than their continuous-space counterparts, +with many works focusing only on a simple masked construction. In this work, we +aim to take a holistic approach to the construction of discrete generative +models based on continuous-time Markov chains, and for the first time, allow +the use of arbitrary discrete probability paths, or colloquially, corruption +processes. Through the lens of optimizing the symmetric kinetic energy, we +propose velocity formulas that can be applied to any given probability path, +completely decoupling the probability and velocity, and giving the user the +freedom to specify any desirable probability path based on expert knowledge +specific to the data domain. Furthermore, we find that a special construction +of mixture probability paths optimizes the symmetric kinetic energy for the +discrete case. We empirically validate the usefulness of this new design space +across multiple modalities: text generation, inorganic material generation, and +image generation. We find that we can outperform the mask construction even in +text with kinetic-optimal mixture paths, while we can make use of +domain-specific constructions of the probability path over the visual domain. + +
+
+
+
+
+ + ☆ Tight PAC-Bayesian Risk Certificates for Contrastive Learning + + +
+ Contrastive representation learning is a modern paradigm for learning +representations of unlabeled data via augmentations -- precisely, contrastive +models learn to embed semantically similar pairs of samples (positive pairs) +closer than independently drawn samples (negative samples). In spite of its +empirical success and widespread use in foundation models, statistical theory +for contrastive learning remains less explored. Recent works have developed +generalization error bounds for contrastive losses, but the resulting risk +certificates are either vacuous (certificates based on Rademacher complexity or +$f$-divergence) or require strong assumptions about samples that are +unreasonable in practice. The present paper develops non-vacuous PAC-Bayesian +risk certificates for contrastive representation learning, considering the +practical considerations of the popular SimCLR framework. Notably, we take into +account that SimCLR reuses positive pairs of augmented data as negative samples +for other data, thereby inducing strong dependence and making classical PAC or +PAC-Bayesian bounds inapplicable. We further refine existing bounds on the +downstream classification loss by incorporating SimCLR-specific factors, +including data augmentation and temperature scaling, and derive risk +certificates for the contrastive zero-one risk. The resulting bounds for +contrastive loss and downstream prediction are much tighter than those of +previous risk certificates, as demonstrated by experiments on CIFAR-10. + +
+
+
+
+
+ + ☆ Convolutional Neural Networks and Mixture of Experts for Intrusion + Detection in 5G Networks and beyond + + +
+ The advent of 6G/NextG networks comes along with a series of benefits,
+including extreme capacity, reliability, and efficiency. However, these
+networks may become vulnerable to new security threats. Therefore, 6G/NextG
+networks must be equipped with advanced Artificial Intelligence algorithms, in
+order to evade these attacks. Existing studies on the intrusion detection task
+rely on the training of shallow machine learning classifiers, including Logistic
+Regression, Decision Trees, and so on, yielding suboptimal performance. Others
+are based on deep neural networks consisting of static components, which are
+not conditional on the input. This limits their representation power and
+efficiency. To resolve these issues, we present the first study integrating
+Mixture of Experts (MoE) for identifying malicious traffic. Specifically, we
+use network traffic data and convert the 1D array of features into a 2D matrix.
+Next, we pass this matrix through convolutional neural network (CNN) layers
+followed by batch normalization and max pooling layers. After obtaining the
+representation vector via the CNN layers, a sparsely gated MoE layer is used.
+This layer consists of a set of experts (dense layers) and a router, where the
+router assigns weights to the output of each expert. Sparsity is achieved by
+choosing the most relevant experts among the total ones. Finally, we perform a
+series of ablation experiments to prove the effectiveness of our proposed
+model. Experiments are conducted on the 5G-NIDD dataset, a network intrusion
+detection dataset generated from a real 5G test network. Results show that our
+introduced approach reaches weighted F1-score up to 99.95% achieving comparable
+performance to existing approaches. Findings also show that our proposed model
+achieves multiple advantages over state-of-the-art approaches.
+
+
+
+
+
+ + ☆ Cluster Specific Representation Learning + + +
+ Representation learning aims to extract meaningful lower-dimensional +embeddings from data, known as representations. Despite its widespread +application, there is no established definition of a ``good'' representation. +Typically, the representation quality is evaluated based on its performance in +downstream tasks such as clustering, de-noising, etc. However, this +task-specific approach has a limitation where a representation that performs +well for one task may not necessarily be effective for another. This highlights +the need for a more agnostic formulation, which is the focus of our work. We +propose a downstream-agnostic formulation: when inherent clusters exist in the +data, the representations should be specific to each cluster. Under this idea, +we develop a meta-algorithm that jointly learns cluster-specific +representations and cluster assignments. As our approach is easy to integrate +with any representation learning framework, we demonstrate its effectiveness in +various setups, including Autoencoders, Variational Autoencoders, Contrastive +learning models, and Restricted Boltzmann Machines. We qualitatively compare +our cluster-specific embeddings to standard embeddings and downstream tasks +such as de-noising and clustering. While our method slightly increases runtime +and parameters compared to the standard model, the experiments clearly show +that it extracts the inherent cluster structures in the data, resulting in +improved performance in relevant applications. + +
+
+
+
+
+ + ☆ YT-30M: A multi-lingual multi-category dataset of YouTube comments + + +
+ This paper introduces two large-scale multilingual comment datasets, YT-30M
+(and YT-100K) from YouTube. The analysis in this paper is performed on a
+smaller sample (YT-100K) of YT-30M. Both the datasets: YT-30M (full) and
+YT-100K (randomly selected 100K sample from YT-30M) are publicly released for
+further research. YT-30M (YT-100K) contains 32236173 (108694) comments posted
+by YouTube channels that belong to YouTube categories. Each comment is
+associated with a video ID, comment ID, commentor name, commentor channel ID,
+comment text, upvotes, original channel ID and category of the YouTube channel
+(e.g., 'News & Politics', 'Science & Technology', etc.). 
+
+
+
+
+
+ + ☆ Validity and efficiency of the conformal CUSUM procedure + + +
+ In this paper we study the validity and efficiency of a conformal version of +the CUSUM procedure for change detection both experimentally and theoretically. + +
+
+ comment: 19 pages, 7 figures +
+
+
+
+
+ + ☆ State Frequency Estimation for Anomaly Detection + + +
+ Many works have studied the efficacy of state machines for detecting +anomalies within NetFlows. These works typically learn a model from unlabeled +data and compute anomaly scores for arbitrary traces based on their likelihood +of occurrence or how well they fit within the model. However, these methods do +not dynamically adapt their scores based on the traces seen at test time. This +becomes a problem when an adversary produces seemingly common traces in their +attack, causing the model to miss the detection by assigning low anomaly +scores. We propose SEQUENT, a new approach that uses the state visit frequency +to adapt its scoring for anomaly detection dynamically. SEQUENT subsequently +uses the scores to generate root causes for anomalies. These allow the grouping +of alarms and simplify the analysis of anomalies. Our evaluation of SEQUENT on +three NetFlow datasets indicates that our approach outperforms existing +methods, demonstrating its effectiveness in detecting anomalies. + +
+
+ comment: 9 pages +
+
+
+
+
+ + ☆ PBP: Post-training Backdoor Purification for Malware Classifiers NDSS 2025 + + +
+ In recent years, the rise of machine learning (ML) in cybersecurity has +brought new challenges, including the increasing threat of backdoor poisoning +attacks on ML malware classifiers. For instance, adversaries could inject +malicious samples into public malware repositories, contaminating the training +data and potentially misclassifying malware by the ML model. Current +countermeasures predominantly focus on detecting poisoned samples by leveraging +disagreements within the outputs of a diverse set of ensemble models on +training data points. However, these methods are not suitable for scenarios +where Machine Learning-as-a-Service (MLaaS) is used or when users aim to remove +backdoors from a model after it has been trained. Addressing this scenario, we +introduce PBP, a post-training defense for malware classifiers that mitigates +various types of backdoor embeddings without assuming any specific backdoor +embedding mechanism. Our method exploits the influence of backdoor attacks on +the activation distribution of neural networks, independent of the +trigger-embedding method. In the presence of a backdoor attack, the activation +distribution of each layer is distorted into a mixture of distributions. By +regulating the statistics of the batch normalization layers, we can guide a +backdoored model to perform similarly to a clean one. Our method demonstrates +substantial advantages over several state-of-the-art methods, as evidenced by +experiments on two datasets, two types of backdoor methods, and various attack +configurations. Notably, our approach requires only a small portion of the +training data -- only 1\% -- to purify the backdoor and reduce the attack +success rate from 100\% to almost 0\%, a 100-fold improvement over the baseline +methods. Our code is available at +\url{https://github.com/judydnguyen/pbp-backdoor-purification-official}. + +
+
+ comment: Accepted at NDSS 2025 +
+
+
+
+
+ + ☆ SINGER: Vivid Audio-driven Singing Video Generation with Multi-scale + Spectral Diffusion Model + + +
+ Recent advancements in generative models have significantly enhanced talking +face video generation, yet singing video generation remains underexplored. The +differences between human talking and singing limit the performance of existing +talking face video generation models when applied to singing. The fundamental +differences between talking and singing-specifically in audio characteristics +and behavioral expressions-limit the effectiveness of existing models. We +observe that the differences between singing and talking audios manifest in +terms of frequency and amplitude. To address this, we have designed a +multi-scale spectral module to help the model learn singing patterns in the +spectral domain. Additionally, we develop a spectral-filtering module that aids +the model in learning the human behaviors associated with singing audio. These +two modules are integrated into the diffusion model to enhance singing video +generation performance, resulting in our proposed model, SINGER. Furthermore, +the lack of high-quality real-world singing face videos has hindered the +development of the singing video generation community. To address this gap, we +have collected an in-the-wild audio-visual singing dataset to facilitate +research in this area. Our experiments demonstrate that SINGER is capable of +generating vivid singing videos and outperforms state-of-the-art methods in +both objective and subjective evaluations. + +
+
+
+
+
+ + ☆ Assessing Foundation Models' Transferability to Physiological Signals in + Precision Medicine + + +
+ The success of precision medicine requires computational models that can +effectively process and interpret diverse physiological signals across +heterogeneous patient populations. While foundation models have demonstrated +remarkable transfer capabilities across various domains, their effectiveness in +handling individual-specific physiological signals - crucial for precision +medicine - remains largely unexplored. This work introduces a systematic +pipeline for rapidly and efficiently evaluating foundation models' transfer +capabilities in medical contexts. Our pipeline employs a three-stage approach. +First, it leverages physiological simulation software to generate diverse, +clinically relevant scenarios, particularly focusing on data-scarce medical +conditions. This simulation-based approach enables both targeted capability +assessment and subsequent model fine-tuning. Second, the pipeline projects +these simulated signals through the foundation model to obtain embeddings, +which are then evaluated using linear methods. This evaluation quantifies the +model's ability to capture three critical aspects: physiological feature +independence, temporal dynamics preservation, and medical scenario +differentiation. Finally, the pipeline validates these representations through +specific downstream medical tasks. Initial testing of our pipeline on the +Moirai time series foundation model revealed significant limitations in +physiological signal processing, including feature entanglement, temporal +dynamics distortion, and reduced scenario discrimination. These findings +suggest that current foundation models may require substantial architectural +modifications or targeted fine-tuning before deployment in clinical settings. + +
+
+ comment: Presented at the precision medicine workshop at the AI in Medicine + conference (2024) in Salt Lake City +
+
+
+
+
+ + ☆ Learning Semantic Association Rules from Internet of Things Data + + +
+ Association Rule Mining (ARM) is the task of discovering commonalities in +data in the form of logical implications. ARM is used in the Internet of Things +(IoT) for different tasks including monitoring and decision-making. However, +existing methods give limited consideration to IoT-specific requirements such +as heterogeneity and volume. Furthermore, they do not utilize important static +domain-specific description data about IoT systems, which is increasingly +represented as knowledge graphs. In this paper, we propose a novel ARM pipeline +for IoT data that utilizes both dynamic sensor data and static IoT system +metadata. Furthermore, we propose an Autoencoder-based Neurosymbolic ARM method +(Aerial) as part of the pipeline to address the high volume of IoT data and +reduce the total number of rules that are resource-intensive to process. Aerial +learns a neural representation of a given data and extracts association rules +from this representation by exploiting the reconstruction (decoding) mechanism +of an autoencoder. Extensive evaluations on 3 IoT datasets from 2 domains show +that ARM on both static and dynamic IoT data results in more generically +applicable rules while Aerial can learn a more concise set of high-quality +association rules than the state-of-the-art with full coverage over the +datasets. + +
+
+
+
+
+ + ☆ Deep Operator BSDE: a Numerical Scheme to Approximate the Solution + Operators + + +
+ Motivated by dynamic risk measures and conditional $g$-expectations, in this +work we propose a numerical method to approximate the solution operator given +by a Backward Stochastic Differential Equation (BSDE). The main ingredients for +this are the Wiener chaos decomposition and the classical Euler scheme for +BSDEs. We show convergence of this scheme under very mild assumptions, and +provide a rate of convergence in more restrictive cases. We then implement it +using neural networks, and we present several numerical examples where we can +check the accuracy of the method. + +
+
+
+
+
+ + ☆ Can neural operators always be continuously discretized? + + +
+ We consider the problem of discretization of neural operators between Hilbert +spaces in a general framework including skip connections. We focus on bijective +neural operators through the lens of diffeomorphisms in infinite dimensions. +Framed using category theory, we give a no-go theorem that shows that +diffeomorphisms between Hilbert spaces or Hilbert manifolds may not admit any +continuous approximations by diffeomorphisms on finite-dimensional spaces, even +if the approximations are nonlinear. The natural way out is the introduction of +strongly monotone diffeomorphisms and layerwise strongly monotone neural +operators which have continuous approximations by strongly monotone +diffeomorphisms on finite-dimensional spaces. For these, one can guarantee +discretization invariance, while ensuring that finite-dimensional +approximations converge not only as sequences of functions, but that their +representations converge in a suitable sense as well. Finally, we show that +bilipschitz neural operators may always be written in the form of an +alternating composition of strongly monotone neural operators, plus a simple +isometry. Thus we realize a rigorous platform for discretization of a +generalization of a neural operator. We also show that neural operators of this +type may be approximated through the composition of finite-rank residual neural +operators, where each block is strongly monotone, and may be inverted locally +via iteration. We conclude by providing a quantitative approximation result for +the discretization of general bilipschitz neural operators. + +
+
+
+
+
+ + ☆ Risk-aware Classification via Uncertainty Quantification + + +
+ Autonomous and semi-autonomous systems are using deep learning models to +improve decision-making. However, deep classifiers can be overly confident in +their incorrect predictions, a major issue especially in safety-critical +domains. The present study introduces three foundational desiderata for +developing real-world risk-aware classification systems. Expanding upon the +previously proposed Evidential Deep Learning (EDL), we demonstrate the unity +between these principles and EDL's operational attributes. We then augment EDL +empowering autonomous agents to exercise discretion during structured +decision-making when uncertainty and risks are inherent. We rigorously examine +empirical scenarios to substantiate these theoretical innovations. In contrast +to existing risk-aware classifiers, our proposed methodologies consistently +exhibit superior performance, underscoring their transformative potential in +risk-conscious classification strategies. + +
+
+ comment: Accepted for publication in Expert Systems with Applications +
+
+
+
+
+ + ☆ Reactive Orchestration for Hierarchical Federated Learning Under a + Communication Cost Budget + + +
+ Deploying a Hierarchical Federated Learning (HFL) pipeline across the +computing continuum (CC) requires careful organization of participants into a +hierarchical structure with intermediate aggregation nodes between FL clients +and the global FL server. This is challenging to achieve due to (i) cost +constraints, (ii) varying data distributions, and (iii) the volatile operating +environment of the CC. In response to these challenges, we present a framework +for the adaptive orchestration of HFL pipelines, designed to be reactive to +client churn and infrastructure-level events, while balancing communication +cost and ML model accuracy. Our mechanisms identify and react to events that +cause HFL reconfiguration actions at runtime, building on multi-level +monitoring information (model accuracy, resource availability, resource cost). +Moreover, our framework introduces a generic methodology for estimating +reconfiguration costs to continuously re-evaluate the quality of adaptation +actions, while being extensible to optimize for various HFL performance +criteria. By extending the Kubernetes ecosystem, our framework demonstrates the +ability to react promptly and effectively to changes in the operating +environment, making the best of the available communication cost budget and +effectively balancing costs and ML performance at runtime. + +
+
+
+
+
+ + ☆ Classical Shadows with Improved Median-of-Means Estimation + + +
+ The classical shadows protocol, introduced by Huang et al. [Nat. Phys. 16, +1050 (2020)], makes use of the median-of-means (MoM) estimator to efficiently +estimate the expectation values of $M$ observables with failure probability +$\delta$ using only $\mathcal{O}(\log(M/\delta))$ measurements. In their +analysis, Huang et al. used loose constants in their asymptotic performance +bounds for simplicity. However, the specific values of these constants can +significantly affect the number of shots used in practical implementations. To +address this, we studied a modified MoM estimator proposed by Minsker [PMLR +195, 5925 (2023)] that uses optimal constants and involves a U-statistic over +the data set. For efficient estimation, we implemented two types of incomplete +U-statistics estimators, the first based on random sampling and the second +based on cyclically permuted sampling. We compared the performance of the +original and modified estimators when used with the classical shadows protocol +with single-qubit Clifford unitaries (Pauli measurements) for an Ising spin +chain, and global Clifford unitaries (Clifford measurements) for the +Greenberger-Horne-Zeilinger (GHZ) state. While the original estimator +outperformed the modified estimators for Pauli measurements, the modified +estimators showed improved performance over the original estimator for Clifford +measurements. Our findings highlight the importance of tailoring estimators to +specific measurement settings to optimize the performance of the classical +shadows protocol in practical applications. + +
+
+ comment: 15 pages, 13 figures +
+
+
+
+
+ + ☆ Granular Ball Twin Support Vector Machine with Universum Data + + +
+ Classification with support vector machines (SVM) often suffers from limited +performance when relying solely on labeled data from target classes and is +sensitive to noise and outliers. Incorporating prior knowledge from Universum +data and more robust data representations can enhance accuracy and efficiency. +Motivated by these findings, we propose a novel Granular Ball Twin Support +Vector Machine with Universum Data (GBU-TSVM) that extends the TSVM framework +to leverage both Universum samples and granular ball computing during model +training. Unlike existing TSVM methods, the proposed GBU-TSVM represents data +instances as hyper-balls rather than points in the feature space. This +innovative approach improves the model's robustness and efficiency, +particularly in handling noisy and large datasets. By grouping data points into +granular balls, the model achieves superior computational efficiency, increased +noise resistance, and enhanced interpretability. Additionally, the inclusion of +Universum data, which consists of samples that are not strictly from the target +classes, further refines the classification boundaries. This integration +enriches the model with contextual information, refining classification +boundaries and boosting overall accuracy. Experimental results on UCI benchmark +datasets demonstrate that the GBU-TSVM outperforms existing TSVM models in both +accuracy and computational efficiency. These findings highlight the potential +of the GBU-TSVM model in setting a new standard in data representation and +classification. + +
+
+
+
+
+ + ☆ AI-Driven Day-to-Day Route Choice + + +
+ Understanding travelers' route choices can help policymakers devise optimal +operational and planning strategies for both normal and abnormal circumstances. +However, existing choice modeling methods often rely on predefined assumptions +and struggle to capture the dynamic and adaptive nature of travel behavior. +Recently, Large Language Models (LLMs) have emerged as a promising alternative, +demonstrating remarkable ability to replicate human-like behaviors across +various fields. Despite this potential, their capacity to accurately simulate +human route choice behavior in transportation contexts remains doubtful. To +satisfy this curiosity, this paper investigates the potential of LLMs for route +choice modeling by introducing an LLM-empowered agent, "LLMTraveler." This +agent integrates an LLM as its core, equipped with a memory system that learns +from past experiences and makes decisions by balancing retrieved data and +personality traits. The study systematically evaluates the LLMTraveler's +ability to replicate human-like decision-making through two stages: (1) +analyzing its route-switching behavior in single origin-destination (OD) pair +congestion game scenarios, where it demonstrates patterns align with laboratory +data but are not fully explained by traditional models, and (2) testing its +capacity to model day-to-day (DTD) adaptive learning behaviors on the Ortuzar +and Willumsen (OW) network, producing results comparable to Multinomial Logit +(MNL) and Reinforcement Learning (RL) models. These experiments demonstrate +that the framework can partially replicate human-like decision-making in route +choice while providing natural language explanations for its decisions. This +capability offers valuable insights for transportation policymaking, such as +simulating traveler responses to new policies or changes in the network. + +
+
+
+
+
+ + ☆ On Approximability of $\ell_2^2$ Min-Sum Clustering + + +
+ The $\ell_2^2$ min-sum $k$-clustering problem is to partition an input set +into clusters $C_1,\ldots,C_k$ to minimize $\sum_{i=1}^k\sum_{p,q\in +C_i}\|p-q\|_2^2$. Although $\ell_2^2$ min-sum $k$-clustering is NP-hard, it is +not known whether it is NP-hard to approximate $\ell_2^2$ min-sum +$k$-clustering beyond a certain factor. + In this paper, we give the first hardness-of-approximation result for the +$\ell_2^2$ min-sum $k$-clustering problem. We show that it is NP-hard to +approximate the objective to a factor better than $1.056$ and moreover, +assuming a balanced variant of the Johnson Coverage Hypothesis, it is NP-hard +to approximate the objective to a factor better than 1.327. + We then complement our hardness result by giving the first +$(1+\varepsilon)$-coreset construction for $\ell_2^2$ min-sum $k$-clustering. +Our coreset uses $\mathcal{O}\left(k^{\varepsilon^{-4}}\right)$ space and can +be leveraged to achieve a polynomial-time approximation scheme with runtime +$nd\cdot f(k,\varepsilon^{-1})$, where $d$ is the underlying dimension of the +input dataset and $f$ is a fixed function. + Finally, we consider a learning-augmented setting, where the algorithm has +access to an oracle that outputs a label $i\in[k]$ for input point, thereby +implicitly partitioning the input dataset into $k$ clusters that induce an +approximately optimal solution, up to some amount of adversarial error +$\alpha\in\left[0,\frac{1}{2}\right)$. We give a polynomial-time algorithm that +outputs a $\frac{1+\gamma\alpha}{(1-\alpha)^2}$-approximation to $\ell_2^2$ +min-sum $k$-clustering, for a fixed constant $\gamma>0$. + +
+
+
+
+
+ + ☆ Multi-Action Restless Bandits with Weakly Coupled Constraints: + Simultaneous Learning and Control + + +
+ We study a system with finitely many groups of multi-action bandit processes, +each of which is a Markov decision process (MDP) with finite state and action +spaces and potentially different transition matrices when taking different +actions. The bandit processes of the same group share the same state and action +spaces and, given the same action that is taken, the same transition matrix. +All the bandit processes across various groups are subject to multiple weakly +coupled constraints over their state and action variables. Unlike the past +studies that focused on the offline case, we consider the online case without +assuming full knowledge of transition matrices and reward functions a priori +and propose an effective scheme that enables simultaneous learning and control. +We prove the convergence of the relevant processes in both the timeline and the +number of the bandit processes, referred to as the convergence in the time and +the magnitude dimensions. Moreover, we prove that the relevant processes +converge exponentially fast in the magnitude dimension, leading to +exponentially diminishing performance deviation between the proposed online +algorithms and offline optimality. + +
+
+ comment: 70 pages, 0 figures
+
+
+
+
+
+ + ☆ Scalable Bayesian Tensor Ring Factorization for Multiway Data Analysis ICONIP 2023 + + +
+ Tensor decompositions play a crucial role in numerous applications related to +multi-way data analysis. By employing a Bayesian framework with +sparsity-inducing priors, Bayesian Tensor Ring (BTR) factorization offers +probabilistic estimates and an effective approach for automatically adapting +the tensor ring rank during the learning process. However, previous BTR method +employs an Automatic Relevance Determination (ARD) prior, which can lead to +sub-optimal solutions. Besides, it solely focuses on continuous data, whereas +many applications involve discrete data. More importantly, it relies on the +Coordinate-Ascent Variational Inference (CAVI) algorithm, which is inadequate +for handling large tensors with extensive observations. These limitations +greatly limit its application scales and scopes, making it suitable only for +small-scale problems, such as image/video completion. To address these issues, +we propose a novel BTR model that incorporates a nonparametric Multiplicative +Gamma Process (MGP) prior, known for its superior accuracy in identifying +latent structures. To handle discrete data, we introduce the P\'olya-Gamma +augmentation for closed-form updates. Furthermore, we develop an efficient +Gibbs sampler for consistent posterior simulation, which reduces the +computational complexity of previous VI algorithm by two orders, and an online +EM algorithm that is scalable to extremely large tensors. To showcase the +advantages of our model, we conduct extensive experiments on both simulation +data and real-world applications. + +
+
+ comment: ICONIP 2023 +
+
+
+
+
+ + ☆ FlashAttention on a Napkin: A Diagrammatic Approach to Deep Learning + IO-Awareness + + +
+ Optimizing deep learning algorithms currently requires slow, manual +derivation, potentially leaving much performance untapped. Methods like +FlashAttention have achieved a x6 performance improvement over native PyTorch +by avoiding unnecessary data transfers, but required three iterations over +three years. Automated compiled methods have consistently lagged behind. GPUs +are limited by both transfers to processors and available compute, with +transfer bandwidth having improved at a far slower pace. Already, transfer +bandwidth accounts for 46% of GPU energy costs. This indicates the future of +energy and capital-efficient algorithms relies on improved consideration of +transfer costs (IO-awareness) and a systematic method for deriving optimized +algorithms. In this paper, we present a diagrammatic approach to deep learning +models which, with simple relabelings, derive optimal implementations and +performance models that consider low-level memory. Diagrams generalize down the +GPU hierarchy, providing a universal performance model for comparing hardware +and quantization choices. Diagrams generate pseudocode, which reveals the +application of hardware-specific features such as coalesced memory access, +tensor core operations, and overlapped computation. We present attention +algorithms for Ampere, which fits 13 warps per SM (FlashAttention fits 8), and +for Hopper, which has improved overlapping and may achieve 1.32 PFLOPs. + +
+
+
+
+
+ + ☆ Path-Guided Particle-based Sampling + + +
+ Particle-based Bayesian inference methods by sampling from a partition-free +target (posterior) distribution, e.g., Stein variational gradient descent +(SVGD), have attracted significant attention. We propose a path-guided +particle-based sampling~(PGPS) method based on a novel Log-weighted Shrinkage +(LwS) density path linking an initial distribution to the target distribution. +We propose to utilize a Neural network to learn a vector field motivated by the +Fokker-Planck equation of the designed density path. Particles, initiated from +the initial distribution, evolve according to the ordinary differential +equation defined by the vector field. The distribution of these particles is +guided along a density path from the initial distribution to the target +distribution. The proposed LwS density path allows for an efficient search of +modes of the target distribution while canonical methods fail. We theoretically +analyze the Wasserstein distance of the distribution of the PGPS-generated +samples and the target distribution due to approximation and discretization +errors. Practically, the proposed PGPS-LwS method demonstrates higher Bayesian +inference accuracy and better calibration ability in experiments conducted on +both synthetic and real-world Bayesian learning tasks, compared to baselines, +such as SVGD and Langevin dynamics, etc. + +
+
+
+
+
+ + ☆ Conveying Emotions to Robots through Touch and Sound + + +
+ Human emotions can be conveyed through nuanced touch gestures. However, there
+is a lack of understanding of how consistently emotions can be conveyed to
+robots through touch. This study explores the consistency of touch-based
+emotional expression toward a robot by integrating tactile and auditory sensory
+reading of affective haptic expressions. We developed a piezoresistive pressure
+sensor and used a microphone to mimic touch and sound channels, respectively.
+In a study with 28 participants, each conveyed 10 emotions to a robot using
+spontaneous touch gestures. Our findings reveal a statistically significant
+consistency in emotion expression among participants. However, some emotions
+obtained low intraclass correlation values. Additionally, certain emotions with
+similar levels of arousal or valence did not exhibit significant differences in
+the way they were conveyed. We subsequently constructed a multi-modal model
+integrating touch and audio features to decode the 10 emotions. A support
+vector machine (SVM) model demonstrated the highest accuracy, achieving 40% for
+10 classes, with "Attention" being the most accurately conveyed emotion at a
+balanced accuracy of 87.65%.
+
+
+
+
+
+
+ + ☆ Gaussian Processes for Probabilistic Estimates of Earthquake Ground + Shaking: A 1-D Proof-of-Concept NeurIPS 2024 + + +
+ Estimates of seismic wave speeds in the Earth (seismic velocity models) are +key input parameters to earthquake simulations for ground motion prediction. +Owing to the non-uniqueness of the seismic inverse problem, typically many +velocity models exist for any given region. The arbitrary choice of which +velocity model to use in earthquake simulations impacts ground motion +predictions. However, current hazard analysis methods do not account for this +source of uncertainty. We present a proof-of-concept ground motion prediction +workflow for incorporating uncertainties arising from inconsistencies between +existing seismic velocity models. Our analysis is based on the probabilistic +fusion of overlapping seismic velocity models using scalable Gaussian process +(GP) regression. Specifically, we fit a GP to two synthetic 1-D velocity +profiles simultaneously, and show that the predictive uncertainty accounts for +the differences between the models. We subsequently draw velocity model samples +from the predictive distribution and estimate peak ground displacement using +acoustic wave propagation through the velocity models. The resulting +distribution of possible ground motion amplitudes is much wider than would be +predicted by simulating shaking using only the two input velocity models. This +proof-of-concept illustrates the importance of probabilistic methods for +physics-based seismic hazard analysis. + +
+
+ comment: 8 pages, 2 figures, accepted in the Machine Learning and the Physical + Sciences Workshop at NeurIPS 2024 +
+
+
+
+
+ + ☆ Nonparametric Filtering, Estimation and Classification using Neural Jump + ODEs + + +
+ Neural Jump ODEs model the conditional expectation between observations by +neural ODEs and jump at arrival of new observations. They have demonstrated +effectiveness for fully data-driven online forecasting in settings with +irregular and partial observations, operating under weak regularity +assumptions. This work extends the framework to input-output systems, enabling +direct applications in online filtering and classification. We establish +theoretical convergence guarantees for this approach, providing a robust +solution to $L^2$-optimal filtering. Empirical experiments highlight the +model's superior performance over classical parametric methods, particularly in +scenarios with complex underlying distributions. These results emphasise the +approach's potential in time-sensitive domains such as finance and health +monitoring, where real-time accuracy is crucial. + +
+
+
+
+
+ + ☆ NeRF and Gaussian Splatting SLAM in the Wild + + +
+ Navigating outdoor environments with visual Simultaneous Localization and +Mapping (SLAM) systems poses significant challenges due to dynamic scenes, +lighting variations, and seasonal changes, requiring robust solutions. While +traditional SLAM methods struggle with adaptability, deep learning-based +approaches and emerging neural radiance fields as well as Gaussian +Splatting-based SLAM methods, offer promising alternatives. However, these +methods have primarily been evaluated in controlled indoor environments with +stable conditions, leaving a gap in understanding their performance in +unstructured and variable outdoor settings. This study addresses this gap by +evaluating these methods in natural outdoor environments, focusing on camera +tracking accuracy, robustness to environmental factors, and computational +efficiency, highlighting distinct trade-offs. Extensive evaluations demonstrate +that neural SLAM methods achieve superior robustness, particularly under +challenging conditions such as low light, but at a high computational cost. At +the same time, traditional methods perform the best across seasons but are +highly sensitive to variations in lighting conditions. The code of the +benchmark is publicly available at +https://github.com/iis-esslingen/nerf-3dgs-benchmark. + +
+
+ comment: 5 pages, 2 figures, 4 tables +
+
+
+
+
+ + ☆ Learning on One Mode: Addressing Multi-Modality in Offline Reinforcement + Learning + + +
+ Offline reinforcement learning (RL) seeks to learn optimal policies from +static datasets without interacting with the environment. A common challenge is +handling multi-modal action distributions, where multiple behaviours are +represented in the data. Existing methods often assume unimodal behaviour +policies, leading to suboptimal performance when this assumption is violated. +We propose Weighted Imitation Learning on One Mode (LOM), a novel approach that +focuses on learning from a single, promising mode of the behaviour policy. By +using a Gaussian mixture model to identify modes and selecting the best mode +based on expected returns, LOM avoids the pitfalls of averaging over +conflicting actions. Theoretically, we show that LOM improves performance while +maintaining simplicity in policy learning. Empirically, LOM outperforms +existing methods on standard D4RL benchmarks and demonstrates its effectiveness +in complex, multi-modal scenarios. + +
+
+
+
+
+ + ☆ Variable-Speed Teaching-Playback as Real-World Data Augmentation for + Imitation Learning + + +
+ Because imitation learning relies on human demonstrations in hard-to-simulate +settings, the inclusion of force control in this method has resulted in a +shortage of training data, even with a simple change in speed. Although the +field of data augmentation has addressed the lack of data, conventional methods +of data augmentation for robot manipulation are limited to simulation-based +methods or downsampling for position control. This paper proposes a novel +method of data augmentation that is applicable to force control and preserves +the advantages of real-world datasets. We applied teaching-playback at variable +speeds as real-world data augmentation to increase both the quantity and +quality of environmental reactions at variable speeds. An experiment was +conducted on bilateral control-based imitation learning using a method of +imitation learning equipped with position-force control. We evaluated the +effect of real-world data augmentation on two tasks, pick-and-place and wiping, +at variable speeds, each from two human demonstrations at fixed speed. The +results showed a maximum 55% increase in success rate from a simple change in +speed of real-world reactions and improved accuracy along the +duration/frequency command by gathering environmental reactions at variable +speeds. + +
+
+ comment: 16 pages, 12 figures, 4 tables. This is a preprint of an article + submitted for consideration in ADVANCED ROBOTICS, copyright Taylor & Francis + and Robotics Society of Japan; ADVANCED ROBOTICS is available online at + http://www.tandfonline.com/ +
+
+
+
+
+ + ☆ Dynamic Consistent $k$-Center Clustering with Optimal Recourse + + +
+ Given points from an arbitrary metric space and a sequence of point updates
+sent by an adversary, what is the minimum recourse per update (i.e., the
+minimum number of changes needed to the set of centers after an update), in
+order to maintain a constant-factor approximation to a $k$-clustering problem?
+This question has received attention in recent years under the name consistent
+clustering.
+ Previous works by Lattanzi and Vassilvitskii [ICML '17] and Fichtenberger,
+Lattanzi, Norouzi-Fard, and Svensson [SODA '21] studied $k$-clustering
+objectives, including the $k$-center and the $k$-median objectives, under only
+point insertions. In this paper we study the $k$-center objective in the fully
+dynamic setting, where the update is either a point insertion or a point
+deletion. Before our work, Łącki, Haeupler, Grunau, Rozhoň, and
+Jayaram [SODA '24] gave a deterministic fully dynamic constant-factor
+approximation algorithm for the $k$-center objective with worst-case recourse
+of $2$ per update.
+ In this work, we prove that the $k$-center clustering problem admits optimal
+recourse bounds by developing a deterministic fully dynamic constant-factor
+approximation algorithm with worst-case recourse of $1$ per update. Moreover
+our algorithm performs simple choices based on light data structures, and thus
+is arguably more direct and faster than the previous one which uses a
+sophisticated combinatorial structure. Additionally, we develop a new
+deterministic decremental algorithm and a new deterministic incremental
+algorithm, both of which maintain a $6$-approximate $k$-center solution with
+worst-case recourse of $1$ per update. Our incremental algorithm improves over
+the $8$-approximation algorithm by Charikar, Chekuri, Feder, and Motwani [STOC
+'97]. Finally, we remark that since all three of our algorithms are
+deterministic, they work against an adaptive adversary.
+
+
+
+ comment: In Proceedings SODA 2025 +
+
+
+
+
+ + ☆ Channel Reflection: Knowledge-Driven Data Augmentation for EEG-Based + Brain-Computer Interfaces + + +
+ A brain-computer interface (BCI) enables direct communication between the +human brain and external devices. Electroencephalography (EEG) based BCIs are +currently the most popular for able-bodied users. To increase +user-friendliness, usually a small amount of user-specific EEG data are used +for calibration, which may not be enough to develop a pure data-driven decoding +model. To cope with this typical calibration data shortage challenge in +EEG-based BCIs, this paper proposes a parameter-free channel reflection (CR) +data augmentation approach that incorporates prior knowledge on the channel +distributions of different BCI paradigms in data augmentation. Experiments on +eight public EEG datasets across four different BCI paradigms (motor imagery, +steady-state visual evoked potential, P300, and seizure classifications) using +different decoding algorithms demonstrated that: 1) CR is effective, i.e., it +can noticeably improve the classification accuracy; 2) CR is robust, i.e., it +consistently outperforms existing data augmentation approaches in the +literature; and, 3) CR is flexible, i.e., it can be combined with other data +augmentation approaches to further increase the performance. We suggest that +data augmentation approaches like CR should be an essential step in EEG-based +BCIs. Our code is available online. + +
+
+
+
+
+ + ☆ Survey of different Large Language Model Architectures: Trends, + Benchmarks, and Challenges + + +
+ Large Language Models (LLMs) represent a class of deep learning models adept +at understanding natural language and generating coherent responses to various +prompts or queries. These models far exceed the complexity of conventional +neural networks, often encompassing dozens of neural network layers and +containing billions to trillions of parameters. They are typically trained on +vast datasets, utilizing architectures based on transformer blocks. Present-day +LLMs are multi-functional, capable of performing a range of tasks from text +generation and language translation to question answering, as well as code +generation and analysis. An advanced subset of these models, known as +Multimodal Large Language Models (MLLMs), extends LLM capabilities to process +and interpret multiple data modalities, including images, audio, and video. +This enhancement empowers MLLMs with capabilities like video editing, image +comprehension, and captioning for visual content. This survey provides a +comprehensive overview of the recent advancements in LLMs. We begin by tracing +the evolution of LLMs and subsequently delve into the advent and nuances of +MLLMs. We analyze emerging state-of-the-art MLLMs, exploring their technical +features, strengths, and limitations. Additionally, we present a comparative +analysis of these models and discuss their challenges, potential limitations, +and prospects for future development. + +
+
+
+
+
+ + ☆ Beyond [cls]: Exploring the true potential of Masked Image Modeling + representations + + +
+ Masked Image Modeling (MIM) has emerged as a popular method for +Self-Supervised Learning (SSL) of visual representations. However, for +high-level perception tasks, MIM-pretrained models offer lower out-of-the-box +representation quality than the Joint-Embedding Architectures (JEA) - another +prominent SSL paradigm. To understand this performance gap, we analyze the +information flow in Vision Transformers (ViT) learned by both approaches. We +reveal that whereas JEAs construct their representation on a selected set of +relevant image fragments, MIM models aggregate nearly whole image content. +Moreover, we demonstrate that MIM-trained ViTs retain valuable information +within their patch tokens, which is not effectively captured by the global +[cls] token representations. Therefore, selective aggregation of relevant patch +tokens, without any fine-tuning, results in consistently higher-quality of MIM +representations. To our knowledge, we are the first to highlight the lack of +effective representation aggregation as an emergent issue of MIM and propose +directions to address it, contributing to future advances in Self-Supervised +Learning. + +
+
+
+
+
+ + ☆ Continual Low-Rank Scaled Dot-product Attention + + +
+ Transformers are widely used for their ability to capture data relations in
+sequence processing, with great success for a wide range of static tasks.
+However, the computational and memory footprint of their main component, i.e.,
+the Scaled Dot-product Attention, is commonly overlooked. This makes their
+adoption in applications involving stream data processing with constraints in
+response latency, computational and memory resources infeasible. Some works
+have proposed methods to lower the computational cost of transformers, i.e.
+low-rank approximations, sparsity in attention, and efficient formulations for
+Continual Inference. In this paper, we introduce a new formulation of the
+Scaled Dot-product Attention based on the Nyström approximation that is
+suitable for Continual Inference. In experiments on Online Audio Classification
+and Online Action Detection tasks, the proposed Continual Scaled Dot-product
+Attention can lower the number of operations by up to three orders of magnitude
+compared to the original Transformers while retaining the predictive
+performance of competing models.
+
+
+
+ comment: 11 pages, 7 figures +
+
+
+
+
+ + ☆ ClusterKV: Manipulating LLM KV Cache in Semantic Space for Recallable + Compression + + +
+ Large Language Models (LLMs) have been widely deployed in a variety of +applications, and the context length is rapidly increasing to handle tasks such +as long-document QA and complex logical reasoning. However, long context poses +significant challenges for inference efficiency, including high memory costs of +key-value (KV) cache and increased latency due to extensive memory accesses. +Recent works have proposed compressing KV cache to approximate computation, but +these methods either evict tokens permanently, never recalling them for later +inference, or recall previous tokens at the granularity of pages divided by +textual positions. Both approaches degrade the model accuracy and output +quality. To achieve efficient and accurate recallable KV cache compression, we +introduce ClusterKV, which recalls tokens at the granularity of semantic +clusters. We design and implement efficient algorithms and systems for +clustering, selection, indexing and caching. Experiment results show that +ClusterKV attains negligible accuracy loss across various tasks with 32k +context lengths, using only a 1k to 2k KV cache budget, and achieves up to a +2$\times$ speedup in latency and a 2.5$\times$ improvement in decoding +throughput. Compared to SoTA recallable KV compression methods, ClusterKV +demonstrates higher model accuracy and output quality, while maintaining or +exceeding inference efficiency. + +
+
+
+
+
+ + ☆ Semi-Supervised Transfer Boosting (SS-TrBoosting) + + +
+ Semi-supervised domain adaptation (SSDA) aims at training a high-performance +model for a target domain using few labeled target data, many unlabeled target +data, and plenty of auxiliary data from a source domain. Previous works in SSDA +mainly focused on learning transferable representations across domains. +However, it is difficult to find a feature space where the source and target +domains share the same conditional probability distribution. Additionally, +there is no flexible and effective strategy extending existing unsupervised +domain adaptation (UDA) approaches to SSDA settings. In order to solve the +above two challenges, we propose a novel fine-tuning framework, semi-supervised +transfer boosting (SS-TrBoosting). Given a well-trained deep learning-based UDA +or SSDA model, we use it as the initial model, generate additional base +learners by boosting, and then use all of them as an ensemble. More +specifically, half of the base learners are generated by supervised domain +adaptation, and half by semi-supervised learning. Furthermore, for more +efficient data transmission and better data privacy protection, we propose a +source data generation approach to extend SS-TrBoosting to semi-supervised +source-free domain adaptation (SS-SFDA). Extensive experiments showed that +SS-TrBoosting can be applied to a variety of existing UDA, SSDA and SFDA +approaches to further improve their performance. + +
+
+
+
+
+ + ☆ Node Classification With Integrated Reject Option + + +
+ One of the key tasks in graph learning is node classification. While Graph +neural networks have been used for various applications, their adaptivity to +reject option setting is not previously explored. In this paper, we propose +NCwR, a novel approach to node classification in Graph Neural Networks (GNNs) +with an integrated reject option, which allows the model to abstain from making +predictions when uncertainty is high. We propose both cost-based and +coverage-based methods for classification with abstention in node +classification setting using GNNs. We perform experiments using our method on +three standard citation network datasets Cora, Citeseer and Pubmed and compare +with relevant baselines. We also model the Legal judgment prediction problem on +ILDC dataset as a node classification problem where nodes represent legal cases +and edges represent citations. We further interpret the model by analyzing the +cases that the model abstains from predicting by visualizing which part of the +input features influenced this decision. + +
+
+
+
+
+ + ☆ Semi-decentralized Training of Spatio-Temporal Graph Neural Networks for + Traffic Prediction + + +
+ In smart mobility, large networks of geographically distributed sensors +produce vast amounts of high-frequency spatio-temporal data that must be +processed in real time to avoid major disruptions. Traditional centralized +approaches are increasingly unsuitable to this task, as they struggle to scale +with expanding sensor networks, and reliability issues in central components +can easily affect the whole deployment. To address these challenges, we explore +and adapt semi-decentralized training techniques for Spatio-Temporal Graph +Neural Networks (ST-GNNs) in smart mobility domain. We implement a simulation +framework where sensors are grouped by proximity into multiple cloudlets, each +handling a subgraph of the traffic graph, fetching node features from other +cloudlets to train its own local ST-GNN model, and exchanging model updates +with other cloudlets to ensure consistency, enhancing scalability and removing +reliance on a centralized aggregator. We perform extensive comparative +evaluation of four different ST-GNN training setups -- centralized, traditional +FL, server-free FL, and Gossip Learning -- on large-scale traffic datasets, the +METR-LA and PeMS-BAY datasets, for short-, mid-, and long-term vehicle speed +predictions. Experimental results show that semi-decentralized setups are +comparable to centralized approaches in performance metrics, while offering +advantages in terms of scalability and fault tolerance. In addition, we +highlight often overlooked issues in existing literature for distributed +ST-GNNs, such as the variation in model performance across different +geographical areas due to region-specific traffic patterns, and the significant +communication overhead and computational costs that arise from the large +receptive field of GNNs, leading to substantial data transfers and increased +computation of partial embeddings. + +
+
+ comment: 8 pages, 4 figures, 3 tables, conference +
+
+
+
+
+ + ☆ Towards Understanding and Quantifying Uncertainty for Text-to-Image + Generation + + +
+ Uncertainty quantification in text-to-image (T2I) generative models is +crucial for understanding model behavior and improving output reliability. In +this paper, we are the first to quantify and evaluate the uncertainty of T2I +models with respect to the prompt. Alongside adapting existing approaches +designed to measure uncertainty in the image space, we also introduce +Prompt-based UNCertainty Estimation for T2I models (PUNC), a novel method +leveraging Large Vision-Language Models (LVLMs) to better address uncertainties +arising from the semantics of the prompt and generated images. PUNC utilizes a +LVLM to caption a generated image, and then compares the caption with the +original prompt in the more semantically meaningful text space. PUNC also +enables the disentanglement of both aleatoric and epistemic uncertainties via +precision and recall, which image-space approaches are unable to do. Extensive +experiments demonstrate that PUNC outperforms state-of-the-art uncertainty +estimation techniques across various settings. Uncertainty quantification in +text-to-image generation models can be used on various applications including +bias detection, copyright protection, and OOD detection. We also introduce a +comprehensive dataset of text prompts and generation pairs to foster further +research in uncertainty quantification for generative models. Our findings +illustrate that PUNC not only achieves competitive performance but also enables +novel applications in evaluating and improving the trustworthiness of +text-to-image models. + +
+
+ comment: 28 pages and 22 figures +
+
+
+
+
+ + ☆ LEP-QNN: Loan Eligibility Prediction Using Quantum Neural Networks + + +
+ Predicting loan eligibility with high accuracy remains a significant
+challenge in the finance sector. Accurate predictions enable financial
+institutions to make informed decisions, mitigate risks, and effectively adapt
+services to meet customer needs. However, the complexity and the
+high-dimensional nature of financial data have always posed significant
+challenges to achieving this level of precision. To overcome these issues, we
+propose a novel approach that employs Quantum Machine Learning (QML) for Loan
+Eligibility Prediction using Quantum Neural Networks (LEP-QNN). Our innovative
+approach achieves an accuracy of 98% in predicting loan eligibility from a
+single, comprehensive dataset. This performance boost is attributed to the
+strategic implementation of a dropout mechanism within the quantum circuit,
+aimed at minimizing overfitting and thereby improving the model's predictive
+reliability. In addition, our exploration of various optimizers leads to
+identifying the most efficient setup for our LEP-QNN framework, optimizing its
+performance. We also rigorously evaluate the resilience of LEP-QNN under
+different quantum noise scenarios, ensuring its robustness and dependability
+for quantum computing environments. This research showcases the potential of
+QML in financial predictions and establishes a foundational guide for advancing
+QML technologies, marking a step towards developing advanced, quantum-driven
+financial decision-making tools.
+
+
+
+ comment: 8 pages. 6 figures, 3 tables +
+
+
+
+
+ + ☆ Testing Neural Network Verifiers: A Soundness Benchmark with Hidden + Counterexamples + + +
+ In recent years, many neural network (NN) verifiers have been developed to +formally verify certain properties of neural networks such as robustness. +Although many benchmarks have been constructed to evaluate the performance of +NN verifiers, they typically lack a ground-truth for hard instances where no +current verifier can verify and no counterexample can be found, which makes it +difficult to check the soundness of a new verifier if it claims to verify hard +instances which no other verifier can do. We propose to develop a soundness +benchmark for NN verification. Our benchmark contains instances with +deliberately inserted counterexamples while we also try to hide the +counterexamples from regular adversarial attacks which can be used for finding +counterexamples. We design a training method to produce neural networks with +such hidden counterexamples. Our benchmark aims to be used for testing the +soundness of NN verifiers and identifying falsely claimed verifiability when it +is known that hidden counterexamples exist. We systematically construct our +benchmark and generate instances across diverse model architectures, activation +functions, input sizes, and perturbation radii. We demonstrate that our +benchmark successfully identifies bugs in state-of-the-art NN verifiers, as +well as synthetic bugs, providing a crucial step toward enhancing the +reliability of testing NN verifiers. Our code is available at +https://github.com/MVP-Harry/SoundnessBench and our benchmark is available at +https://huggingface.co/datasets/SoundnessBench/SoundnessBench. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Topological Trajectory Classification and Landmark Inference on + Simplicial Complexes + + +
+ We consider the problem of classifying trajectories on a discrete or +discretised 2-dimensional manifold modelled by a simplicial complex. Previous +works have proposed to project the trajectories into the harmonic eigenspace of +the Hodge Laplacian, and then cluster the resulting embeddings. However, if the +considered space has vanishing homology (i.e., no "holes"), then the harmonic +space of the 1-Hodge Laplacian is trivial and thus the approach fails. Here we +propose to view this issue akin to a sensor placement problem and present an +algorithm that aims to learn "optimal holes" to distinguish a set of given +trajectory classes. Specifically, given a set of labelled trajectories, which +we interpret as edge-flows on the underlying simplicial complex, we search for +2-simplices whose deletion results in an optimal separation of the trajectory +labels according to the corresponding spectral embedding of the trajectories +into the harmonic space. Finally, we generalise this approach to the +unsupervised setting. + +
+
+ comment: 5 pages, 4 figures, Accepted at the 58th Annual Asilomar Conference + on Signals, Systems, and Computers 2024 +
+
+
+
+
+ + ☆ Generalized Diffusion Model with Adjusted Offset Noise + + +
+ Diffusion models have become fundamental tools for modeling data +distributions in machine learning and have applications in image generation, +drug discovery, and audio synthesis. Despite their success, these models face +challenges when generating data with extreme brightness values, as evidenced by +limitations in widely used frameworks like Stable Diffusion. Offset noise has +been proposed as an empirical solution to this issue, yet its theoretical basis +remains insufficiently explored. In this paper, we propose a generalized +diffusion model that naturally incorporates additional noise within a rigorous +probabilistic framework. Our approach modifies both the forward and reverse +diffusion processes, enabling inputs to be diffused into Gaussian distributions +with arbitrary mean structures. We derive a loss function based on the evidence +lower bound, establishing its theoretical equivalence to offset noise with +certain adjustments, while broadening its applicability. Experiments on +synthetic datasets demonstrate that our model effectively addresses +brightness-related challenges and outperforms conventional methods in +high-dimensional scenarios. + +
+
+
+
+
+ + ☆ Unifying KV Cache Compression for Large Language Models with LeanKV + + +
+ Large language models (LLMs) demonstrate exceptional performance but incur +high serving costs due to substantial memory demands, with the key-value (KV) +cache being a primary bottleneck. Existing KV cache compression methods, +including quantization and pruning, struggle with limitations such as uniform +treatment of keys and values and static memory allocation across attention +heads. To address these challenges, we introduce LeanKV, a unified KV cache +compression framework that enhances LLM serving efficiency without compromising +accuracy through three innovations: (1) Hetero-KV quantization, which stores +keys at a higher precision than values to reflect their greater impact on +attention computations; (2) per-head dynamic sparsity, which allocates memory +based on token importance per head and per request; and (3) unified KV +compression, integrating mixed-precision quantization and selective pruning to +enable a smooth tradeoff between model accuracy and memory efficiency. To +efficiently support these techniques, LeanKV introduces systems optimizations +including unified paging and on-GPU parallel memory management. Implemented on +vLLM, LeanKV compresses the KV cache by $3.0\times$ to $5.0\times$ without +accuracy loss and up to $11.0\times$ with under 5% accuracy loss, enhancing +throughput by $1.9\times$ to $2.5\times$, and up to $6.9\times$. + +
+
+
+
+
+ + ☆ Sinkhorn Algorithm for Sequentially Composed Optimal Transports + + +
+ Sinkhorn algorithm is the de-facto standard approximation algorithm for +optimal transport, which has been applied to a variety of applications, +including image processing and natural language processing. In theory, the +proof of its convergence follows from the convergence of the Sinkhorn--Knopp +algorithm for the matrix scaling problem, and Altschuler et al. show that its +worst-case time complexity is in near-linear time. Very recently, sequentially +composed optimal transports were proposed by Watanabe and Isobe as a +hierarchical extension of optimal transports. In this paper, we present an +efficient approximation algorithm, namely Sinkhorn algorithm for sequentially +composed optimal transports, for its entropic regularization. Furthermore, we +present a theoretical analysis of the Sinkhorn algorithm, namely (i) its +exponential convergence to the optimal solution with respect to the Hilbert +pseudometric, and (ii) a worst-case complexity analysis for the case of one +sequential composition. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Few-Shot Learning with Adaptive Weight Masking in Conditional GANs + + +
+ Deep learning has revolutionized various fields, yet its efficacy is hindered +by overfitting and the requirement of extensive annotated data, particularly in +few-shot learning scenarios where limited samples are available. This paper +introduces a novel approach to few-shot learning by employing a Residual Weight +Masking Conditional Generative Adversarial Network (RWM-CGAN) for data +augmentation. The proposed model integrates residual units within the generator +to enhance network depth and sample quality, coupled with a weight mask +regularization technique in the discriminator to improve feature learning from +small-sample categories. This method addresses the core issues of robustness +and generalization in few-shot learning by providing a controlled and clear +augmentation of the sample space. Extensive experiments demonstrate that +RWM-CGAN not only expands the sample space effectively but also enriches the +diversity and quality of generated samples, leading to significant improvements +in detection and classification accuracy on public datasets. The paper +contributes to the advancement of few-shot learning by offering a practical +solution to the challenges posed by data scarcity and the need for rapid +generalization to new tasks or categories. + +
+
+
+
+
+ + ☆ Enhancing Recommendation Systems with GNNs and Addressing Over-Smoothing + + +
+ This paper addresses key challenges in enhancing recommendation systems by +leveraging Graph Neural Networks (GNNs) and addressing inherent limitations +such as over-smoothing, which reduces model effectiveness as network hierarchy +deepens. The proposed approach introduces three GNN-based recommendation +models, specifically designed to mitigate over-smoothing through innovative +mechanisms like residual connections and identity mapping within the +aggregation propagation process. These modifications enable more effective +information flow across layers, preserving essential user-item interaction +details to improve recommendation accuracy. Additionally, the study emphasizes +the critical need for interpretability in recommendation systems, aiming to +provide transparent and justifiable suggestions tailored to dynamic user +preferences. By integrating collaborative filtering with GNN architectures, the +proposed models not only enhance predictive accuracy but also align +recommendations more closely with individual behaviors, adapting to nuanced +shifts in user interests. This work advances the field by tackling both +technical and user-centric challenges, contributing to the development of +robust and explainable recommendation systems capable of managing the +complexity and scale of modern online environments. + +
+
+
+
+
+ + ☆ Revolve: Optimizing AI Systems by Tracking Response Evolution in Textual + Optimization + + +
+ Recent advancements in large language models (LLMs) have significantly +enhanced the ability of LLM-based systems to perform complex tasks through +natural language processing and tool interaction. However, optimizing these +LLM-based systems for specific tasks remains challenging, often requiring +manual interventions like prompt engineering and hyperparameter tuning. +Existing automatic optimization methods, such as textual feedback-based +techniques (e.g., TextGrad), tend to focus on immediate feedback, analogous to +using immediate derivatives in traditional numerical gradient descent. However, +relying solely on such feedback can be limited when the adjustments made in +response to this feedback are either too small or fluctuate irregularly, +potentially slowing down or even stalling the optimization process. To overcome +these challenges, more adaptive methods are needed, especially in situations +where the system's response is evolving slowly or unpredictably. In this paper, +we introduce REVOLVE, an optimization method that tracks how "R"esponses +"EVOLVE" across iterations in LLM systems. By focusing on the evolution of +responses over time, REVOLVE enables more stable and effective optimization by +making thoughtful, progressive adjustments at each step. Experimental results +demonstrate that REVOLVE outperforms competitive baselines, achieving a 7.8% +improvement in prompt optimization, a 20.72% gain in solution refinement, and a +29.17% increase in code optimization. Additionally, REVOLVE converges in fewer +iterations, resulting in significant computational savings. These advantages +highlight its adaptability and efficiency, positioning REVOLVE as a valuable +tool for optimizing LLM-based systems and accelerating the development of +next-generation AI technologies. Code is available at: +https://github.com/Peiyance/REVOLVE. + +
+
+ comment: 20 pages, 2 figures +
+
+
+
+
+ + ☆ Hybrid deep learning-based strategy for the hepatocellular carcinoma + cancer grade classification of H&E stained liver histopathology images + + +
+ Hepatocellular carcinoma (HCC) is a common type of liver cancer whose
+early-stage diagnosis is a common challenge, mainly due to the manual
+assessment of hematoxylin and eosin-stained whole slide images, which is a
+time-consuming process and may lead to variability in decision-making. For
+accurate detection of HCC, we propose a hybrid deep learning-based architecture
+that uses transfer learning to extract the features from pre-trained
+convolutional neural network (CNN) models and a classifier made up of a
+sequence of fully connected layers. This study uses a publicly available The
+Cancer Genome Atlas Hepatocellular Carcinoma (TCGA-LIHC) database (n=491) for
+model development and the database of Kasturba Gandhi Medical College (KMC), India
+for validation. The pre-processing step involves patch extraction, colour
+normalization, and augmentation that results in 3920 patches for the TCGA
+dataset. The developed hybrid deep neural network consisting of a CNN-based
+pre-trained feature extractor and a customized artificial neural network-based
+classifier is trained using five-fold cross-validation. For this study, eight
+different state-of-the-art models are trained and tested as feature extractors
+for the proposed hybrid model. The proposed hybrid model with ResNet50-based
+feature extractor provided the sensitivity, specificity, F1-score, accuracy,
+and AUC of 100.00%, 100.00%, 100.00%, 100.00%, and 1.00, respectively on the
+TCGA database. On the KMC database, EfficientNetb3 resulted in the optimal
+choice of the feature extractor giving sensitivity, specificity, F1-score,
+accuracy, and AUC of 96.97, 98.85, 96.71, 96.71, and 0.99, respectively. The
+proposed hybrid models showed improvement in accuracy of 2% and 4% over the
+pre-trained models in TCGA-LIHC and KMC databases.
+
+
+ comment: 14 figures, 9 tables
+
+
+
+
+
+ + ☆ A Scalable Quantum Neural Network for Approximate SRBB-Based Unitary + Synthesis + + +
+ In this work, scalable quantum neural networks are introduced to approximate +unitary evolutions through the Standard Recursive Block Basis (SRBB) and, +subsequently, redesigned with a reduced number of CNOTs. This algebraic +approach to the problem of unitary synthesis exploits Lie algebras and their +topological features to obtain scalable parameterizations of unitary operators. +First, the recursive algorithm that builds the SRBB is presented, framed in the +original scalability scheme already known to the literature only from a +theoretical point of view. Unexpectedly, 2-qubit systems emerge as a special +case outside this scheme. Furthermore, an algorithm to reduce the number of +CNOTs is proposed, thus deriving a new implementable scaling scheme that +requires one single layer of approximation. From the mathematical algorithm, +the scalable CNOT-reduced quantum neural network is implemented and its +performance is assessed with a variety of different unitary matrices, both +sparse and dense, up to 6 qubits via the PennyLane library. The effectiveness +of the approximation is measured with different metrics in relation to two +optimizers: a gradient-based method and the Nelder-Mead method. The approximate +SRBB-based synthesis algorithm with CNOT-reduction is also tested on real +hardware and compared with other valid approximation and decomposition methods +available in the literature. + +
+
+ comment: Journal +
+
+
+
+
+ + ☆ UTSD: Unified Time Series Diffusion Model + + +
+ Transformer-based architectures have achieved unprecedented success in time +series analysis. However, facing the challenge of across-domain modeling, +existing studies utilize statistical prior as prompt engineering fails under +the huge distribution shift among various domains. In this paper, a Unified +Time Series Diffusion (UTSD) model is established for the first time to model +the multi-domain probability distribution, utilizing the powerful probability +distribution modeling ability of Diffusion. Unlike the autoregressive models +that capture the conditional probabilities of the prediction horizon to the +historical sequence, we use a diffusion denoising process to model the mixture +distribution of the cross-domain data and generate the prediction sequence for +the target domain directly utilizing conditional sampling. The proposed UTSD +contains three pivotal designs: (1) The condition network captures the +multi-scale fluctuation patterns from the observation sequence, which are +utilized as context representations to guide the denoising network to generate +the prediction sequence; (2) Adapter-based fine-tuning strategy, the +multi-domain universal representation learned in the pretraining stage is +utilized for downstream tasks in target domains; (3) The diffusion and +denoising process on the actual sequence space, combined with the improved +classifier free guidance as the conditional generation strategy, greatly +improves the stability and accuracy of the downstream task. We conduct +extensive experiments on mainstream benchmarks, and the pre-trained UTSD +outperforms existing foundation models on all data domains, exhibiting superior +zero-shot generalization ability. After training from scratch, UTSD achieves +comparable performance against domain-specific proprietary models. The +empirical results validate the potential of UTSD as a time series foundational +model. + +
+
+
+
+
+ + ☆ Point-GN: A Non-Parametric Network Using Gaussian Positional Encoding + for Point Cloud Classification WACV + + +
+ This paper introduces Point-GN, a novel non-parametric network for efficient +and accurate 3D point cloud classification. Unlike conventional deep learning +models that rely on a large number of trainable parameters, Point-GN leverages +non-learnable components-specifically, Farthest Point Sampling (FPS), k-Nearest +Neighbors (k-NN), and Gaussian Positional Encoding (GPE)-to extract both local +and global geometric features. This design eliminates the need for additional +training while maintaining high performance, making Point-GN particularly +suited for real-time, resource-constrained applications. We evaluate Point-GN +on two benchmark datasets, ModelNet40 and ScanObjectNN, achieving +classification accuracies of 85.29% and 85.89%, respectively, while +significantly reducing computational complexity. Point-GN outperforms existing +non-parametric methods and matches the performance of fully trained models, all +with zero learnable parameters. Our results demonstrate that Point-GN is a +promising solution for 3D point cloud classification in practical, real-time +environments. + +
+
+ comment: This paper has been accepted for presentation at the IEEE Winter + Conference on Applications of Computer Vision (WACV) 2025 +
+
+
+
+
+ + ☆ Less is More: A Stealthy and Efficient Adversarial Attack Method for + DRL-based Autonomous Driving Policies + + +
+ Despite significant advancements in deep reinforcement learning (DRL)-based +autonomous driving policies, these policies still exhibit vulnerability to +adversarial attacks. This vulnerability poses a formidable challenge to the +practical deployment of these policies in autonomous driving. Designing +effective adversarial attacks is an indispensable prerequisite for enhancing +the robustness of these policies. In view of this, we present a novel stealthy +and efficient adversarial attack method for DRL-based autonomous driving +policies. Specifically, we introduce a DRL-based adversary designed to trigger +safety violations (e.g., collisions) by injecting adversarial samples at +critical moments. We model the attack as a mixed-integer optimization problem +and formulate it as a Markov decision process. Then, we train the adversary to +learn the optimal policy for attacking at critical moments without domain +knowledge. Furthermore, we introduce attack-related information and a +trajectory clipping method to enhance the learning capability of the adversary. +Finally, we validate our method in an unprotected left-turn scenario across +different traffic densities. The experimental results show that our method +achieves more than 90% collision rate within three attacks in most cases. +Furthermore, our method achieves more than 130% improvement in attack +efficiency compared to the unlimited attack method. + +
+
+
+
+
+ + ☆ MILLION: A General Multi-Objective Framework with Controllable Risk for + Portfolio Management VLDB 2025 + + +
+ Portfolio management is an important yet challenging task in AI for FinTech, +which aims to allocate investors' budgets among different assets to balance the +risk and return of an investment. In this study, we propose a general +Multi-objectIve framework with controLLable rIsk for pOrtfolio maNagement +(MILLION), which consists of two main phases, i.e., return-related maximization +and risk control. Specifically, in the return-related maximization phase, we +introduce two auxiliary objectives, i.e., return rate prediction, and return +rate ranking, combined with portfolio optimization to remit the overfitting +problem and improve the generalization of the trained model to future markets. +Subsequently, in the risk control phase, we propose two methods, i.e., +portfolio interpolation and portfolio improvement, to achieve fine-grained risk +control and fast risk adaption to a user-specified risk level. For the +portfolio interpolation method, we theoretically prove that the risk can be +perfectly controlled if the to-be-set risk level is in a proper interval. In +addition, we also show that the return rate of the adjusted portfolio after +portfolio interpolation is no less than that of the min-variance optimization, +as long as the model in the reward maximization phase is effective. +Furthermore, the portfolio improvement method can achieve greater return rates +while keeping the same risk level compared to portfolio interpolation. +Extensive experiments are conducted on three real-world datasets. The results +demonstrate the effectiveness and efficiency of the proposed framework. + +
+
+ comment: accepted by VLDB 2025 +
+
+
+
+
+ + ☆ A Granger-Causal Perspective on Gradient Descent with Application to + Pruning + + +
+ Stochastic Gradient Descent (SGD) is the main approach to optimizing neural +networks. Several generalization properties of deep networks, such as +convergence to a flatter minima, are believed to arise from SGD. This article +explores the causality aspect of gradient descent. Specifically, we show that +the gradient descent procedure has an implicit granger-causal relationship +between the reduction in loss and a change in parameters. By suitable +modifications, we make this causal relationship explicit. A causal approach to +gradient descent has many significant applications which allow greater control. +In this article, we illustrate the significance of the causal approach using +the application of Pruning. The causal approach to pruning has several +interesting properties - (i) We observe a phase shift as the percentage of +pruned parameters increase. Such phase shift is indicative of an optimal +pruning strategy. (ii) After pruning, we see that minima becomes "flatter", +explaining the increase in accuracy after pruning weights. + +
+
+
+
+
+ + ☆ Hamiltonian-based neural networks for systems under nonholonomic + constraints + + +
+ There has been increasing interest in methodologies that incorporate physics +priors into neural network architectures to enhance their modeling +capabilities. A family of these methodologies that has gained traction are +Hamiltonian neural networks (HNN) and their variations. These architectures +explicitly encode Hamiltonian mechanics both in their structure and loss +function. Although Hamiltonian systems under nonholonomic constraints are in +general not Hamiltonian, it is possible to formulate them in pseudo-Hamiltonian +form, equipped with a Lie bracket which is almost Poisson. This opens the +possibility of using some principles of HNNs in systems under nonholonomic +constraints. The goal of the present work is to develop a modified Hamiltonian +neural network architecture capable of modeling Hamiltonian systems under +holonomic and nonholonomic constraints. A three-network parallel architecture +is proposed to simultaneously learn the Hamiltonian of the system, the +constraints, and their associated multipliers. A rolling disk and a ball on a +spinning table are considered as canonical examples to assess the performance +of the proposed Hamiltonian architecture. The experiments are then repeated +with a noisy training set to study modeling performance under more realistic +conditions. + +
+
+
+
+
+ + ☆ Learning Whole-Body Loco-Manipulation for Omni-Directional Task Space + Pose Tracking with a Wheeled-Quadrupedal-Manipulator + + +
+ In this paper, we study the whole-body loco-manipulation problem using
+reinforcement learning (RL). Specifically, we focus on the problem of how to
+coordinate the floating base and the robotic arm of a wheeled-quadrupedal
+manipulator robot to achieve direct six-dimensional (6D) end-effector (EE) pose
+tracking in task space. Different from conventional whole-body
+loco-manipulation problems that track both floating-base and end-effector
+commands, the direct EE pose tracking problem requires inherent balance among
+redundant degrees of freedom in the whole-body motion. We leverage RL to solve
+this challenging problem. To address the associated difficulties, we develop a
+novel reward fusion module (RFM) that systematically integrates reward terms
+corresponding to different tasks in a nonlinear manner. In such a way, the
+inherent multi-stage and hierarchical feature of the loco-manipulation problem
+can be carefully accommodated. By combining the proposed RFM with a
+teacher-student RL training paradigm, we present a complete RL scheme to
+achieve 6D EE pose tracking for the wheeled-quadruped manipulator robot.
+Extensive simulation and hardware experiments demonstrate the significance of
+the RFM. In particular, we enable smooth and precise tracking performance,
+achieving state-of-the-art tracking position error of less than 5 cm, and
+rotation error of less than 0.1 rad. Please refer to
+https://clearlab-sustech.github.io/RFM_loco_mani/ for more experimental videos.
+
+
+
+
+
+ + ☆ Data Acquisition for Improving Model Fairness using Reinforcement + Learning + + +
+ Machine learning systems are increasingly being used in critical decision +making such as healthcare, finance, and criminal justice. Concerns around their +fairness have resulted in several bias mitigation techniques that emphasize the +need for high-quality data to ensure fairer decisions. However, the role of +earlier stages of machine learning pipelines in mitigating model bias has not +been explored well. In this paper, we focus on the task of acquiring additional +labeled data points for training the downstream machine learning model to +rapidly improve its fairness. Since not all data points in a data pool are +equally beneficial to the task of fairness, we generate an ordering in which +data points should be acquired. We present DataSift, a data acquisition +framework based on the idea of data valuation that relies on partitioning and +multi-armed bandits to determine the most valuable data points to acquire. Over +several iterations, DataSift selects a partition and randomly samples a batch +of data points from the selected partition, evaluates the benefit of acquiring +the batch on model fairness, and updates the utility of partitions depending on +the benefit. To further improve the effectiveness and efficiency of evaluating +batches, we leverage influence functions that estimate the effect of acquiring +a batch without retraining the model. We empirically evaluate DataSift on +several real-world and synthetic datasets and show that the fairness of a +machine learning model can be significantly improved even while acquiring a few +data points. + +
+
+ comment: 19 pages, 9 figures +
+
+
+
+
+ + ☆ Provably Extending PageRank-based Local Clustering Algorithm to Weighted + Directed Graphs with Self-Loops and to Hypergraphs + + +
+ Local clustering aims to find a compact cluster near the given starting +instances. This work focuses on graph local clustering, which has broad +applications beyond graphs because of the internal connectivities within +various modalities. While most existing studies on local graph clustering adopt +the discrete graph setting (i.e., unweighted graphs without self-loops), +real-world graphs can be more complex. In this paper, we extend the +non-approximating Andersen-Chung-Lang ("ACL") algorithm beyond discrete graphs +and generalize its quadratic optimality to a wider range of graphs, including +weighted, directed, and self-looped graphs and hypergraphs. Specifically, +leveraging PageRank, we propose two algorithms: GeneralACL for graphs and +HyperACL for hypergraphs. We theoretically prove that, under two mild +conditions, both algorithms can identify a quadratically optimal local cluster +in terms of conductance with at least 1/2 probability. On the property of +hypergraphs, we address a fundamental gap in the literature by defining +conductance for hypergraphs from the perspective of hypergraph random walks. +Additionally, we provide experiments to validate our theoretical findings. + +
+
+ comment: Preprint, 42 pages +
+
+
+
+
+ + ☆ Preference-based Pure Exploration + + +
+ We study the preference-based pure exploration problem for bandits with
+vector-valued rewards. The rewards are ordered using a (given) preference cone
+$\mathcal{C}$ and our goal is to identify the set of Pareto optimal arms.
+First, to quantify the impact of preferences, we derive a novel lower bound on
+the sample complexity for identifying the most preferred policy with confidence
+level $1-\delta$. Our lower bound elicits the role played by the geometry of
+the preference cone and punctuates the difference in hardness compared to
+existing best-arm identification variants of the problem. We further explicate
+this geometry when rewards follow Gaussian distributions. We then provide a
+convex relaxation of the lower bound and leverage it to design the
+Preference-based Track and Stop (PreTS) algorithm that identifies the most
+preferred policy. Finally, we show that the sample complexity of PreTS is
+asymptotically tight by deriving a new concentration inequality for
+vector-valued rewards.
+
+
+
+
+
+ + ☆ Surveying the Effects of Quality, Diversity, and Complexity in Synthetic + Data From Large Language Models + + +
+ Synthetic data generation with Large Language Models is a promising paradigm +for augmenting natural data over a nearly infinite range of tasks. Given this +variety, direct comparisons among synthetic data generation algorithms are +scarce, making it difficult to understand where improvement comes from and what +bottlenecks exist. We propose to evaluate algorithms via the makeup of +synthetic data generated by each algorithm in terms of data quality, diversity, +and complexity. We choose these three characteristics for their significance in +open-ended processes and the impact each has on the capabilities of downstream +models. We find quality to be essential for in-distribution model +generalization, diversity to be essential for out-of-distribution +generalization, and complexity to be beneficial for both. Further, we emphasize +the existence of Quality-Diversity trade-offs in training data and the +downstream effects on model performance. We then examine the effect of various +components in the synthetic data pipeline on each data characteristic. This +examination allows us to taxonomize and compare synthetic data generation +algorithms through the components they utilize and the resulting effects on +data QDC composition. This analysis extends into a discussion on the importance +of balancing QDC in synthetic data for efficient reinforcement learning and +self-improvement algorithms. Analogous to the QD trade-offs in training data, +often there exist trade-offs between model output quality and output diversity +which impact the composition of synthetic data. We observe that many models are +currently evaluated and optimized only for output quality, thereby limiting +output diversity and the potential for self-improvement. We argue that +balancing these trade-offs is essential to the development of future +self-improvement algorithms and highlight a number of works making progress in +this direction. + +
+
+
+
+
+ + ☆ Theoretical limitations of multi-layer Transformer + + +
+ Transformers, especially the decoder-only variants, are the backbone of most +modern large language models; yet we do not have much understanding of their +expressive power except for the simple $1$-layer case. + Due to the difficulty of analyzing multi-layer models, all previous work +relies on unproven complexity conjectures to show limitations for multi-layer +Transformers. In this work, we prove the first $\textit{unconditional}$ lower +bound against multi-layer decoder-only transformers. For any constant $L$, we +prove that any $L$-layer decoder-only transformer needs a polynomial model +dimension ($n^{\Omega(1)}$) to perform sequential composition of $L$ functions +over an input of $n$ tokens. + As a consequence, our results give: (1) the first depth-width trade-off for +multi-layer transformers, exhibiting that the $L$-step composition task is +exponentially harder for $L$-layer models compared to $(L+1)$-layer ones; (2) +an unconditional separation between encoder and decoder, exhibiting a hard task +for decoders that can be solved by an exponentially shallower and smaller +encoder; (3) a provable advantage of chain-of-thought, exhibiting a task that +becomes exponentially easier with chain-of-thought. + On the technical side, we propose the multi-party $\textit{autoregressive}$ +$\textit{communication}$ $\textit{model}$ that captures the computation of a +decoder-only Transformer. We also introduce a new proof technique that finds a +certain $\textit{indistinguishable}$ $\textit{decomposition}$ of all possible +inputs iteratively for proving lower bounds in this model. We believe our new +communication model and proof technique will be helpful to further understand +the computational power of transformers. + +
+
+
+
+
+ + ☆ Unified Inductive Logic: From Formal Learning to Statistical Inference + to Supervised Learning + + +
+ While the traditional conception of inductive logic is Carnapian, I develop a +Peircean alternative and use it to unify formal learning theory, statistics, +and a significant part of machine learning: supervised learning. Some crucial +standards for evaluating non-deductive inferences have been assumed separately +in those areas, but can actually be justified by a unifying principle. + +
+
+
+
+
+ + ☆ How Many Ratings per Item are Necessary for Reliable Significance + Testing? + + +
+ Most approaches to machine learning evaluation assume that machine and human +responses are repeatable enough to be measured against data with unitary, +authoritative, "gold standard" responses, via simple metrics such as accuracy, +precision, and recall that assume scores are independent given the test item. +However, AI models have multiple sources of stochasticity and the human raters +who create gold standards tend to disagree with each other, often in meaningful +ways, hence a single output response per input item may not provide enough +information. We introduce methods for determining whether an (existing or +planned) evaluation dataset has enough responses per item to reliably compare +the performance of one model to another. We apply our methods to several of +very few extant gold standard test sets with multiple disaggregated responses +per item and show that there are usually not enough responses per item to +reliably compare the performance of one model against another. Our methods also +allow us to estimate the number of responses per item for hypothetical datasets +with similar response distributions to the existing datasets we study. When two +models are very far apart in their predictive performance, fewer raters are +needed to confidently compare them, as expected. However, as the models draw +closer, we find that a larger number of raters than are currently typical in +annotation collection are needed to ensure that the power analysis correctly +reflects the difference in performance. + +
+
+
+
+
+ + ☆ 3D Interaction Geometric Pre-training for Molecular Relational Learning + + +
+ Molecular Relational Learning (MRL) is a rapidly growing field that focuses +on understanding the interaction dynamics between molecules, which is crucial +for applications ranging from catalyst engineering to drug discovery. Despite +recent progress, earlier MRL approaches are limited to using only the 2D +topological structure of molecules, as obtaining the 3D interaction geometry +remains prohibitively expensive. This paper introduces a novel 3D geometric +pre-training strategy for MRL (3DMRL) that incorporates a 3D virtual +interaction environment, overcoming the limitations of costly traditional +quantum mechanical calculation methods. With the constructed 3D virtual +interaction environment, 3DMRL trains 2D MRL model to learn the overall 3D +geometric information of molecular interaction through contrastive learning. +Moreover, fine-grained interaction between molecules is learned through force +prediction loss, which is crucial in understanding the wide range of molecular +interaction processes. Extensive experiments on various tasks using real-world +datasets, including out-of-distribution and extrapolation scenarios, +demonstrate the effectiveness of 3DMRL, showing up to a 24.93\% improvement in +performance across 40 tasks. + +
+
+
+
+
+ + ☆ Incorporating System-level Safety Requirements in Perception Models via + Reinforcement Learning + + +
+ Perception components in autonomous systems are often developed and optimized +independently of downstream decision-making and control components, relying on +established performance metrics like accuracy, precision, and recall. +Traditional loss functions, such as cross-entropy loss and negative +log-likelihood, focus on reducing misclassification errors but fail to consider +their impact on system-level safety, overlooking the varying severities of +system-level failures caused by these errors. To address this limitation, we +propose a novel training paradigm that augments the perception component with +an understanding of system-level safety objectives. Central to our approach is +the translation of system-level safety requirements, formally specified using +the rulebook formalism, into safety scores. These scores are then incorporated +into the reward function of a reinforcement learning framework for fine-tuning +perception models with system-level safety objectives. Simulation results +demonstrate that models trained with this approach outperform baseline +perception models in terms of system-level safety. + +
+
+
+
+
+ + Who Brings the Frisbee: Probing Hidden Hallucination Factors in Large + Vision-Language Model via Causality Analysis WACV2025 + + +
+ Recent advancements in large vision-language models (LVLM) have significantly +enhanced their ability to comprehend visual inputs alongside natural language. +However, a major challenge in their real-world application is hallucination, +where LVLMs generate non-existent visual elements, eroding user trust. The +underlying mechanism driving this multimodal hallucination is poorly +understood. Minimal research has illuminated whether contexts such as sky, +tree, or grass field involve the LVLM in hallucinating a frisbee. We +hypothesize that hidden factors, such as objects, contexts, and semantic +foreground-background structures, induce hallucination. This study proposes a +novel causal approach: a hallucination probing system to identify these hidden +factors. By analyzing the causality between images, text prompts, and network +saliency, we systematically explore interventions to block these factors. Our +experimental findings show that a straightforward technique based on our +analysis can significantly reduce hallucinations. Additionally, our analyses +indicate the potential to edit network internals to minimize hallucinated +outputs. + +
+
+ comment: Accepted by WACV2025 +
+
+
+
+
+ + ☆ SAVER: A Toolbox for Sampling-Based, Probabilistic Verification of + Neural Networks + + +
+ We present a neural network verification toolbox to 1) assess the probability
+of satisfaction of a constraint, and 2) synthesize a set expansion factor to
+achieve the probability of satisfaction. Specifically, the toolbox establishes
+with a user-specified level of confidence whether the output of the neural
+network for a given input distribution is likely to be contained within a given
+set. Should the tool determine that the given set cannot satisfy the likelihood
+constraint, the tool also implements an approach outlined in this paper to
+alter the constraint set to ensure that the user-defined satisfaction
+probability is achieved. The toolbox is comprised of sampling-based approaches
+which exploit the properties of the signed distance function to define set
+containment.
+
+
+
+ comment: 7 pages, 8 figures, submitted to the 28th ACM International + Conference on Hybrid Systems: Computation and Control +
+
+
+
+
+ + ☆ BGTplanner: Maximizing Training Accuracy for Differentially Private + Federated Recommenders via Strategic Privacy Budget Allocation + + +
+ To mitigate the rising concern about privacy leakage, the federated +recommender (FR) paradigm emerges, in which decentralized clients co-train the +recommendation model without exposing their raw user-item rating data. The +differentially private federated recommender (DPFR) further enhances FR by +injecting differentially private (DP) noises into clients. Yet, current DPFRs, +suffering from noise distortion, cannot achieve satisfactory accuracy. Various +efforts have been dedicated to improving DPFRs by adaptively allocating the +privacy budget over the learning process. However, due to the intricate +relation between privacy budget allocation and model accuracy, existing works +are still far from maximizing DPFR accuracy. To address this challenge, we +develop BGTplanner (Budget Planner) to strategically allocate the privacy +budget for each round of DPFR training, improving overall training performance. +Specifically, we leverage the Gaussian process regression and historical +information to predict the change in recommendation accuracy with a certain +allocated privacy budget. Additionally, Contextual Multi-Armed Bandit (CMAB) is +harnessed to make privacy budget allocation decisions by reconciling the +current improvement and long-term privacy constraints. Our extensive +experimental results on real datasets demonstrate that \emph{BGTplanner} +achieves an average improvement of 6.76\% in training performance compared to +state-of-the-art baselines. + +
+
+
+
+
+ + ☆ Inverse Delayed Reinforcement Learning + + +
+ Inverse Reinforcement Learning (IRL) has demonstrated effectiveness in a +variety of imitation tasks. In this paper, we introduce an IRL framework +designed to extract rewarding features from expert trajectories affected by +delayed disturbances. Instead of relying on direct observations, our approach +employs an efficient off-policy adversarial training framework to derive expert +features and recover optimal policies from augmented delayed observations. +Empirical evaluations in the MuJoCo environment under diverse delay settings +validate the effectiveness of our method. Furthermore, we provide a theoretical +analysis showing that recovering expert policies from augmented delayed +observations outperforms using direct delayed observations. + +
+
+
+
+
+ + ☆ Harnessing Loss Decomposition for Long-Horizon Wave Predictions via Deep + Neural Networks NeurIPS + + +
+ Accurate prediction over long time horizons is crucial for modeling complex +physical processes such as wave propagation. Although deep neural networks show +promise for real-time forecasting, they often struggle with accumulating phase +and amplitude errors as predictions extend over a long period. To address this +issue, we propose a novel loss decomposition strategy that breaks down the loss +into separate phase and amplitude components. This technique improves the +long-term prediction accuracy of neural networks in wave propagation tasks by +explicitly accounting for numerical errors, improving stability, and reducing +error accumulation over extended forecasts. + +
+
+ comment: 6 pages, 4 figures, NeurIPS Machine Learning for Physical Sciences + workshop +
+
+
+
+
+ + ☆ Higher Order Transformers: Efficient Attention Mechanism for Tensor + Structured Data + + +
+ Transformers are now ubiquitous for sequence modeling tasks, but their +extension to multi-dimensional data remains a challenge due to the quadratic +cost of the attention mechanism. In this paper, we propose Higher-Order +Transformers (HOT), a novel architecture designed to efficiently process data +with more than two axes, i.e. higher-order tensors. To address the +computational challenges associated with high-order tensor attention, we +introduce a novel Kronecker factorized attention mechanism that reduces the +attention cost to quadratic in each axis' dimension, rather than quadratic in +the total size of the input tensor. To further enhance efficiency, HOT +leverages kernelized attention, reducing the complexity to linear. This +strategy maintains the model's expressiveness while enabling scalable attention +computation. We validate the effectiveness of HOT on two high-dimensional +tasks, including multivariate time series forecasting, and 3D medical image +classification. Experimental results demonstrate that HOT achieves competitive +performance while significantly improving computational efficiency, showcasing +its potential for tackling a wide range of complex, multi-dimensional data. + +
+
+
+
+
+ + ♻ ☆ Yo'LLaVA: Your Personalized Language and Vision Assistant NeurIPS 2024 + + +
+ Large Multimodal Models (LMMs) have shown remarkable capabilities across a +variety of tasks (e.g., image captioning, visual question answering). While +broad, their knowledge remains generic (e.g., recognizing a dog), and they are +unable to handle personalized subjects (e.g., recognizing a user's pet dog). +Human reasoning, in contrast, typically operates within the context of specific +subjects in our surroundings. For example, one might ask, "What should I buy +for my dog's birthday?"; as opposed to a generic inquiry about "What should I +buy for a dog's birthday?". Similarly, when looking at a friend's image, the +interest lies in seeing their activities (e.g., "my friend is holding a cat"), +rather than merely observing generic human actions (e.g., "a man is holding a +cat"). In this paper, we introduce the novel task of personalizing LMMs, so +that they can have conversations about a specific subject. We propose Yo'LLaVA, +which learns to embed a personalized subject into a set of latent tokens given +a handful of example images of the subject. Our qualitative and quantitative +analyses reveal that Yo'LLaVA can learn the concept more efficiently using +fewer tokens and more effectively encode the visual attributes compared to +strong prompting baselines (e.g., LLaVA). + +
+
+ comment: NeurIPS 2024; Project page: https://thaoshibe.github.io/YoLLaVA +
+
+
+
+
+ + ♻ ☆ DynaMITE-RL: A Dynamic Model for Improved Temporal Meta-Reinforcement + Learning + + +
+ We introduce DynaMITE-RL, a meta-reinforcement learning (meta-RL) approach to +approximate inference in environments where the latent state evolves at varying +rates. We model episode sessions - parts of the episode where the latent state +is fixed - and propose three key modifications to existing meta-RL methods: +consistency of latent information within sessions, session masking, and prior +latent conditioning. We demonstrate the importance of these modifications in +various domains, ranging from discrete Gridworld environments to +continuous-control and simulated robot assistive tasks, demonstrating that +DynaMITE-RL significantly outperforms state-of-the-art baselines in sample +efficiency and inference returns. + +
+
+
+
+
+ + ♻ ☆ Fast and reliable uncertainty quantification with neural network + ensembles for industrial image classification + + +
+ Image classification with neural networks (NNs) is widely used in industrial +processes, situations where the model likely encounters unknown objects during +deployment, i.e., out-of-distribution (OOD) data. Worryingly, NNs tend to make +confident yet incorrect predictions when confronted with OOD data. To increase +the models' reliability, they should quantify the uncertainty in their own +predictions, communicating when the output should (not) be trusted. Deep +ensembles, composed of multiple independent NNs, have been shown to perform +strongly but are computationally expensive. Recent research has proposed more +efficient NN ensembles, namely the snapshot, batch, and multi-input +multi-output ensemble. This study investigates the predictive and uncertainty +performance of efficient NN ensembles in the context of image classification +for industrial processes. It is the first to provide a comprehensive comparison +and it proposes a novel Diversity Quality metric to quantify the ensembles' +performance on the in-distribution and OOD sets in one single metric. The +results highlight the batch ensemble as a cost-effective and competitive +alternative to the deep ensemble. It matches the deep ensemble in both +uncertainty and accuracy while exhibiting considerable savings in training +time, test time, and memory storage. + +
+
+ comment: Submitted to Annals of Operations Research +
+
+
+
+
+ + ♻ ☆ Marconi: Prefix Caching for the Era of Hybrid LLMs + + +
+ Hybrid models that combine the language modeling capabilities of Attention +layers with the efficiency of Recurrent layers (e.g., State Space Models) have +gained traction in practically supporting long contexts in Large Language Model +serving. Yet, the unique properties of these models complicate the usage of +complementary efficiency optimizations such as prefix caching that skip +redundant computations across requests. Most notably, their use of in-place +state updates for recurrent layers precludes rolling back cache entries for +partial sequence overlaps, and instead mandates only exact-match cache hits; +the effect is a deluge of (large) cache entries per sequence, most of which +yield minimal reuse opportunities. We present Marconi, the first system that +supports efficient prefix caching with Hybrid LLMs. Key to Marconi are its +novel admission and eviction policies that more judiciously assess potential +cache entries based not only on recency, but also on (1) forecasts of their +reuse likelihood across a taxonomy of different hit scenarios, and (2) the +compute savings that hits deliver relative to memory footprints. Across diverse +workloads and Hybrid models, Marconi achieves up to 34.4$\times$ higher token +hit rates (71.1% or 617 ms lower TTFT) compared to state-of-the-art prefix +caching systems. + +
+
+
+
+
+ + ♻ ☆ Instance-Warp: Saliency Guided Image Warping for Unsupervised Domain + Adaptation WACV 2025 + + +
+ Driving is challenging in conditions like night, rain, and snow. Lack of good
+labeled datasets has hampered progress in scene understanding under such
+conditions. Unsupervised Domain Adaptation (UDA) using large labeled clear-day
+datasets is a promising research direction in such cases. However, many UDA
+methods are trained with dominant scene backgrounds (e.g., roads, sky,
+sidewalks) that appear dramatically different across domains. As a result, they
+struggle to learn effective features of smaller and often sparse foreground
+objects (e.g., people, vehicles, signs).
+ In this work, we improve UDA training by applying in-place image warping to
+focus on salient objects. We design instance-level saliency guidance to
+adaptively oversample object regions and undersample background areas, which
+reduces adverse effects from background context and enhances backbone feature
+learning. Our approach improves adaptation across geographies, lighting, and
+weather conditions, and is agnostic to the task (segmentation, detection),
+domain adaptation algorithm, saliency guidance, and underlying model
+architecture. Result highlights include +6.1 mAP50 for BDD100K Clear
+$\rightarrow$ DENSE Foggy, +3.7 mAP50 for BDD100K Day $\rightarrow$ Night, +3.0
+mAP50 for BDD100K Clear $\rightarrow$ Rainy, and +6.3 mIoU for Cityscapes
+$\rightarrow$ ACDC. Besides, our method adds minimal training memory and no
+additional inference latency. Code is available at
+https://github.com/ShenZheng2000/Instance-Warp
+
+
+
+ comment: WACV 2025 Accepted Paper +
+
+
+
+
+ + ♻ ☆ Privacy-Preserving Data Deduplication for Enhancing Federated Learning + of Language Models (Extended Version) NDSS + + +
+ Deduplication is a vital preprocessing step that enhances machine learning +model performance and saves training time and energy. However, enhancing +federated learning through deduplication poses challenges, especially regarding +scalability and potential privacy violations if deduplication involves sharing +all clients' data. In this paper, we address the problem of deduplication in a +federated setup by introducing a pioneering protocol, Efficient +Privacy-Preserving Multi-Party Deduplication (EP-MPD). It efficiently removes +duplicates from multiple clients' datasets without compromising data privacy. +EP-MPD is constructed in a modular fashion, utilizing two novel variants of the +Private Set Intersection protocol. Our extensive experiments demonstrate the +significant benefits of deduplication in federated learning of large language +models. For instance, we observe up to 19.62\% improvement in perplexity and up +to 27.95\% reduction in running time while varying the duplication level +between 10\% and 30\%. EP-MPD effectively balances privacy and performance in +federated learning, making it a valuable solution for large-scale applications. + +
+
+ comment: Accepted at the Network and Distributed Systems Security (NDSS) + Symposium, 2025 +
+
+
+
+
+ + ♻ ☆ Towards Time Series Reasoning with LLMs NeurIPS + + +
+ Multi-modal large language models (MLLMs) have enabled numerous advances in +understanding and reasoning in domains like vision, but we have not yet seen +this broad success for time-series. Although prior works on time-series MLLMs +have shown promising performance in time-series forecasting, very few works +show how an LLM could be used for time-series reasoning in natural language. We +propose a novel multi-modal time-series LLM approach that learns generalizable +information across various domains with powerful zero-shot performance. First, +we train a lightweight time-series encoder on top of an LLM to directly extract +time-series information. Then, we fine-tune our model with chain-of-thought +augmented time-series tasks to encourage the model to generate reasoning paths. +We show that our model learns a latent representation that reflects specific +time-series features (e.g. slope, frequency), as well as outperforming GPT-4o +on a set of zero-shot reasoning tasks on a variety of domains. + +
+
+ comment: Oral Presentation at 2024 NeurIPS Workshop on Time Series in the Age + of Large Models +
+
+
+
+
+ + ♻ ☆ Towards Size-Independent Generalization Bounds for Deep Operator Nets + + +
+ In recent times machine learning methods have made significant advances in +becoming a useful tool for analyzing physical systems. A particularly active +area in this theme has been "physics-informed machine learning" which focuses +on using neural nets for numerically solving differential equations. In this +work, we aim to advance the theory of measuring out-of-sample error while +training DeepONets - which is among the most versatile ways to solve P.D.E +systems in one-shot. Firstly, for a class of DeepONets, we prove a bound on +their Rademacher complexity which does not explicitly scale with the width of +the nets involved. Secondly, we use this to show how the Huber loss can be +chosen so that for these DeepONet classes generalization error bounds can be +obtained that have no explicit dependence on the size of the nets. The +effective capacity measure for DeepONets that we thus derive is also shown to +correlate with the behavior of generalization error in experiments. + +
+
+ comment: 33 pages, 7 figures; Published in TMLR, December 2024 +
+
+
+
+
+ + ♻ ☆ Fast Computation of Leave-One-Out Cross-Validation for $k$-NN Regression + + +
+ We describe a fast computation method for leave-one-out cross-validation
+(LOOCV) for $k$-nearest neighbours ($k$-NN) regression. We show that, under a
+tie-breaking condition for nearest neighbours, the LOOCV estimate of the mean
+square error for $k$-NN regression is identical to the mean square error of
+$(k+1)$-NN regression evaluated on the training data, multiplied by the scaling
+factor $(k+1)^2/k^2$. Therefore, to compute the LOOCV score, one needs to
+fit $(k+1)$-NN regression only once, and does not need to repeat
+training-validation of $k$-NN regression for the number of training data.
+Numerical experiments confirm the validity of the fast computation method.
+
+
+
+ comment: To appear in Transactions of Machine Learning Research (TMLR) +
+
+
+
+
+ + ♻ ☆ Coverage-Constrained Human-AI Cooperation with Multiple Experts + + +
+ Human-AI cooperative classification (HAI-CC) approaches aim to develop hybrid +intelligent systems that enhance decision-making in various high-stakes +real-world scenarios by leveraging both human expertise and AI capabilities. +Current HAI-CC methods primarily focus on learning-to-defer (L2D), where +decisions are deferred to human experts, and learning-to-complement (L2C), +where AI and human experts make predictions cooperatively. However, a notable +research gap remains in effectively exploring both L2D and L2C under diverse +expert knowledge to improve decision-making, particularly when constrained by +the cooperation cost required to achieve a target probability for AI-only +selection (i.e., coverage). In this paper, we address this research gap by +proposing the Coverage-constrained Learning to Defer and Complement with +Specific Experts (CL2DC) method. CL2DC makes final decisions through either AI +prediction alone or by deferring to or complementing a specific expert, +depending on the input data. Furthermore, we propose a coverage-constrained +optimisation to control the cooperation cost, ensuring it approximates a target +probability for AI-only selection. This approach enables an effective +assessment of system performance within a specified budget. Also, CL2DC is +designed to address scenarios where training sets contain multiple noisy-label +annotations without any clean-label references. Comprehensive evaluations on +both synthetic and real-world datasets demonstrate that CL2DC achieves superior +performance compared to state-of-the-art HAI-CC methods. + +
+
+
+
+
+ + ♻ ☆ Distributionally robust self-supervised learning for tabular data NeurIPS2024 + + +
+ Machine learning (ML) models trained using Empirical Risk Minimization (ERM) +often exhibit systematic errors on specific subpopulations of tabular data, +known as error slices. Learning robust representation in presence of error +slices is challenging, especially in self-supervised settings during the +feature reconstruction phase, due to high cardinality features and the +complexity of constructing error sets. Traditional robust representation +learning methods are largely focused on improving worst group performance in +supervised setting in computer vision, leaving a gap in approaches tailored for +tabular data. We address this gap by developing a framework to learn robust +representation in tabular data during self-supervised pre-training. Our +approach utilizes an encoder-decoder model trained with Masked Language +Modeling (MLM) loss to learn robust latent representations. This paper applies +the Just Train Twice (JTT) and Deep Feature Reweighting (DFR) methods during +the pre-training phase for tabular data. These methods fine-tune the ERM +pre-trained model by up-weighting error-prone samples or creating balanced +datasets for specific categorical features. This results in specialized models +for each feature, which are then used in an ensemble approach to enhance +downstream classification performance. This methodology improves robustness +across slices, thus enhancing overall generalization performance. Extensive +experiments across various datasets demonstrate the efficacy of our approach. +The code is available: +\url{https://github.com/amazon-science/distributionally-robust-self-supervised-learning-for-tabular-data}. + +
+
+ comment: TRL Workshop@NeurIPS2024 +
+
+
+
+
+ + ♻ ☆ Automatically Interpreting Millions of Features in Large Language Models + + +
+ While the activations of neurons in deep neural networks usually do not have +a simple human-understandable interpretation, sparse autoencoders (SAEs) can be +used to transform these activations into a higher-dimensional latent space +which may be more easily interpretable. However, these SAEs can have millions +of distinct latent features, making it infeasible for humans to manually +interpret each one. In this work, we build an open-source automated pipeline to +generate and evaluate natural language explanations for SAE features using +LLMs. We test our framework on SAEs of varying sizes, activation functions, and +losses, trained on two different open-weight LLMs. We introduce five new +techniques to score the quality of explanations that are cheaper to run than +the previous state of the art. One of these techniques, intervention scoring, +evaluates the interpretability of the effects of intervening on a feature, +which we find explains features that are not recalled by existing methods. We +propose guidelines for generating better explanations that remain valid for a +broader set of activating contexts, and discuss pitfalls with existing scoring +techniques. We use our explanations to measure the semantic similarity of +independently trained SAEs, and find that SAEs trained on nearby layers of the +residual stream are highly similar. Our large-scale analysis confirms that SAE +latents are indeed much more interpretable than neurons, even when neurons are +sparsified using top-$k$ postprocessing. Our code is available at +https://github.com/EleutherAI/sae-auto-interp, and our explanations are +available at +https://huggingface.co/datasets/EleutherAI/auto_interp_explanations. + +
+
+
+
+
+ + ♻ ☆ Generalization Bounds and Model Complexity for Kolmogorov-Arnold + Networks + + +
+ Kolmogorov-Arnold Network (KAN) is a network structure recently proposed by +Liu et al. (2024) that offers improved interpretability and a more parsimonious +design in many science-oriented tasks compared to multi-layer perceptrons. This +work provides a rigorous theoretical analysis of KAN by establishing +generalization bounds for KAN equipped with activation functions that are +either represented by linear combinations of basis functions or lying in a +low-rank Reproducing Kernel Hilbert Space (RKHS). In the first case, the +generalization bound accommodates various choices of basis functions in forming +the activation functions in each layer of KAN and is adapted to different +operator norms at each layer. For a particular choice of operator norms, the +bound scales with the $l_1$ norm of the coefficient matrices and the Lipschitz +constants for the activation functions, and it has no dependence on +combinatorial parameters (e.g., number of nodes) outside of logarithmic +factors. Moreover, our result does not require the boundedness assumption on +the loss function and, hence, is applicable to a general class of +regression-type loss functions. In the low-rank case, the generalization bound +scales polynomially with the underlying ranks as well as the Lipschitz +constants of the activation functions in each layer. These bounds are +empirically investigated for KANs trained with stochastic gradient descent on +simulated and real data sets. The numerical results demonstrate the practical +relevance of these bounds. + +
+
+
+
+
+ + ♻ ☆ Controlling Counterfactual Harm in Decision Support Systems Based on + Prediction Sets ICML 2024 + + +
+ Decision support systems based on prediction sets help humans solve +multiclass classification tasks by narrowing down the set of potential label +values to a subset of them, namely a prediction set, and asking them to always +predict label values from the prediction sets. While this type of systems have +been proven to be effective at improving the average accuracy of the +predictions made by humans, by restricting human agency, they may cause +harm$\unicode{x2014}$a human who has succeeded at predicting the ground-truth +label of an instance on their own may have failed had they used these systems. +In this paper, our goal is to control how frequently a decision support system +based on prediction sets may cause harm, by design. To this end, we start by +characterizing the above notion of harm using the theoretical framework of +structural causal models. Then, we show that, under a natural, albeit +unverifiable, monotonicity assumption, we can estimate how frequently a system +may cause harm using only predictions made by humans on their own. Further, we +also show that, under a weaker monotonicity assumption, which can be verified +experimentally, we can bound how frequently a system may cause harm again using +only predictions made by humans on their own. Building upon these assumptions, +we introduce a computational framework to design decision support systems based +on prediction sets that are guaranteed to cause harm less frequently than a +user-specified value using conformal risk control. We validate our framework +using real human predictions from two different human subject studies and show +that, in decision support systems based on prediction sets, there is a +trade-off between accuracy and counterfactual harm. + +
+
+ comment: Accepted at the ICML 2024 Workshop on Humans, Algorithmic + Decision-Making and Society and published at NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Prediction-Powered Ranking of Large Language Models NeurIPS 2024 + + +
+ Large language models are often ranked according to their level of alignment
+with human preferences -- a model is better than other models if its outputs
+are more frequently preferred by humans. One of the popular ways to elicit
+human preferences utilizes pairwise comparisons between the outputs provided by
+different models to the same inputs. However, since gathering pairwise
+comparisons by humans is costly and time-consuming, it has become a common
+practice to gather pairwise comparisons by a strong large language model -- a
+model strongly aligned with human preferences. Surprisingly, practitioners
+cannot currently measure the uncertainty that any mismatch between human and
+model preferences may introduce in the constructed rankings. In this work, we
+develop a statistical framework to bridge this gap. Given a (small) set of
+pairwise comparisons by humans and a large set of pairwise comparisons by a
+model, our framework provides a rank-set -- a set of possible ranking positions
+-- for each of the models under comparison. Moreover, it guarantees that, with
+a probability greater than or equal to a user-specified value, the rank-sets
+cover the true ranking consistent with the distribution of human pairwise
+preferences asymptotically. Using pairwise comparisons made by humans in the
+LMSYS Chatbot Arena platform and pairwise comparisons made by three strong
+large language models, we empirically demonstrate the effectiveness of our
+framework and show that the rank-sets constructed using only pairwise
+comparisons by the strong large language models are often inconsistent with
+(the distribution of) human pairwise preferences.
+
+
+
+ comment: Published at NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Deferred Poisoning: Making the Model More Vulnerable via Hessian + Singularization + + +
+ Recent studies have shown that deep learning models are very vulnerable to +poisoning attacks. Many defense methods have been proposed to address this +issue. However, traditional poisoning attacks are not as threatening as +commonly believed. This is because they often cause differences in how the +model performs on the training set compared to the validation set. Such +inconsistency can alert defenders that their data has been poisoned, allowing +them to take the necessary defensive actions. In this paper, we introduce a +more threatening type of poisoning attack called the Deferred Poisoning Attack. +This new attack allows the model to function normally during the training and +validation phases but makes it very sensitive to evasion attacks or even +natural noise. We achieve this by ensuring the poisoned model's loss function +has a similar value as a normally trained model at each input sample but with a +large local curvature. A similar model loss ensures that there is no obvious +inconsistency between the training and validation accuracy, demonstrating high +stealthiness. On the other hand, the large curvature implies that a small +perturbation may cause a significant increase in model loss, leading to +substantial performance degradation, which reflects a worse robustness. We +fulfill this purpose by making the model have singular Hessian information at +the optimal point via our proposed Singularization Regularization term. We have +conducted both theoretical and empirical analyses of the proposed method and +validated its effectiveness through experiments on image classification tasks. +Furthermore, we have confirmed the hazards of this form of poisoning attack +under more general scenarios using natural noise, offering a new perspective +for research in the field of security. + +
+
+
+
+
+ + ♻ ☆ Can In-context Learning Really Generalize to Out-of-distribution Tasks? + + +
+ In this work, we explore the mechanism of in-context learning (ICL) on +out-of-distribution (OOD) tasks that were not encountered during training. To +achieve this, we conduct synthetic experiments where the objective is to learn +OOD mathematical functions through ICL using a GPT-2 model. We reveal that +Transformers may struggle to learn OOD task functions through ICL. +Specifically, ICL performance resembles implementing a function within the +pretraining hypothesis space and optimizing it with gradient descent based on +the in-context examples. Additionally, we investigate ICL's well-documented +ability to learn unseen abstract labels in context. We demonstrate that such +ability only manifests in the scenarios without distributional shifts and, +therefore, may not serve as evidence of new-task-learning ability. Furthermore, +we assess ICL's performance on OOD tasks when the model is pretrained on +multiple tasks. Both empirical and theoretical analyses demonstrate the +existence of the \textbf{low-test-error preference} of ICL, where it tends to +implement the pretraining function that yields low test error in the testing +context. We validate this through numerical experiments. This new theoretical +result, combined with our empirical findings, elucidates the mechanism of ICL +in addressing OOD tasks. + +
+
+ comment: Preprint, under review +
+
+
+
+
+ + ♻ ☆ Minimal Learning Machine for Multi-Label Learning + + +
+ Distance-based supervised method, the minimal learning machine, constructs a +predictive model from data by learning a mapping between input and output +distance matrices. In this paper, we propose new methods and evaluate how their +core component, the distance mapping, can be adapted to multi-label learning. +The proposed approach is based on combining the distance mapping with an +inverse distance weighting. Although the proposal is one of the simplest +methods in the multi-label learning literature, it achieves state-of-the-art +performance for small to moderate-sized multi-label learning problems. In +addition to its simplicity, the proposed method is fully deterministic: Its +hyper-parameter can be selected via ranking loss-based statistic which has a +closed form, thus avoiding conventional cross-validation-based hyper-parameter +tuning. In addition, due to its simple linear distance mapping-based +construction, we demonstrate that the proposed method can assess the +uncertainty of the predictions for multi-label classification, which is a +valuable capability for data-centric machine learning pipelines. + +
+
+ comment: Submitted, 29 pages +
+
+
+
+
+ + ♻ ☆ LLM as a Complementary Optimizer to Gradient Descent: A Case Study in + Prompt Tuning + + +
+ Mastering a skill generally relies on both hands-on experience from doers and +insightful, high-level guidance by mentors. Will this strategy also work well +for solving complex non-convex optimization problems? Here, a common +gradient-based optimizer acts like a disciplined doer, making locally optimal +updates at each step. Large Language Models (LLMs) can also search for better +solutions by inferring from natural language instructions, akin to a high-level +mentor. In this paper, we show that these two participators are complementary +to each other and can effectively collaborate as a combined optimization +framework. The collaborative optimization is achieved by alternating between +the gradient-based and LLM-based optimizers. We instruct LLMs to generate +possibly improved solutions by taking parameter trajectories recorded during +the previous stage of gradient-based optimization into account. Inferred +results of LLMs are used as restarting points for the next stage of gradient +optimization. We verify the effectiveness of this optimization framework on +prompt tuning. By leveraging both the locally rigorous gradient-based optimizer +and the high-level deductive LLM-based optimizer, the combined optimization +method consistently yields improvements over competitive baselines on a variety +of tasks. Our results demonstrate the synergistic effect of conventional +gradient-based optimization and the inference ability of LLMs. The code is +released at https://github.com/guozix/LLM-catalyst. + +
+
+
+
+
+ + ♻ ☆ Towards a Robust Soft Baby Robot With Rich Interaction Ability for + Advanced Machine Learning Algorithms + + +
+ Advanced machine learning algorithms require platforms that are extremely +robust and equipped with rich sensory feedback to handle extensive +trial-and-error learning without relying on strong inductive biases. +Traditional robotic designs, while well-suited for their specific use cases, +are often fragile when used with these algorithms. To address this gap -- and +inspired by the vision of enabling curiosity-driven baby robots -- we present a +novel robotic limb designed from scratch. Our design has a hybrid soft-hard +structure, high redundancy with rich non-contact sensors (exclusively cameras), +and easily replaceable failure points. Proof-of-concept experiments using two +contemporary reinforcement learning algorithms on a physical prototype +demonstrate that our design is able to succeed in a simple target-finding task +even under simulated sensor failures, all with minimal human oversight during +extended learning periods. We believe this design represents a concrete step +toward more tailored robotic designs for achieving general-purpose, generally +intelligent robots. + +
+
+ comment: 6 pages in main text + 2 pages of references, 8 figures in main text, + 1 table in main text; source code available at + https://github.com/dylanashley/robot-limb-testai +
+
+
+
+
+ + ♻ ☆ Reducing Optimism Bias in Incomplete Cooperative Games AAMAS 2024 + + +
+ Cooperative game theory has diverse applications in contemporary artificial +intelligence, including domains like interpretable machine learning, resource +allocation, and collaborative decision-making. However, specifying a +cooperative game entails assigning values to exponentially many coalitions, and +obtaining even a single value can be resource-intensive in practice. Yet simply +leaving certain coalition values undisclosed introduces ambiguity regarding +individual contributions to the collective grand coalition. This ambiguity +often leads to players holding overly optimistic expectations, stemming from +either inherent biases or strategic considerations, frequently resulting in +collective claims exceeding the actual grand coalition value. In this paper, we +present a framework aimed at optimizing the sequence for revealing coalition +values, with the overarching goal of efficiently closing the gap between +players' expectations and achievable outcomes in cooperative games. Our +contributions are threefold: (i) we study the individual players' optimistic +completions of games with missing coalition values along with the arising gap, +and investigate its analytical characteristics that facilitate more efficient +optimization; (ii) we develop methods to minimize this gap over classes of +games with a known prior by disclosing values of additional coalitions in both +offline and online fashion; and (iii) we empirically demonstrate the +algorithms' performance in practical scenarios, together with an investigation +into the typical order of revealing coalition values. + +
+
+ comment: Proc. of the 23rd International Conference on Autonomous Agents and + Multiagent Systems (AAMAS 2024) +
+
+
+
+
+ + ♻ ☆ Analysis of Classifier-Free Guidance Weight Schedulers + + +
+ Classifier-Free Guidance (CFG) enhances the quality and condition adherence +of text-to-image diffusion models. It operates by combining the conditional and +unconditional predictions using a fixed weight. However, recent works vary the +weights throughout the diffusion process, reporting superior results but +without providing any rationale or analysis. By conducting comprehensive +experiments, this paper provides insights into CFG weight schedulers. Our +findings suggest that simple, monotonically increasing weight schedulers +consistently lead to improved performances, requiring merely a single line of +code. In addition, more complex parametrized schedulers can be optimized for +further improvement, but do not generalize across different models and tasks. + +
+
+
+
+
+ + ♻ ☆ Self-Improvement in Language Models: The Sharpening Mechanism + + +
+ Recent work in language modeling has raised the possibility of
+self-improvement, where a language model evaluates and refines its own
+generations to achieve higher performance without external feedback. It is
+impossible for this self-improvement to create information that is not already
+in the model, so why should we expect that this will lead to improved
+capabilities? We offer a new perspective on the capabilities of
+self-improvement through a lens we refer to as sharpening. Motivated by the
+observation that language models are often better at verifying response quality
+than they are at generating correct responses, we formalize self-improvement as
+using the model itself as a verifier during post-training in order to
+``sharpen'' the model to one placing large mass on high-quality sequences,
+thereby amortizing the expensive inference-time computation of generating good
+sequences. We begin by introducing a new statistical framework for sharpening
+in which the learner aims to sharpen a pre-trained base policy via sample
+access, and establish fundamental limits. Then we analyze two natural families
+of self-improvement algorithms based on SFT and RLHF. We find that (i) the
+SFT-based approach is minimax optimal whenever the initial model has sufficient
+coverage, but (ii) the RLHF-based approach can improve over SFT-based
+self-improvement by leveraging online exploration, bypassing the need for
+coverage. Finally, we empirically validate the sharpening mechanism via
+inference-time and amortization experiments. We view these findings as a
+starting point toward a foundational understanding that can guide the design
+and evaluation of self-improvement algorithms.
+
+
+
+
+
+
+ + ♻ ☆ Tackling Decision Processes with Non-Cumulative Objectives using + Reinforcement Learning + + +
+ Markov decision processes (MDPs) are used to model a wide variety of +applications ranging from game playing over robotics to finance. Their optimal +policy typically maximizes the expected sum of rewards given at each step of +the decision process. However, a large class of problems does not fit +straightforwardly into this framework: Non-cumulative Markov decision processes +(NCMDPs), where instead of the expected sum of rewards, the expected value of +an arbitrary function of the rewards is maximized. Example functions include +the maximum of the rewards or their mean divided by their standard deviation. +In this work, we introduce a general mapping of NCMDPs to standard MDPs. This +allows all techniques developed to find optimal policies for MDPs, such as +reinforcement learning or dynamic programming, to be directly applied to the +larger class of NCMDPs. Focusing on reinforcement learning, we show +applications in a diverse set of tasks, including classical control, portfolio +optimization in finance, and discrete optimization problems. Given our +approach, we can improve both final performance and training time compared to +relying on standard MDPs. + +
+
+
+
+
+ + ♻ ☆ OpenDriver: An Open-Road Driver State Detection Dataset + + +
+ Among numerous studies for driver state detection, wearable physiological +measurements offer a practical method for real-time monitoring. However, there +are few driver physiological datasets in open-road scenarios, and the existing +datasets suffer from issues such as poor signal quality, small sample sizes, +and short data collection periods. Therefore, in this paper, a large-scale +multimodal driving dataset, OpenDriver, for driver state detection is +developed. The OpenDriver encompasses a total of 3,278 driving trips, with a +signal collection duration spanning approximately 4,600 hours. Two modalities +of driving signals are enrolled in OpenDriver: electrocardiogram (ECG) signals +and six-axis motion data of the steering wheel from a motion measurement unit +(IMU), which were recorded from 81 drivers and their vehicles. Furthermore, +three challenging tasks are involved in our work, namely ECG signal quality +assessment, individual biometric identification based on ECG signals, and +physiological signal analysis in complex driving environments. To facilitate +research in these tasks, corresponding benchmarks have also been introduced. +First, a noisy augmentation strategy is applied to generate a larger-scale ECG +signal dataset with realistic noise simulation for quality assessment. Second, +an end-to-end contrastive learning framework is employed for individual +biometric identification. Finally, a comprehensive analysis of drivers' HRV +features under different driving conditions is conducted. Each benchmark +provides evaluation metrics and reference results. The OpenDriver dataset will +be publicly available at https://github.com/bdne/OpenDriver. + +
+
+ comment: Considering that there are flaws in the statistical data of the + dataset, all the authors agreed to withdraw the manuscript +
+
+
+
+
+ + ♻ ☆ Identifiable Representation and Model Learning for Latent Dynamic + Systems + + +
+ Learning identifiable representations and models from low-level observations +is helpful for an intelligent spacecraft to complete downstream tasks reliably. +For temporal observations, to ensure that the data generating process is +provably inverted, most existing works either assume the noise variables in the +dynamic mechanisms are (conditionally) independent or require that the +interventions can directly affect each latent variable. However, in practice, +the relationship between the exogenous inputs/interventions and the latent +variables may follow some complex deterministic mechanisms. In this work, we +study the problem of identifiable representation and model learning for latent +dynamic systems. The key idea is to use an inductive bias inspired by +controllable canonical forms, which are sparse and input-dependent by +definition. We prove that, for linear and affine nonlinear latent dynamic +systems with sparse input matrices, it is possible to identify the latent +variables up to scaling and determine the dynamic models up to some simple +transformations. The results have the potential to provide some theoretical +guarantees for developing more trustworthy decision-making and control methods +for intelligent spacecrafts. + +
+
+
+
+
+ + ♻ ☆ Leveraging Auxiliary Task Relevance for Enhanced Bearing Fault Diagnosis + through Curriculum Meta-learning + + +
+ The accurate diagnosis of machine breakdowns is crucial for maintaining +operational safety in smart manufacturing. Despite the promise shown by deep +learning in automating fault identification, the scarcity of labeled training +data, particularly for equipment failure instances, poses a significant +challenge. This limitation hampers the development of robust classification +models. Existing methods like model-agnostic meta-learning (MAML) do not +adequately address variable working conditions, affecting knowledge transfer. +To address these challenges, a Related Task Aware Curriculum Meta-learning +(RT-ACM) enhanced fault diagnosis framework is proposed in this paper, inspired +by human cognitive learning processes. RT-ACM improves training by considering +the relevance of auxiliary sensor working conditions, adhering to the principle +of ``paying more attention to more relevant knowledge", and focusing on +``easier first, harder later" curriculum sampling. This approach aids the +meta-learner in achieving a superior convergence state. Extensive experiments +on two real-world datasets demonstrate the superiority of RT-ACM framework. + +
+
+
+
+
+ + ♻ ☆ ROSE: Revolutionizing Open-Set Dense Segmentation with Patch-Wise + Perceptual Large Multimodal Model + + +
+ Advances in CLIP and large multimodal models (LMMs) have enabled +open-vocabulary and free-text segmentation, yet existing models still require +predefined category prompts, limiting free-form category self-generation. Most +segmentation LMMs also remain confined to sparse predictions, restricting their +applicability in open-set environments. In contrast, we propose ROSE, a +Revolutionary Open-set dense SEgmentation LMM, which enables dense mask +prediction and open-category generation through patch-wise perception. Our +method treats each image patch as an independent region of interest candidate, +enabling the model to predict both dense and sparse masks simultaneously. +Additionally, a newly designed instruction-response paradigm takes full +advantage of the generation and generalization capabilities of LMMs, achieving +category prediction independent of closed-set constraints or predefined +categories. To further enhance mask detail and category precision, we introduce +a conversation-based refinement paradigm, integrating the prediction result +from previous step with textual prompt for revision. Extensive experiments +demonstrate that ROSE achieves competitive performance across various +segmentation tasks in a unified framework. Code will be released. + +
+
+
+
+
+ + ♻ ☆ Reinforcement Learning for Finite Space Mean-Field Type Games + + +
+ Mean field type games (MFTGs) describe Nash equilibria between large +coalitions: each coalition consists of a continuum of cooperative agents who +maximize the average reward of their coalition while interacting +non-cooperatively with a finite number of other coalitions. Although the theory +has been extensively developed, we are still lacking efficient and scalable +computational methods. Here, we develop reinforcement learning methods for such +games in a finite space setting with general dynamics and reward functions. We +start by proving that MFTG solution yields approximate Nash equilibria in +finite-size coalition games. We then propose two algorithms. The first is based +on quantization of mean-field spaces and Nash Q-learning. We provide +convergence and stability analysis. We then propose a deep reinforcement +learning algorithm, which can scale to larger spaces. Numerical experiments in +5 environments with mean-field distributions of dimension up to $200$ show the +scalability and efficiency of the proposed method. + +
+
+
+
+
+ + ♻ ☆ Chain-structured neural architecture search for financial time series + forecasting + + +
+ Neural architecture search (NAS) emerged as a way to automatically optimize +neural networks for a specific task and dataset. Despite an abundance of +research on NAS for images and natural language applications, similar studies +for time series data are lacking. Among NAS search spaces, chain-structured are +the simplest and most applicable to small datasets like time series. We compare +three popular NAS strategies on chain-structured search spaces: Bayesian +optimization (specifically Tree-structured Parzen Estimator), the hyperband +method, and reinforcement learning in the context of financial time series +forecasting. These strategies were employed to optimize simple well-understood +neural architectures like the MLP, 1D CNN, and RNN, with more complex temporal +fusion transformers (TFT) and their own optimizers included for comparison. We +find Bayesian optimization and the hyperband method performing best among the +strategies, and RNN and 1D CNN best among the architectures, but all methods +were very close to each other with a high variance due to the difficulty of +working with financial datasets. We discuss our approach to overcome the +variance and provide implementation recommendations for future users and +researchers. + +
+
+ comment: This is the accepted version of the paper published in International + Journal of Data Science and Analytics +
+
+
+
+
+ + ♻ ☆ Explainable fault and severity classification for rolling element + bearings using Kolmogorov-Arnold networks + + +
+ Rolling element bearings are critical components of rotating machinery, with +their performance directly influencing the efficiency and reliability of +industrial systems. At the same time, bearing faults are a leading cause of +machinery failures, often resulting in costly downtime, reduced productivity, +and, in extreme cases, catastrophic damage. This study presents a methodology +that utilizes Kolmogorov-Arnold Networks to address these challenges through +automatic feature selection, hyperparameter tuning and interpretable fault +analysis within a unified framework. By training shallow network architectures +and minimizing the number of selected features, the framework produces +lightweight models that deliver explainable results through feature attribution +and symbolic representations of their activation functions. Validated on two +widely recognized datasets for bearing fault diagnosis, the framework achieved +perfect F1-Scores for fault detection and high performance in fault and +severity classification tasks, including 100% F1-Scores in most cases. Notably, +it demonstrated adaptability by handling diverse fault types, such as imbalance +and misalignment, within the same dataset. The symbolic representations +enhanced model interpretability, while feature attribution offered insights +into the optimal feature types or signals for each studied task. These results +highlight the framework's potential for practical applications, such as +real-time machinery monitoring, and for scientific research requiring efficient +and explainable models. + +
+
+
+
+
+ + ♻ ☆ The Cooperative Network Architecture: Learning Structured Networks as + Representation of Sensory Patterns + + +
+ Nets, cooperative networks of neurons, have been proposed as format for the +representation of sensory signals, as physical implementation of the Gestalt +phenomenon and as solution to the neural binding problem, while the direct +interaction between nets by structure-sensitive matching has been proposed as +basis for object-global operations such as object detection. The nets are +flexibly composed of overlapping net fragments, which are learned from +statistical regularities of sensory input. We here present the cooperative +network architecture (CNA), a concrete model that learns such net structure to +represent input patterns and deals robustly with noise, deformation, and +out-of-distribution data, thus laying the groundwork for a novel neural +architecture. + +
+
+
+
+
+ + ♻ ☆ Local Lesion Generation is Effective for Capsule Endoscopy Image Data + Augmentation in a Limited Data Setting + + +
+ Limited medical imaging datasets challenge deep learning models by increasing +risks of overfitting and reduced generalization, particularly in Generative +Adversarial Networks (GANs), where discriminators may overfit, leading to +training divergence. This constraint also impairs classification models trained +on small datasets. Generative Data Augmentation (GDA) addresses this by +expanding training datasets with synthetic data, although it requires training +a generative model. We propose and evaluate two local lesion generation +approaches to address the challenge of augmenting small medical image datasets. +The first approach employs the Poisson Image Editing algorithm, a classical +image processing technique, to create realistic image composites that +outperform current state-of-the-art methods. The second approach introduces a +novel generative method, leveraging a fine-tuned Image Inpainting GAN to +synthesize realistic lesions within specified regions of real training images. +A comprehensive comparison of the two proposed methods demonstrates that +effective local lesion generation in a data-constrained setting allows for +reaching new state-of-the-art results in capsule endoscopy lesion +classification. Combination of our techniques achieves a macro F1-score of +33.07%, surpassing the previous best result by 7.84 percentage points (p.p.) on +the highly imbalanced Kvasir Capsule Dataset, a benchmark for capsule +endoscopy. To the best of our knowledge, this work is the first to apply a +fine-tuned Image Inpainting GAN for GDA in medical imaging, demonstrating that +an image-conditional GAN can be adapted effectively to limited datasets to +generate high-quality examples, facilitating effective data augmentation. +Additionally, we show that combining this GAN-based approach with classical +image processing techniques further improves the results. + +
+
+ comment: 54 pages, 35 figures +
+
+
+
+
+ + ♻ ☆ Pyramid Vector Quantization for LLMs + + +
+ Recent works on compression of large language models (LLM) using quantization
+considered reparameterizing the architecture such that weights are distributed
+on the sphere. This demonstrably improves the ability to quantize by
+increasing the mathematical notion of coherence, resulting in fewer weight
+outliers without affecting the network output. In this work, we aim to further
+exploit this spherical geometry of the weights when performing quantization by
+considering Pyramid Vector Quantization (PVQ) for large language models.
+Arranging points evenly on the sphere is notoriously difficult, especially in
+high dimensions, and in cases where approximate solutions exist, representing
+points explicitly in a codebook is typically not feasible due to its additional
+memory cost. Instead, PVQ uses a fixed integer lattice on the sphere by
+projecting points onto the 1-sphere, which allows for efficient encoding and
+decoding without requiring an explicit codebook in memory. To obtain a
+practical algorithm, we propose to combine PVQ with scale quantization for
+which we derive theoretically optimal quantizations, under empirically verified
+assumptions. Further, we extend pyramid vector quantization to use Hessian
+information to minimize quantization error under expected feature activations,
+instead of only relying on weight magnitudes. Experimentally, we achieve
+state-of-the-art quantization performance with a pareto-optimal trade-off
+between performance and bits per weight and bits per activation, compared to
+competing methods. On weight-only, we find that we can quantize a Llama-3 70B
+model to 3.25 bits per weight and retain 98\% accuracy on downstream tasks.
+
+
+
+
+
+
+ + ♻ GWQ: Gradient-Aware Weight Quantization for Large Language Models + + +
+ Large language models (LLMs) show impressive performance in solving complex
+language tasks. However, their large number of parameters presents significant
+challenges for the deployment and application of the model on edge devices.
+Compressing large language models to low bits can enable them to run on
+resource-constrained devices, often leading to performance degradation. To
+address this problem, we propose gradient-aware weight quantization (GWQ), the
+first quantization approach for low-bit weight quantization that leverages
+gradients to localize outliers, requiring only a minimal amount of calibration
+data for outlier detection. GWQ retains the weights corresponding to the top 1%
+outliers preferentially at FP16 precision, while the remaining non-outlier
+weights are stored in a low-bit format. GWQ found experimentally that utilizing
+the sensitive weights in the gradient localization model is more scientific
+compared to utilizing the sensitive weights in the Hessian matrix localization
+model. Compared to current quantization methods, GWQ can be applied to multiple
+language models and achieves lower PPL on the WikiText2 and C4 datasets. In the
+zero-shot task, GWQ quantized models have higher accuracy compared to other
+quantization methods. GWQ is also suitable for multimodal model quantization,
+and the quantized Qwen-VL family model is more accurate than other methods. On
+the zero-shot target detection dataset RefCOCO, GWQ outperforms the current
+state-of-the-art method SPQR. GWQ achieves 1.2 times inference speedup in
+comparison to the original model, and effectively reduces the inference memory.
+
+
+
+
+
+
+ + ♻ ☆ Elephants Never Forget: Memorization and Learning of Tabular Data in + Large Language Models + + +
+ While many have shown how Large Language Models (LLMs) can be applied to a +diverse set of tasks, the critical issues of data contamination and +memorization are often glossed over. In this work, we address this concern for +tabular data. Specifically, we introduce a variety of different techniques to +assess whether a language model has seen a tabular dataset during training. +This investigation reveals that LLMs have memorized many popular tabular +datasets verbatim. We then compare the few-shot learning performance of LLMs on +datasets that were seen during training to the performance on datasets released +after training. We find that LLMs perform better on datasets seen during +training, indicating that memorization leads to overfitting. At the same time, +LLMs show non-trivial performance on novel datasets and are surprisingly robust +to data transformations. We then investigate the in-context statistical +learning abilities of LLMs. While LLMs are significantly better than random at +solving statistical classification problems, the sample efficiency of few-shot +learning lags behind traditional statistical learning algorithms, especially as +the dimension of the problem increases. This suggests that much of the observed +few-shot performance on novel real-world datasets is due to the LLM's world +knowledge. Overall, our results highlight the importance of testing whether an +LLM has seen an evaluation dataset during pre-training. We release the +https://github.com/interpretml/LLM-Tabular-Memorization-Checker Python package +to test LLMs for memorization of tabular datasets. + +
+
+ comment: COLM camera ready, fix typo +
+
+
+
+
+ + ♻ ☆ One Step Learning, One Step Review AAAI + + +
+ Visual fine-tuning has garnered significant attention with the rise of +pre-trained vision models. The current prevailing method, full fine-tuning, +suffers from the issue of knowledge forgetting as it focuses solely on fitting +the downstream training set. In this paper, we propose a novel weight +rollback-based fine-tuning method called OLOR (One step Learning, One step +Review). OLOR combines fine-tuning with optimizers, incorporating a weight +rollback term into the weight update term at each step. This ensures +consistency in the weight range of upstream and downstream models, effectively +mitigating knowledge forgetting and enhancing fine-tuning performance. In +addition, a layer-wise penalty is presented to employ penalty decay and the +diversified decay rate to adjust the weight rollback levels of layers for +adapting varying downstream tasks. Through extensive experiments on various +tasks such as image classification, object detection, semantic segmentation, +and instance segmentation, we demonstrate the general applicability and +state-of-the-art performance of our proposed OLOR. Code is available at +https://github.com/rainbow-xiao/OLOR-AAAI-2024. + +
+
+ comment: Published at the 38th AAAI Conference on Artificial Intelligence + (AAAI 2024) +
+
+
+
+
+ + ♻ ☆ A path-norm toolkit for modern networks: consequences, promises and + challenges + + +
+ This work introduces the first toolkit around path-norms that fully +encompasses general DAG ReLU networks with biases, skip connections and any +operation based on the extraction of order statistics: max pooling, GroupSort +etc. This toolkit notably allows us to establish generalization bounds for +modern neural networks that are not only the most widely applicable path-norm +based ones, but also recover or beat the sharpest known bounds of this type. +These extended path-norms further enjoy the usual benefits of path-norms: ease +of computation, invariance under the symmetries of the network, and improved +sharpness on layered fully-connected networks compared to the product of +operator norms, another complexity measure most commonly used. + The versatility of the toolkit and its ease of implementation allow us to +challenge the concrete promises of path-norm-based generalization bounds, by +numerically evaluating the sharpest known bounds for ResNets on ImageNet. + +
+
+ comment: Erratum: in the published version there was a typo in the definition + of the activation matrix in Definition A.3. This is fixed with this new + version +
+
+
+
+
+ + ♻ ☆ Knowledge Mechanisms in Large Language Models: A Survey and Perspective EMNLP 2024 + + +
+ Understanding knowledge mechanisms in Large Language Models (LLMs) is crucial +for advancing towards trustworthy AGI. This paper reviews knowledge mechanism +analysis from a novel taxonomy including knowledge utilization and evolution. +Knowledge utilization delves into the mechanism of memorization, comprehension +and application, and creation. Knowledge evolution focuses on the dynamic +progression of knowledge within individual and group LLMs. Moreover, we discuss +what knowledge LLMs have learned, the reasons for the fragility of parametric +knowledge, and the potential dark knowledge (hypothesis) that will be +challenging to address. We hope this work can help understand knowledge in LLMs +and provide insights for future research. + +
+
+ comment: EMNLP 2024 Findings; 39 pages (v4) +
+
+
+
+
+ + ♻ ☆ Exploration of Parameter Spaces Assisted by Machine Learning + + +
+ We demonstrate two sampling procedures assisted by machine learning models +via regression and classification. The main objective is the use of a neural +network to suggest points likely inside regions of interest, reducing the +number of evaluations of time consuming calculations. We compare results from +this approach with results from other sampling methods, namely Markov chain +Monte Carlo and MultiNest, obtaining results that range from comparably similar +to arguably better. In particular, we augment our classifier method with a +boosting technique that rapidly increases the efficiency within a few +iterations. We show results from our methods applied to a toy model and the +type II 2HDM, using 3 and 7 free parameters, respectively. The code used for +this paper and instructions are publicly available on the web. + +
+
+ comment: 30 pages, 9 figures. Matches published version. Code and instructions + are available on https://github.com/AHamamd150/MLscanner +
+
+
+
+
+ + ♻ ☆ Learning Developmental Age from 3D Infant Kinetics Using Adaptive Graph + Neural Networks + + +
+ Reliable methods for the neurodevelopmental assessment of infants are +essential for early detection of problems that may need prompt interventions. +Spontaneous motor activity, or 'kinetics', is shown to provide a powerful +surrogate measure of upcoming neurodevelopment. However, its assessment is by +and large qualitative and subjective, focusing on visually identified, +age-specific gestures. In this work, we introduce Kinetic Age (KA), a novel +data-driven metric that quantifies neurodevelopmental maturity by predicting an +infant's age based on their movement patterns. KA offers an interpretable and +generalizable proxy for motor development. Our method leverages 3D video +recordings of infants, processed with pose estimation to extract +spatio-temporal series of anatomical landmarks, which are released as a new +openly available dataset. These data are modeled using adaptive graph +convolutional networks, able to capture the spatio-temporal dependencies in +infant movements. We also show that our data-driven approach achieves +improvement over traditional machine learning baselines based on manually +engineered features. + +
+
+ comment: 15 pages, 9 figures. Code repository available via + https://github.com/deinal/infant-aagcn +
+
+
+
+
+ + ♻ ☆ FLOAT: Generative Motion Latent Flow Matching for Audio-driven Talking + Portrait + + +
+ With the rapid advancement of diffusion-based generative models, portrait +image animation has achieved remarkable results. However, it still faces +challenges in temporally consistent video generation and fast sampling due to +its iterative sampling nature. This paper presents FLOAT, an audio-driven +talking portrait video generation method based on flow matching generative +model. We shift the generative modeling from the pixel-based latent space to a +learned motion latent space, enabling efficient design of temporally consistent +motion. To achieve this, we introduce a transformer-based vector field +predictor with a simple yet effective frame-wise conditioning mechanism. +Additionally, our method supports speech-driven emotion enhancement, enabling a +natural incorporation of expressive motions. Extensive experiments demonstrate +that our method outperforms state-of-the-art audio-driven talking portrait +methods in terms of visual quality, motion fidelity, and efficiency. + +
+
+ comment: Project page: https://deepbrainai-research.github.io/float/ +
+
+
+
+
+ + ♻ ☆ Adaptive Dense Reward: Understanding the Gap Between Action and Reward + Space in Alignment + + +
+ Reinforcement Learning from Human Feedback (RLHF) has proven highly effective +in aligning Large Language Models (LLMs) with human preferences. However, the +original RLHF typically optimizes under an overall reward, which can lead to a +suboptimal learning process. This limitation stems from RLHF's lack of +awareness regarding which specific tokens should be reinforced or suppressed. +Moreover, conflicts in supervision can arise, for instance, when a chosen +response includes erroneous tokens, while a rejected response contains accurate +elements. To rectify these shortcomings, increasing dense reward methods, such +as step-wise and token-wise RLHF, have been proposed. However, these existing +methods are limited to specific tasks (like mathematics). In this paper, we +propose the ``Adaptive Message-wise RLHF'' method, which robustly applies to +various tasks. By defining pivot tokens as key indicators, our approach +adaptively identifies essential information and converts sequence-level +supervision into fine-grained, subsequence-level supervision. This aligns the +density of rewards and action spaces more closely with the information density +of the input. Experiments demonstrate that our method can be integrated into +various training methods, significantly mitigating hallucinations and +catastrophic forgetting problems, while outperforming other methods on multiple +evaluation metrics. Our method improves the success rate on adversarial samples +by 10\% compared to the sample-wise approach, and achieves a 1.3\% improvement +on evaluation benchmarks such as MMLU, GSM8K, HumanEval, etc. + +
+
+
+
+
+ + ♻ ☆ Graph Pooling by Local Cluster Selection + + +
+ Graph pooling is a family of operations which take graphs as input and
+produce shrunken graphs as output. Modern graph pooling methods are trainable
+and, in general, inserted in Graph Neural Networks (GNNs) architectures as graph
+shrinking operators along the (deep) processing pipeline. This work proposes a
+novel procedure for pooling graphs, along with a node-centred graph pooling
+operator.
+
+
+
+ comment: 11 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Provably Mitigating Overoptimization in RLHF: Your SFT Loss is + Implicitly an Adversarial Regularizer + + +
+ Aligning generative models with human preference via RLHF typically suffers +from overoptimization, where an imperfectly learned reward model can misguide +the generative model to output undesired responses. We investigate this problem +in a principled manner by identifying the source of the misalignment as a form +of distributional shift and uncertainty in learning human preferences. To +mitigate overoptimization, we first propose a theoretical algorithm that +chooses the best policy for an adversarially chosen reward model; one that +simultaneously minimizes the maximum likelihood estimation of the loss and a +reward penalty term. Here, the reward penalty term is introduced to prevent the +policy from choosing actions with spurious high proxy rewards, resulting in +provable sample efficiency of the algorithm under a partial coverage style +condition. Moving from theory to practice, the proposed algorithm further +enjoys an equivalent but surprisingly easy-to-implement reformulation. Using +the equivalence between reward models and the corresponding optimal policy, the +algorithm features a simple objective that combines: (i) a preference +optimization loss that directly aligns the policy with human preference, and +(ii) a supervised learning loss that explicitly imitates the policy with a +(suitable) baseline distribution. In the context of aligning large language +models (LLM), this objective fuses the direct preference optimization (DPO) +loss with the supervised fine-tuning (SFT) loss to help mitigate the +overoptimization towards undesired responses, for which we name the algorithm +Regularized Preference Optimization (RPO). Experiments of aligning LLMs +demonstrate the improved performance of RPO compared with DPO baselines. Our +work sheds light on the interplay between preference optimization and SFT in +tuning LLMs with both theoretical guarantees and empirical evidence. + +
+
+ comment: Accepted by The Thirty-Eighth Annual Conference on Neural Information + Processing Systems. 31 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ DEL-Ranking: Ranking-Correction Denoising Framework for Elucidating + Molecular Affinities in DNA-Encoded Libraries + + +
+ DNA-encoded library (DEL) screening has revolutionized the detection of +protein-ligand interactions through read counts, enabling rapid exploration of +vast chemical spaces. However, noise in read counts, stemming from nonspecific +interactions, can mislead this exploration process. We present DEL-Ranking, a +novel distribution-correction denoising framework that addresses these +challenges. Our approach introduces two key innovations: (1) a novel ranking +loss that rectifies relative magnitude relationships between read counts, +enabling the learning of causal features determining activity levels, and (2) +an iterative algorithm employing self-training and consistency loss to +establish model coherence between activity label and read count predictions. +Furthermore, we contribute three new DEL screening datasets, the first to +comprehensively include multi-dimensional molecular representations, +protein-ligand enrichment values, and their activity labels. These datasets +mitigate data scarcity issues in AI-driven DEL screening research. Rigorous +evaluation on diverse DEL datasets demonstrates DEL-Ranking's superior +performance across multiple correlation metrics, with significant improvements +in binding affinity prediction accuracy. Our model exhibits zero-shot +generalization ability across different protein targets and successfully +identifies potential motifs determining compound binding affinity. This work +advances DEL screening analysis and provides valuable resources for future +research in this area. + +
+
+
+
+
+ + ♻ ☆ One Initialization to Rule them All: Fine-tuning via Explained Variance + Adaptation + + +
+ Foundation models (FMs) are pre-trained on large-scale datasets and then +fine-tuned on a downstream task for a specific application. The most successful +and most commonly used fine-tuning method is to update the pre-trained weights +via a low-rank adaptation (LoRA). LoRA introduces new weight matrices that are +usually initialized at random with a uniform rank distribution across the model +weights. Recent works focus on different initialization schemes or the learning +of adaptive ranks during fine-tuning. Both approaches have only been +investigated in isolation, resulting in slow convergence or a uniform rank +distribution, in turn leading to suboptimal performance. We propose to improve +LoRA by initializing the new weights in a data-driven manner by computing +singular value decomposition (SVD) on minibatches of activation vectors. Then, +we initialize the LoRA matrices with the obtained right-singular vectors and +redistribute ranks among all weight matrices to provably store the maximum +amount of information of the downstream data in the newly introduced weights. +In this way, only what information to maintain or neglect during the +fine-tuning process needs to be learned. We call our new method Explained +Variance Adaptation (EVA). We apply EVA to a variety of fine-tuning tasks +ranging from language generation and understanding to image classification and +reinforcement learning. EVA exhibits faster convergence than competitors and +achieves the highest average score across a multitude of tasks per domain while +reducing the number of trainable parameters through rank redistribution. + +
+
+ comment: 11 pages + references and appendix, code available at + https://github.com/ml-jku/EVA +
+
+
+
+
+ + ♻ ☆ On Privacy, Security, and Trustworthiness in Distributed Wireless Large + AI Models (WLAM) + + +
+ Combining wireless communication with large artificial intelligence (AI)
+models can open up a myriad of novel application scenarios. In sixth generation
+(6G) networks, ubiquitous communication and computing resources allow large AI
+models to serve democratic large AI models-related services to enable real-time
+applications like autonomous vehicles, smart cities, and Internet of Things
+(IoT) ecosystems. However, the security considerations and sustainable
+communication resources limit the deployment of large AI models over
+distributed wireless networks. This paper provides a comprehensive overview of
+privacy, security, and trustworthiness for distributed wireless large AI model
+(WLAM). In particular, a detailed analysis of privacy and security for
+distributed WLAM is first revealed. The classifications and theoretical
+findings about privacy and security in distributed WLAM are discussed. Then the
+trustworthiness and ethics for implementing distributed WLAM are described.
+Finally, the comprehensive applications of distributed WLAM are presented in
+the context of electromagnetic signal processing.
+
+
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ CryoFM: A Flow-based Foundation Model for Cryo-EM Densities + + +
+ Cryo-electron microscopy (cryo-EM) is a powerful technique in structural
+biology and drug discovery, enabling the study of biomolecules at high
+resolution. Significant advancements by structural biologists using cryo-EM
+have led to the production of over 38,626 protein density maps at various
+resolutions [1]. However, cryo-EM data processing algorithms have yet to fully
+benefit from our knowledge of biomolecular density maps, with only a few recent
+models being data-driven but limited to specific tasks. In this study, we
+present CryoFM, a foundation model designed as a generative model, learning the
+distribution of high-quality density maps and generalizing effectively to
+downstream tasks. Built on flow matching, CryoFM is trained to accurately
+capture the prior distribution of biomolecular density maps. Furthermore, we
+introduce a flow posterior sampling method that leverages CryoFM as a flexible
+prior for several downstream tasks in cryo-EM and cryo-electron tomography
+(cryo-ET) without the need for fine-tuning, achieving state-of-the-art
+performance on most tasks and demonstrating its potential as a foundational
+model for broader applications in these fields.
+
+
+
+
+
+
+ + ♻ ☆ Phased Consistency Models NeurIPS 2024 + + +
+ Consistency Models (CMs) have made significant progress in accelerating the +generation of diffusion models. However, their application to high-resolution, +text-conditioned image generation in the latent space remains unsatisfactory. +In this paper, we identify three key flaws in the current design of Latent +Consistency Models (LCMs). We investigate the reasons behind these limitations +and propose Phased Consistency Models (PCMs), which generalize the design space +and address the identified limitations. Our evaluations demonstrate that PCMs +outperform LCMs across 1--16 step generation settings. While PCMs are +specifically designed for multi-step refinement, they achieve comparable 1-step +generation results to previously state-of-the-art specifically designed 1-step +methods. Furthermore, we show the methodology of PCMs is versatile and +applicable to video generation, enabling us to train the state-of-the-art +few-step text-to-video generator. Our code is available at +https://github.com/G-U-N/Phased-Consistency-Model. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ MQFL-FHE: Multimodal Quantum Federated Learning Framework with Fully + Homomorphic Encryption + + +
+ The integration of fully homomorphic encryption (FHE) in federated learning +(FL) has led to significant advances in data privacy. However, during the +aggregation phase, it often results in performance degradation of the +aggregated model, hindering the development of robust representational +generalization. In this work, we propose a novel multimodal quantum federated +learning framework that utilizes quantum computing to counteract the +performance drop resulting from FHE. For the first time in FL, our framework +combines a multimodal quantum mixture of experts (MQMoE) model with FHE, +incorporating multimodal datasets for enriched representation and task-specific +learning. Our MQMoE framework enhances performance on multimodal datasets and +combined genomics and brain MRI scans, especially for underrepresented +categories. Our results also demonstrate that the quantum-enhanced approach +mitigates the performance degradation associated with FHE and improves +classification accuracy across diverse datasets, validating the potential of +quantum interventions in enhancing privacy in FL. + +
+
+ comment: 14 pages, 6 figures, 5 Tables. Under Review +
+
+
+
+
+ + ♻ ☆ Stable Consistency Tuning: Understanding and Improving Consistency + Models + + +
+ Diffusion models achieve superior generation quality but suffer from slow +generation speed due to the iterative nature of denoising. In contrast, +consistency models, a new generative family, achieve competitive performance +with significantly faster sampling. These models are trained either through +consistency distillation, which leverages pretrained diffusion models, or +consistency training/tuning directly from raw data. In this work, we propose a +novel framework for understanding consistency models by modeling the denoising +process of the diffusion model as a Markov Decision Process (MDP) and framing +consistency model training as the value estimation through Temporal +Difference~(TD) Learning. More importantly, this framework allows us to analyze +the limitations of current consistency training/tuning strategies. Built upon +Easy Consistency Tuning (ECT), we propose Stable Consistency Tuning (SCT), +which incorporates variance-reduced learning using the score identity. SCT +leads to significant performance improvements on benchmarks such as CIFAR-10 +and ImageNet-64. On ImageNet-64, SCT achieves 1-step FID 2.42 and 2-step FID +1.55, a new SoTA for consistency models. + +
+
+ comment: Code is available at + https://github.com/G-U-N/Stable-Consistency-Tuning +
+
+
+
+
+ + ♻ ☆ Rethinking Spectral Augmentation for Contrast-based Graph + Self-Supervised Learning + + +
+ The recent surge in contrast-based graph self-supervised learning has +prominently featured an intensified exploration of spectral cues. Spectral +augmentation, which involves modifying a graph's spectral properties such as +eigenvalues or eigenvectors, is widely believed to enhance model performance. +However, an intriguing paradox emerges, as methods grounded in seemingly +conflicting assumptions regarding the spectral domain demonstrate notable +enhancements in learning performance. Through extensive empirical studies, we +find that simple edge perturbations - random edge dropping for node-level and +random edge adding for graph-level self-supervised learning - consistently +yield comparable or superior performance while being significantly more +computationally efficient. This suggests that the computational overhead of +sophisticated spectral augmentations may not justify their practical benefits. +Our theoretical analysis of the InfoNCE loss bounds for shallow GNNs further +supports this observation. The proposed insights represent a significant leap +forward in the field, potentially refining the understanding and implementation +of graph self-supervised learning. + +
+
+
+
+
+
+ ♻ ☆ AED-PADA: Improving Generalizability of Adversarial Example Detection via
+ Principal Adversarial Domain Adaptation
+
+
+ Adversarial example detection, which can be conveniently applied in many +scenarios, is important in the area of adversarial defense. Unfortunately, +existing detection methods suffer from poor generalization performance, because +their training process usually relies on the examples generated from a single +known adversarial attack and there exists a large discrepancy between the +training and unseen testing adversarial examples. To address this issue, we +propose a novel method, named Adversarial Example Detection via Principal +Adversarial Domain Adaptation (AED-PADA). Specifically, our approach identifies +the Principal Adversarial Domains (PADs), i.e., a combination of features of +the adversarial examples generated by different attacks, which possesses a +large portion of the entire adversarial feature space. Subsequently, we pioneer +to exploit Multi-source Unsupervised Domain Adaptation in adversarial example +detection, with PADs as the source domains. Experimental results demonstrate +the superior generalization ability of our proposed AED-PADA. Note that this +superiority is particularly achieved in challenging scenarios characterized by +employing the minimal magnitude constraint for the perturbations. + +
+
+
+
+
+ + ♻ ☆ Force-Guided Bridge Matching for Full-Atom Time-Coarsened Dynamics of + Peptides + + +
+ Molecular Dynamics (MD) is crucial in various fields such as materials +science, chemistry, and pharmacology to name a few. Conventional MD software +struggles with the balance between time cost and prediction accuracy, which +restricts its wider application. Recently, data-driven approaches based on deep +generative models have been devised for time-coarsened dynamics, which aim at +learning dynamics of diverse molecular systems over a long timestep, enjoying +both universality and efficiency. Nevertheless, most current methods are +designed solely to learn from the data distribution regardless of the +underlying Boltzmann distribution, and the physics priors such as energies and +forces are constantly overlooked. In this work, we propose a conditional +generative model called Force-guided Bridge Matching (FBM), which learns +full-atom time-coarsened dynamics and targets the Boltzmann-constrained +distribution. With the guidance of our delicately-designed intermediate force +field, FBM leverages favourable physics priors into the generation process, +giving rise to enhanced simulations. Experiments on two datasets consisting of +peptides verify our superiority in terms of comprehensive metrics and +demonstrate transferability to unseen systems. + +
+
+
+
+
+ + ♻ ☆ SurvMamba: State Space Model with Multi-grained Multi-modal Interaction + for Survival Prediction + + +
+ Multi-modal learning that combines pathological images with genomic data has +significantly enhanced the accuracy of survival prediction. Nevertheless, +existing methods have not fully utilized the inherent hierarchical structure +within both whole slide images (WSIs) and transcriptomic data, from which +better intra-modal representations and inter-modal integration could be +derived. Moreover, many existing studies attempt to improve multi-modal +representations through attention mechanisms, which inevitably lead to high +complexity when processing high-dimensional WSIs and transcriptomic data. +Recently, a structured state space model named Mamba emerged as a promising +approach for its superior performance in modeling long sequences with low +complexity. In this study, we propose Mamba with multi-grained multi-modal +interaction (SurvMamba) for survival prediction. SurvMamba is implemented with +a Hierarchical Interaction Mamba (HIM) module that facilitates efficient +intra-modal interactions at different granularities, thereby capturing more +detailed local features as well as rich global representations. In addition, an +Interaction Fusion Mamba (IFM) module is used for cascaded inter-modal +interactive fusion, yielding more comprehensive features for survival +prediction. Comprehensive evaluations on five TCGA datasets demonstrate that +SurvMamba outperforms other existing methods in terms of performance and +computational cost. + +
+
+
+
+
+ + ♻ ☆ RelCon: Relative Contrastive Learning for a Motion Foundation Model for + Wearable Data + + +
+ We present RelCon, a novel self-supervised *Rel*ative *Con*trastive learning
+approach that uses a learnable distance measure in combination with a softened
+contrastive loss for training a motion foundation model from wearable sensors.
+The learnable distance measure captures motif similarity and domain-specific
+semantic information such as rotation invariance. The learned distance provides
+a measurement of semantic similarity between a pair of accelerometer
+time-series segments, which is used to measure the distance between an anchor
+and various other sampled candidate segments. The self-supervised model is
+trained on 1 billion segments from 87,376 participants from a large wearables
+dataset. The model achieves strong performance across multiple downstream
+tasks, encompassing both classification and regression. To our knowledge, we
+are the first to show the generalizability of a self-supervised learning model
+with motion data from wearables across distinct evaluation tasks.
+
+
+
+
+
+
+ + ♻ ☆ Zero-Shot Relational Learning for Multimodal Knowledge Graphs + + +
+ Relational learning is an essential task in the domain of knowledge
+representation, particularly in knowledge graph completion (KGC). While
+relational learning in traditional single-modal settings has been extensively
+studied, exploring it within a multimodal KGC context presents distinct
+challenges and opportunities. One of the major challenges is inference on newly
+discovered relations without any associated training data. This zero-shot
+relational learning scenario poses unique requirements for multimodal KGC,
+i.e., utilizing multimodality to facilitate relational learning. However,
+existing works fail to support the leverage of multimodal information and leave
+the problem unexplored. In this paper, we propose a novel end-to-end framework,
+consisting of three components, i.e., multimodal learner, structure
+consolidator, and relation embedding generator, to integrate diverse multimodal
+information and knowledge graph structures to facilitate the zero-shot
+relational learning. Evaluation results on three multimodal knowledge graphs
+demonstrate the superior performance of our proposed method.
+
+
+
+ comment: In the Proceedings of the 2024 IEEE International Conference on Big + Data (IEEE BigData 2024) +
+
+
+
+
+ + ♻ ☆ COVID-19 Probability Prediction Using Machine Learning: An Infectious + Approach + + +
+ The ongoing COVID-19 pandemic continues to pose significant challenges to +global public health, despite the widespread availability of vaccines. Early +detection of the disease remains paramount in curbing its transmission and +mitigating its impact on public health systems. In response, this study delves +into the application of advanced machine learning (ML) techniques for +predicting COVID-19 infection probability. We conducted a rigorous +investigation into the efficacy of various ML models, including XGBoost, LGBM, +AdaBoost, Logistic Regression, Decision Tree, RandomForest, CatBoost, KNN, and +Deep Neural Networks (DNN). Leveraging a dataset comprising 4000 samples, with +3200 allocated for training and 800 for testing, our experiment offers +comprehensive insights into the performance of these models in COVID-19 +prediction. Our findings reveal that Deep Neural Networks (DNN) emerge as the +top-performing model, exhibiting superior accuracy and recall metrics. With an +impressive accuracy rate of 89%, DNN demonstrates remarkable potential in early +COVID-19 detection. This underscores the efficacy of deep learning approaches +in leveraging complex data patterns to identify COVID-19 infections accurately. +This study underscores the critical role of machine learning, particularly deep +learning methodologies, in augmenting early detection efforts amidst the +ongoing pandemic. The success of DNN in accurately predicting COVID-19 +infection probability highlights the importance of continued research and +development in leveraging advanced technologies to combat infectious diseases. + +
+
+
+
+
+ + ♻ ☆ Leveraging Visibility Graphs for Enhanced Arrhythmia Classification with + Graph Convolutional Networks + + +
+ Arrhythmias, detectable through electrocardiograms (ECGs), pose significant +health risks, underscoring the need for accurate and efficient automated +detection techniques. While recent advancements in graph-based methods have +demonstrated potential to enhance arrhythmia classification, the challenge lies +in effectively representing ECG signals as graphs. This study investigates the +use of Visibility Graph (VG) and Vector Visibility Graph (VVG) representations +combined with Graph Convolutional Networks (GCNs) for arrhythmia classification +under the ANSI/AAMI standard, ensuring reproducibility and fair comparison with +other techniques. Through extensive experiments on the MIT-BIH dataset, we +evaluate various GCN architectures and preprocessing parameters. Our findings +demonstrate that VG and VVG mappings enable GCNs to classify arrhythmias +directly from raw ECG signals, without the need for preprocessing or noise +removal. Notably, VG offers superior computational efficiency, while VVG +delivers enhanced classification performance by leveraging additional lead +features. The proposed approach outperforms baseline methods in several +metrics, although challenges persist in classifying the supraventricular +ectopic beat (S) class, particularly under the inter-patient paradigm. + +
+
+
+
+
+ + ♻ ☆ Breast Cancer Classification Using Gradient Boosting Algorithms Focusing + on Reducing the False Negative and SHAP for Explainability + + +
+ Cancer is one of the diseases that kill the most women in the world, with
+breast cancer being responsible for the highest number of cancer cases and
+consequently deaths. However, it can be prevented by early detection and,
+consequently, early treatment. Any development for the detection or prediction
+of this kind of cancer is important for a healthier life. Many studies focus on
+a model with high accuracy in cancer prediction, but sometimes accuracy alone
+may not always be a reliable metric. This study implies an investigative
+approach to studying the performance of different machine learning algorithms
+based on boosting to predict breast cancer focusing on the recall metric.
+Boosting machine learning algorithms has been proven to be an effective tool
+for detecting medical diseases. The dataset of the University of California,
+Irvine (UCI) repository has been utilized to train and test the model
+classifier that contains their attributes. The main objective of this study is
+to use state-of-the-art boosting algorithms such as AdaBoost, XGBoost, CatBoost
+and LightGBM to predict and diagnose breast cancer and to find the most
+effective metric regarding recall, ROC-AUC, and confusion matrix. Furthermore,
+our study is the first to use these four boosting algorithms with Optuna, a
+library for hyperparameter optimization, and the SHAP method to improve the
+interpretability of our model, which can be used as a support to identify and
+predict breast cancer. We were able to improve AUC or recall for all the models
+and reduce the false negatives for AdaBoost and LightGBM; the final AUC was
+more than 99.41\% for all models.
+
+
+
+ comment: 9 pages, 16 figures +
+
+
+
+
+ + ♻ ☆ CGGM: A conditional graph generation model with adaptive sparsity for + node anomaly detection in IoT networks + + +
+ Dynamic graphs are extensively employed for detecting anomalous behavior in
+nodes within the Internet of Things (IoT). Graph generative models are often
+used to address the issue of imbalanced node categories in dynamic graphs.
+Nevertheless, the constraints they face include the monotonicity of adjacency
+relationships, the difficulty in constructing multi-dimensional features for
+nodes, and the lack of a method for end-to-end generation of multiple
+categories of nodes. In this paper, we propose a novel graph generation model,
+called CGGM, specifically for generating samples belonging to the minority
+class. The framework consists of two core modules: a conditional graph generation
+module and a graph-based anomaly detection module. The generative module adapts
+to the sparsity of the matrix by downsampling a noise adjacency matrix, and
+incorporates a multi-dimensional feature encoder based on multi-head
+self-attention to capture latent dependencies among features. Additionally, a
+latent space constraint is combined with the distribution distance to
+approximate the latent distribution of real data. The graph-based anomaly
+detection module utilizes the generated balanced dataset to predict the node
+behaviors. Extensive experiments have shown that CGGM outperforms the
+state-of-the-art methods in terms of accuracy and divergence. The results also
+demonstrate CGGM can generate diverse data categories, enhancing the
+performance of the multi-category classification task.
+
+
+ comment: 10 pages, 19 figures +
+
+
+
+
+ + ♻ ☆ Mitigating Unsafe Feedback with Learning Constraints + + +
+ While there has been progress towards aligning Large Language Models (LLMs)
+with human values and ensuring safe behaviour at inference time, safety-guards
+can easily be removed when fine-tuned on unsafe and harmful datasets. While this
+setting has been treated extensively, another popular training paradigm,
+learning from unsafe feedback with reinforcement learning, has previously been
+unexplored. This is concerning due to the widespread deployment of feedback
+collection systems. We address this gap by providing an analysis of learning
+settings where feedback is adversarial and noisy, i.e. that unsafe samples are
+preferred over safe ones despite model developers' goal to maintain safety. We
+find that safety-aligned LLMs easily explore unsafe action spaces through
+generating harmful text and optimize for adversarial reward, indicating that
+current safety guards are not enough to prevent learning from unsafe feedback.
+In order to protect against this vulnerability, we adapt a number of both
+"implicit" and "explicit" harmful fine-tuning defences to evaluate whether they
+are effective as learning constraints in an RL setting, finding that no method
+is generally effective, pointing to the need for more research in defences given
+the widespread adoption of methods designed to learn from feedback. We end the
+paper with the observation that some defences work by performing "harmless
+reward hacking" for which we provide a theoretical explanation drawn from the
+theory of Constrained Markov Decision Processes and provide some direction for
+future defence development.
+
+
+
+
+
+
+
+
+ + Multimedia 7 + +
+
+
+ + ☆ SPICE: Smart Projection Interface for Cooking Enhancement + + +
+ Tangible User Interfaces (TUI) for human--computer interaction (HCI) provide +the user with physical representations of digital information with the aim to +overcome the limitations of screen-based interfaces. Although many compelling +demonstrations of TUIs exist in the literature, there is a lack of research on +TUIs intended for daily two-handed tasks and processes, such as cooking. In +response to this gap, we propose SPICE (Smart Projection Interface for Cooking +Enhancement). SPICE investigates TUIs in a kitchen setting, aiming to transform +the recipe following experience from simply text-based to tangibly interactive. +SPICE includes a tracking system, an agent-based software, and vision large +language models to create and interpret a kitchen environment where recipe +information is projected directly onto the cooking surface. We conducted a +comparative usability study of SPICE and text-based recipe following with 30 +participants, assessing the task difficulty, total duration, and efficiency, as +well as user confidence and taste perception. The results indicate that SPICE +allowed participants to perform the recipe with less stops and in shorter time +while also improving self-reported efficiency, confidence, and taste. Despite +this, participants self-reported no change in overall difficulty, which is a +direction for future research. Overall, the SPICE project demonstrates the +potential of using TUIs to improve everyday activities, paving the way for +future research in HCI and new computing interfaces. + +
+
+ comment: Article submitted to IUI 2025 +
+
+
+
+
+ + Who Brings the Frisbee: Probing Hidden Hallucination Factors in Large + Vision-Language Model via Causality Analysis WACV2025 + + +
+ Recent advancements in large vision-language models (LVLM) have significantly +enhanced their ability to comprehend visual inputs alongside natural language. +However, a major challenge in their real-world application is hallucination, +where LVLMs generate non-existent visual elements, eroding user trust. The +underlying mechanism driving this multimodal hallucination is poorly +understood. Minimal research has illuminated whether contexts such as sky, +tree, or grass field involve the LVLM in hallucinating a frisbee. We +hypothesize that hidden factors, such as objects, contexts, and semantic +foreground-background structures, induce hallucination. This study proposes a +novel causal approach: a hallucination probing system to identify these hidden +factors. By analyzing the causality between images, text prompts, and network +saliency, we systematically explore interventions to block these factors. Our +experimental findings show that a straightforward technique based on our +analysis can significantly reduce hallucinations. Additionally, our analyses +indicate the potential to edit network internals to minimize hallucinated +outputs. + +
+
+ comment: Accepted by WACV2025 +
+
+
+
+
+ + ☆ Personalizing Multimodal Large Language Models for Image Captioning: An + Experimental Analysis ECCV 2024 + + +
+ The task of image captioning demands an algorithm to generate natural +language descriptions of visual inputs. Recent advancements have seen a +convergence between image captioning research and the development of Large +Language Models (LLMs) and Multimodal LLMs -- like GPT-4V and Gemini -- which +extend the capabilities of text-only LLMs to multiple modalities. This paper +investigates whether Multimodal LLMs can supplant traditional image captioning +networks by evaluating their performance on various image description +benchmarks. We explore both the zero-shot capabilities of these models and +their adaptability to different semantic domains through fine-tuning methods, +including prompt learning, prefix tuning, and low-rank adaptation. Our results +demonstrate that while Multimodal LLMs achieve impressive zero-shot +performance, fine-tuning for specific domains while maintaining their +generalization capabilities intact remains challenging. We discuss the +implications of these findings for future research in image captioning and the +development of more adaptable Multimodal LLMs. + +
+
+ comment: ECCV 2024 Workshop on Green Foundation Models +
+
+
+
+
+ + ♻ ☆ FLOAT: Generative Motion Latent Flow Matching for Audio-driven Talking + Portrait + + +
+ With the rapid advancement of diffusion-based generative models, portrait +image animation has achieved remarkable results. However, it still faces +challenges in temporally consistent video generation and fast sampling due to +its iterative sampling nature. This paper presents FLOAT, an audio-driven +talking portrait video generation method based on flow matching generative +model. We shift the generative modeling from the pixel-based latent space to a +learned motion latent space, enabling efficient design of temporally consistent +motion. To achieve this, we introduce a transformer-based vector field +predictor with a simple yet effective frame-wise conditioning mechanism. +Additionally, our method supports speech-driven emotion enhancement, enabling a +natural incorporation of expressive motions. Extensive experiments demonstrate +that our method outperforms state-of-the-art audio-driven talking portrait +methods in terms of visual quality, motion fidelity, and efficiency. + +
+
+ comment: Project page: https://deepbrainai-research.github.io/float/ +
+
+
+
+
+ + ♻ ☆ Once-for-All: Controllable Generative Image Compression with Dynamic + Granularity Adaption + + +
+ Although recent generative image compression methods have demonstrated +impressive potential in optimizing the rate-distortion-perception trade-off, +they still face the critical challenge of flexible rate adaption to diverse +compression necessities and scenarios. To overcome this challenge, this paper +proposes a Controllable Generative Image Compression framework, termed +Control-GIC, the first capable of fine-grained bitrate adaption across a broad +spectrum while ensuring high-fidelity and generality compression. Control-GIC +is grounded in a VQGAN framework that encodes an image as a sequence of +variable-length codes (i.e. VQ-indices), which can be losslessly compressed and +exhibits a direct positive correlation with the bitrates. Drawing inspiration +from the classical coding principle, we correlate the information density of +local image patches with their granular representations. Hence, we can flexibly +determine a proper allocation of granularity for the patches to achieve dynamic +adjustment for VQ-indices, resulting in desirable compression rates. We further +develop a probabilistic conditional decoder capable of retrieving historic +encoded multi-granularity representations according to transmitted codes, and +then reconstruct hierarchical granular features in the formalization of +conditional probability, enabling more informative aggregation to improve +reconstruction realism. Our experiments show that Control-GIC allows highly +flexible and controllable bitrate adaption where the results demonstrate its +superior performance over recent state-of-the-art methods. + +
+
+
+
+
+ + ♻ ☆ Zero-Shot Relational Learning for Multimodal Knowledge Graphs + + +
+ Relational learning is an essential task in the domain of knowledge
+representation, particularly in knowledge graph completion (KGC). While
+relational learning in traditional single-modal settings has been extensively
+studied, exploring it within a multimodal KGC context presents distinct
+challenges and opportunities. One of the major challenges is inference on newly
+discovered relations without any associated training data. This zero-shot
+relational learning scenario poses unique requirements for multimodal KGC,
+i.e., utilizing multimodality to facilitate relational learning. However,
+existing works fail to support the leverage of multimodal information and leave
+the problem unexplored. In this paper, we propose a novel end-to-end framework,
+consisting of three components, i.e., multimodal learner, structure
+consolidator, and relation embedding generator, to integrate diverse multimodal
+information and knowledge graph structures to facilitate the zero-shot
+relational learning. Evaluation results on three multimodal knowledge graphs
+demonstrate the superior performance of our proposed method.
+
+
+ comment: In the Proceedings of the 2024 IEEE International Conference on Big + Data (IEEE BigData 2024) +
+
+
+
+
+ + ♻ ☆ PerceiverS: A Multi-Scale Perceiver with Effective Segmentation for + Long-Term Expressive Symbolic Music Generation + + +
+ AI-based music generation has progressed significantly in recent years. +However, creating symbolic music that is both long-structured and expressive +remains a considerable challenge. In this paper, we propose PerceiverS +(Segmentation and Scale), a novel architecture designed to address this issue +by leveraging both Effective Segmentation and Multi-Scale attention mechanisms. +Our approach enhances symbolic music generation by simultaneously learning +long-term structural dependencies and short-term expressive details. By +combining cross-attention and self-attention in a Multi-Scale setting, +PerceiverS captures long-range musical structure while preserving musical +diversity. The proposed model has been evaluated using the Maestro dataset and +has demonstrated improvements in generating music of conventional length with +expressive nuances. The project demos and the generated music samples can be +accessed through the link: https://perceivers.github.io + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 150 + +
+
+
+ + ☆ Motion Prompting: Controlling Video Generation with Motion Trajectories + + +
+ Motion control is crucial for generating expressive and compelling video +content; however, most existing video generation models rely mainly on text +prompts for control, which struggle to capture the nuances of dynamic actions +and temporal compositions. To this end, we train a video generation model +conditioned on spatio-temporally sparse or dense motion trajectories. In +contrast to prior motion conditioning work, this flexible representation can +encode any number of trajectories, object-specific or global scene motion, and +temporally sparse motion; due to its flexibility we refer to this conditioning +as motion prompts. While users may directly specify sparse trajectories, we +also show how to translate high-level user requests into detailed, semi-dense +motion prompts, a process we term motion prompt expansion. We demonstrate the +versatility of our approach through various applications, including camera and +object motion control, "interacting" with an image, motion transfer, and image +editing. Our results showcase emergent behaviors, such as realistic physics, +suggesting the potential of motion prompts for probing video models and +interacting with future generative world models. Finally, we evaluate +quantitatively, conduct a human study, and demonstrate strong performance. +Video results are available on our webpage: https://motion-prompting.github.io/ + +
+
+ comment: Project page: https://motion-prompting.github.io/ +
+
+
+
+
+ + ☆ Diffusion-based Visual Anagram as Multi-task Learning WACV 2025 + + +
+ Visual anagrams are images that change appearance upon transformation, like +flipping or rotation. With the advent of diffusion models, generating such +optical illusions can be achieved by averaging noise across multiple views +during the reverse denoising process. However, we observe two critical failure +modes in this approach: (i) concept segregation, where concepts in different +views are independently generated, which can not be considered a true anagram, +and (ii) concept domination, where certain concepts overpower others. In this +work, we cast the visual anagram generation problem in a multi-task learning +setting, where different viewpoint prompts are analogous to different tasks,and +derive denoising trajectories that align well across tasks simultaneously. At +the core of our designed framework are two newly introduced techniques, where +(i) an anti-segregation optimization strategy that promotes overlap in +cross-attention maps between different concepts, and (ii) a noise vector +balancing method that adaptively adjusts the influence of different tasks. +Additionally, we observe that directly averaging noise predictions yields +suboptimal performance because statistical properties may not be preserved, +prompting us to derive a noise variance rectification method. Extensive +qualitative and quantitative experiments demonstrate our method's superior +ability to generate visual anagrams spanning diverse concepts. + +
+
+ comment: WACV 2025. Code is publicly available at + https://github.com/Pixtella/Anagram-MTL +
+
+
+
+
+ + ☆ Taming Scalable Visual Tokenizer for Autoregressive Image Generation + + +
+ Existing vector quantization (VQ) methods struggle with scalability, largely +attributed to the instability of the codebook that undergoes partial updates +during training. The codebook is prone to collapse as utilization decreases, +due to the progressively widening distribution gap between non-activated codes +and visual features. To solve the problem, we propose Index Backpropagation +Quantization (IBQ), a new VQ method for the joint optimization of all codebook +embeddings and the visual encoder. Applying a straight-through estimator on the +one-hot categorical distribution between the encoded feature and codebook, all +codes are differentiable and maintain a consistent latent space with the visual +encoder. IBQ enables scalable training of visual tokenizers and, for the first +time, achieves a large-scale codebook ($2^{18}$) with high dimension ($256$) +and high utilization. Experiments on the standard ImageNet benchmark +demonstrate the scalability and superiority of IBQ, achieving competitive +results on both reconstruction ($1.00$ rFID) and autoregressive visual +generation ($2.05$ gFID). The code and models are available at +https://github.com/TencentARC/SEED-Voken. + +
+
+
+
+
+ + ☆ FoundHand: Large-Scale Domain-Specific Learning for Controllable Hand + Image Generation + + +
+ Despite remarkable progress in image generation models, generating realistic +hands remains a persistent challenge due to their complex articulation, varying +viewpoints, and frequent occlusions. We present FoundHand, a large-scale +domain-specific diffusion model for synthesizing single and dual hand images. +To train our model, we introduce FoundHand-10M, a large-scale hand dataset with +2D keypoints and segmentation mask annotations. Our insight is to use 2D hand +keypoints as a universal representation that encodes both hand articulation and +camera viewpoint. FoundHand learns from image pairs to capture physically +plausible hand articulations, natively enables precise control through 2D +keypoints, and supports appearance control. Our model exhibits core +capabilities that include the ability to repose hands, transfer hand +appearance, and even synthesize novel views. This leads to zero-shot +capabilities for fixing malformed hands in previously generated images, or +synthesizing hand video sequences. We present extensive experiments and +evaluations that demonstrate state-of-the-art performance of our method. + +
+
+
+
+
+ + ☆ SNOOPI: Supercharged One-step Diffusion Distillation with Proper + Guidance + + +
+ Recent approaches have yielded promising results in distilling multi-step +text-to-image diffusion models into one-step ones. The state-of-the-art +efficient distillation technique, i.e., SwiftBrushv2 (SBv2), even surpasses the +teacher model's performance with limited resources. However, our study reveals +its instability when handling different diffusion model backbones due to using +a fixed guidance scale within the Variational Score Distillation (VSD) loss. +Another weakness of the existing one-step diffusion models is the missing +support for negative prompt guidance, which is crucial in practical image +generation. This paper presents SNOOPI, a novel framework designed to address +these limitations by enhancing the guidance in one-step diffusion models during +both training and inference. First, we effectively enhance training stability +through Proper Guidance-SwiftBrush (PG-SB), which employs a random-scale +classifier-free guidance approach. By varying the guidance scale of both +teacher models, we broaden their output distributions, resulting in a more +robust VSD loss that enables SB to perform effectively across diverse backbones +while maintaining competitive performance. Second, we propose a training-free +method called Negative-Away Steer Attention (NASA), which integrates negative +prompts into one-step diffusion models via cross-attention to suppress +undesired elements in generated images. Our experimental results show that our +proposed methods significantly improve baseline models across various metrics. +Remarkably, we achieve an HPSv2 score of 31.08, setting a new state-of-the-art +benchmark for one-step diffusion models. + +
+
+ comment: 18 pages, 9 figures +
+
+
+
+
+ + ☆ AniGS: Animatable Gaussian Avatar from a Single Image with Inconsistent + Gaussian Reconstruction + + +
+ Generating animatable human avatars from a single image is essential for +various digital human modeling applications. Existing 3D reconstruction methods +often struggle to capture fine details in animatable models, while generative +approaches for controllable animation, though avoiding explicit 3D modeling, +suffer from viewpoint inconsistencies in extreme poses and computational +inefficiencies. In this paper, we address these challenges by leveraging the +power of generative models to produce detailed multi-view canonical pose +images, which help resolve ambiguities in animatable human reconstruction. We +then propose a robust method for 3D reconstruction of inconsistent images, +enabling real-time rendering during inference. Specifically, we adapt a +transformer-based video generation model to generate multi-view canonical pose +images and normal maps, pretraining on a large-scale video dataset to improve +generalization. To handle view inconsistencies, we recast the reconstruction +problem as a 4D task and introduce an efficient 3D modeling approach using 4D +Gaussian Splatting. Experiments demonstrate that our method achieves +photorealistic, real-time animation of 3D human avatars from in-the-wild +images, showcasing its effectiveness and generalization capability. + +
+
+ comment: Project Page: https://lingtengqiu.github.io/2024/AniGS/ +
+
+
+
+
+ + ☆ Planning-Guided Diffusion Policy Learning for Generalizable Contact-Rich + Bimanual Manipulation + + +
+ Contact-rich bimanual manipulation involves precise coordination of two arms +to change object states through strategically selected contacts and motions. +Due to the inherent complexity of these tasks, acquiring sufficient +demonstration data and training policies that generalize to unseen scenarios +remain a largely unresolved challenge. Building on recent advances in planning +through contacts, we introduce Generalizable Planning-Guided Diffusion Policy +Learning (GLIDE), an approach that effectively learns to solve contact-rich +bimanual manipulation tasks by leveraging model-based motion planners to +generate demonstration data in high-fidelity physics simulation. Through +efficient planning in randomized environments, our approach generates +large-scale and high-quality synthetic motion trajectories for tasks involving +diverse objects and transformations. We then train a task-conditioned diffusion +policy via behavior cloning using these demonstrations. To tackle the +sim-to-real gap, we propose a set of essential design options in feature +extraction, task representation, action prediction, and data augmentation that +enable learning robust prediction of smooth action sequences and generalization +to unseen scenarios. Through experiments in both simulation and the real world, +we demonstrate that our approach can enable a bimanual robotic system to +effectively manipulate objects of diverse geometries, dimensions, and physical +properties. Website: https://glide-manip.github.io/ + +
+
+
+
+
+ + ☆ A Bidirectional Long Short Term Memory Approach for Infrastructure + Health Monitoring Using On-board Vibration Response + + +
+ The growing volume of available infrastructural monitoring data enables the +development of powerful datadriven approaches to estimate infrastructure health +conditions using direct measurements. This paper proposes a deep learning +methodology to estimate infrastructure physical parameters, such as railway +track stiffness, using drive-by vibration response signals. The proposed method +employs a Long Short-term Memory (LSTM) feature extractor accounting for +temporal dependencies in the feature extraction phase, and a bidirectional Long +Short-term Memory (BiLSTM) networks to leverage bidirectional temporal +dependencies in both the forward and backward paths of the drive-by vibration +response in condition estimation phase. Additionally, a framing approach is +employed to enhance the resolution of the monitoring task to the beam level by +segmenting the vibration signal into frames equal to the distance between +individual beams, centering the frames over the beam nodes. The proposed +LSTM-BiLSTM model offers a versatile tool for various bridge and railway +infrastructure conditions monitoring using direct drive-by vibration response +measurements. The results demonstrate the potential of incorporating temporal +analysis in the feature extraction phase and emphasize the pivotal role of +bidirectional temporal information in infrastructure health condition +estimation. The proposed methodology can accurately and automatically estimate +railway track stiffness and identify local stiffness reductions in the presence +of noise using drive-by measurements. An illustrative case study of +vehicle-track interaction simulation is used to demonstrate the performance of +the proposed model, achieving a maximum mean absolute percentage error of 1.7% +and 0.7% in estimating railpad and ballast stiffness, respectively. + +
+
+ comment: 17 pages; Accepted for the presentation at Transportation Research + Board (TRB) Annual Meeting, and under review in the Journal of Transportation + Research Record (TRR) +
+
+
+
+
+ + ☆ Robust soybean seed yield estimation using high-throughput ground robot + videos + + +
+ We present a novel method for soybean (Glycine max (L.) Merr.) yield +estimation leveraging high throughput seed counting via computer vision and +deep learning techniques. Traditional methods for collecting yield data are +labor-intensive, costly, prone to equipment failures at critical data +collection times, and require transportation of equipment across field sites. +Computer vision, the field of teaching computers to interpret visual data, +allows us to extract detailed yield information directly from images. By +treating it as a computer vision task, we report a more efficient alternative, +employing a ground robot equipped with fisheye cameras to capture comprehensive +videos of soybean plots from which images are extracted in a variety of +development programs. These images are processed through the P2PNet-Yield +model, a deep learning framework where we combined a Feature Extraction Module +(the backbone of the P2PNet-Soy) and a Yield Regression Module to estimate seed +yields of soybean plots. Our results are built on three years of yield testing +plot data - 8500 in 2021, 2275 in 2022, and 650 in 2023. With these datasets, +our approach incorporates several innovations to further improve the accuracy +and generalizability of the seed counting and yield estimation architecture, +such as the fisheye image correction and data augmentation with random sensor +effects. The P2PNet-Yield model achieved a genotype ranking accuracy score of +up to 83%. It demonstrates up to a 32% reduction in time to collect yield data +as well as costs associated with traditional yield estimation, offering a +scalable solution for breeding programs and agricultural productivity +enhancement. + +
+
+ comment: 23 pages, 12 figures, 2 tables +
+
+
+
+
+ + ☆ MetaShadow: Object-Centered Shadow Detection, Removal, and Synthesis + + +
+ Shadows are often under-considered or even ignored in image editing +applications, limiting the realism of the edited results. In this paper, we +introduce MetaShadow, a three-in-one versatile framework that enables +detection, removal, and controllable synthesis of shadows in natural images in +an object-centered fashion. MetaShadow combines the strengths of two +cooperative components: Shadow Analyzer, for object-centered shadow detection +and removal, and Shadow Synthesizer, for reference-based controllable shadow +synthesis. Notably, we optimize the learning of the intermediate features from +Shadow Analyzer to guide Shadow Synthesizer to generate more realistic shadows +that blend seamlessly with the scene. Extensive evaluations on multiple shadow +benchmark datasets show significant improvements of MetaShadow over the +existing state-of-the-art methods on object-centered shadow detection, removal, +and synthesis. MetaShadow excels in image-editing tasks such as object removal, +relocation, and insertion, pushing the boundaries of object-centered image +editing. + +
+
+
+
+
+ + ☆ Scaling Image Tokenizers with Grouped Spherical Quantization + + +
+ Vision tokenizers have gained a lot of attraction due to their scalability +and compactness; previous works depend on old-school GAN-based hyperparameters, +biased comparisons, and a lack of comprehensive analysis of the scaling +behaviours. To tackle those issues, we introduce Grouped Spherical Quantization +(GSQ), featuring spherical codebook initialization and lookup regularization to +constrain codebook latent to a spherical surface. Our empirical analysis of +image tokenizer training strategies demonstrates that GSQ-GAN achieves superior +reconstruction quality over state-of-the-art methods with fewer training +iterations, providing a solid foundation for scaling studies. Building on this, +we systematically examine the scaling behaviours of GSQ, specifically in latent +dimensionality, codebook size, and compression ratios, and their impact on +model performance. Our findings reveal distinct behaviours at high and low +spatial compression levels, underscoring challenges in representing +high-dimensional latent spaces. We show that GSQ can restructure +high-dimensional latent into compact, low-dimensional spaces, thus enabling +efficient scaling with improved quality. As a result, GSQ-GAN achieves a 16x +down-sampling with a reconstruction FID (rFID) of 0.50. + +
+
+
+
+
+ + ☆ Sharp-It: A Multi-view to Multi-view Diffusion Model for 3D Synthesis + and Manipulation + + +
+ Advancements in text-to-image diffusion models have led to significant +progress in fast 3D content creation. One common approach is to generate a set +of multi-view images of an object, and then reconstruct it into a 3D model. +However, this approach bypasses the use of a native 3D representation of the +object and is hence prone to geometric artifacts and limited in controllability +and manipulation capabilities. An alternative approach involves native 3D +generative models that directly produce 3D representations. These models, +however, are typically limited in their resolution, resulting in lower quality +3D objects. In this work, we bridge the quality gap between methods that +directly generate 3D representations and ones that reconstruct 3D objects from +multi-view images. We introduce a multi-view to multi-view diffusion model +called Sharp-It, which takes a 3D consistent set of multi-view images rendered +from a low-quality object and enriches its geometric details and texture. The +diffusion model operates on the multi-view set in parallel, in the sense that +it shares features across the generated views. A high-quality 3D model can then +be reconstructed from the enriched multi-view set. By leveraging the advantages +of both 2D and 3D approaches, our method offers an efficient and controllable +method for high-quality 3D content creation. We demonstrate that Sharp-It +enables various 3D applications, such as fast synthesis, editing, and +controlled generation, while attaining high-quality assets. + +
+
+ comment: Project page at https://yiftachede.github.io/Sharp-It/ +
+
+
+
+
+ + ☆ Continual Learning of Personalized Generative Face Models with + Experience Replay WACV 2025 + + +
+ We introduce a novel continual learning problem: how to sequentially update +the weights of a personalized 2D and 3D generative face model as new batches of +photos in different appearances, styles, poses, and lighting are captured +regularly. We observe that naive sequential fine-tuning of the model leads to +catastrophic forgetting of past representations of the individual's face. We +then demonstrate that a simple random sampling-based experience replay method +is effective at mitigating catastrophic forgetting when a relatively large +number of images can be stored and replayed. However, for long-term deployment +of these models with relatively smaller storage, this simple random +sampling-based replay technique also forgets past representations. Thus, we +introduce a novel experience replay algorithm that combines random sampling +with StyleGAN's latent space to represent the buffer as an optimal convex hull. +We observe that our proposed convex hull-based experience replay is more +effective in preventing forgetting than a random sampling baseline and the +lower bound. + +
+
+ comment: Accepted to WACV 2025. Project page (incl. supplementary materials): + https://anniedde.github.io/personalizedcontinuallearning.github.io/ +
+
+
+
+
+ + ☆ Improving Dynamic Object Interactions in Text-to-Video Generation with + AI Feedback + + +
+ Large text-to-video models hold immense potential for a wide range of +downstream applications. However, these models struggle to accurately depict +dynamic object interactions, often resulting in unrealistic movements and +frequent violations of real-world physics. One solution inspired by large +language models is to align generated outputs with desired outcomes using +external feedback. This enables the model to refine its responses autonomously, +eliminating extensive manual data collection. In this work, we investigate the +use of feedback to enhance the object dynamics in text-to-video models. We aim +to answer a critical question: what types of feedback, paired with which +specific self-improvement algorithms, can most effectively improve text-video +alignment and realistic object interactions? We begin by deriving a unified +probabilistic objective for offline RL finetuning of text-to-video models. This +perspective highlights how design elements in existing algorithms like KL +regularization and policy projection emerge as specific choices within a +unified framework. We then use derived methods to optimize a set of text-video +alignment metrics (e.g., CLIP scores, optical flow), but notice that they often +fail to align with human perceptions of generation quality. To address this +limitation, we propose leveraging vision-language models to provide more +nuanced feedback specifically tailored to object dynamics in videos. Our +experiments demonstrate that our method can effectively optimize a wide variety +of rewards, with binary AI feedback driving the most significant improvements +in video quality for dynamic interactions, as confirmed by both AI and human +evaluations. Notably, we observe substantial gains when using reward signals +derived from AI feedback, particularly in scenarios involving complex +interactions between multiple objects and realistic depictions of objects +falling. + +
+
+ comment: Website: https://sites.google.com/view/aif-dynamic-t2v/ +
+
+
+
+
+ + ☆ AV-Odyssey Bench: Can Your Multimodal LLMs Really Understand + Audio-Visual Information? + + +
+ Recently, multimodal large language models (MLLMs), such as GPT-4o, Gemini +1.5 Pro, and Reka Core, have expanded their capabilities to include vision and +audio modalities. While these models demonstrate impressive performance across +a wide range of audio-visual applications, our proposed DeafTest reveals that +MLLMs often struggle with simple tasks humans find trivial: 1) determining +which of two sounds is louder, and 2) determining which of two sounds has a +higher pitch. Motivated by these observations, we introduce AV-Odyssey Bench, a +comprehensive audio-visual benchmark designed to assess whether those MLLMs can +truly understand the audio-visual information. This benchmark encompasses 4,555 +carefully crafted problems, each incorporating text, visual, and audio +components. To successfully infer answers, models must effectively leverage +clues from both visual and audio inputs. To ensure precise and objective +evaluation of MLLM responses, we have structured the questions as +multiple-choice, eliminating the need for human evaluation or LLM-assisted +assessment. We benchmark a series of closed-source and open-source models and +summarize the observations. By revealing the limitations of current models, we +aim to provide useful insight for future dataset collection and model +development. + +
+
+ comment: Project page: https://av-odyssey.github.io/ +
+
+
+
+
+ + MERGE: Multi-faceted Hierarchical Graph-based GNN for Gene Expression + Prediction from Whole Slide Histopathology Images + + +
+ Recent advances in Spatial Transcriptomics (ST) pair histology images with +spatially resolved gene expression profiles, enabling predictions of gene +expression across different tissue locations based on image patches. This opens +up new possibilities for enhancing whole slide image (WSI) prediction tasks +with localized gene expression. However, existing methods fail to fully +leverage the interactions between different tissue locations, which are crucial +for accurate joint prediction. To address this, we introduce MERGE +(Multi-faceted hiErarchical gRaph for Gene Expressions), which combines a +multi-faceted hierarchical graph construction strategy with graph neural +networks (GNN) to improve gene expression predictions from WSIs. By clustering +tissue image patches based on both spatial and morphological features, and +incorporating intra- and inter-cluster edges, our approach fosters interactions +between distant tissue locations during GNN learning. As an additional +contribution, we evaluate different data smoothing techniques that are +necessary to mitigate artifacts in ST data, often caused by technical +imperfections. We advocate for adopting gene-aware smoothing methods that are +more biologically justified. Experimental results on gene expression prediction +show that our GNN method outperforms state-of-the-art techniques across +multiple metrics. + +
+
+ comment: Main Paper: 8 pages, Supplementary Material: 9 pages, Figures: 16 +
+
+
+
+
+ + ☆ Class-wise Autoencoders Measure Classification Difficulty And Detect + Label Mistakes + + +
+ We introduce a new framework for analyzing classification datasets based on +the ratios of reconstruction errors between autoencoders trained on individual +classes. This analysis framework enables efficient characterization of datasets +on the sample, class, and entire dataset levels. We define reconstruction error +ratios (RERs) that probe classification difficulty and allow its decomposition +into (1) finite sample size and (2) Bayes error and decision-boundary +complexity. Through systematic study across 19 popular visual datasets, we find +that our RER-based dataset difficulty probe strongly correlates with error rate +for state-of-the-art (SOTA) classification models. By interpreting sample-level +classification difficulty as a label mistakenness score, we further find that +RERs achieve SOTA performance on mislabel detection tasks on hard datasets +under symmetric and asymmetric label noise. Our code is publicly available at +https://github.com/voxel51/reconstruction-error-ratios. + +
+
+ comment: 30 pages, 18 figures +
+
+
+
+
+ + ☆ OCR Hinders RAG: Evaluating the Cascading Impact of OCR on + Retrieval-Augmented Generation + + +
+ Retrieval-augmented Generation (RAG) enhances Large Language Models (LLMs) by
+integrating external knowledge to reduce hallucinations and incorporate
+up-to-date information without retraining. As an essential part of RAG,
+external knowledge bases are commonly built by extracting structured data from
+unstructured PDF documents using Optical Character Recognition (OCR). However,
+given the imperfect prediction of OCR and the inherent non-uniform
+representation of structured data, knowledge bases inevitably contain various
+OCR noises. In this paper, we introduce OHRBench, the first benchmark for
+understanding the cascading impact of OCR on RAG systems. OHRBench includes 350
+carefully selected unstructured PDF documents from six real-world RAG
+application domains, along with Q&As derived from multimodal elements in
+documents, challenging existing OCR solutions used for RAG. To better understand
+OCR's impact on RAG systems, we identify two primary types of OCR noise:
+Semantic Noise and Formatting Noise and apply perturbation to generate a set of
+structured data with varying degrees of each OCR noise. Using OHRBench, we
+first conduct a comprehensive evaluation of current OCR solutions and reveal
+that none is competent for constructing high-quality knowledge bases for RAG
+systems. We then systematically evaluate the impact of these two noise types
+and demonstrate the vulnerability of RAG systems. Furthermore, we discuss the
+potential of employing Vision-Language Models (VLMs) without OCR in RAG
+systems. Code: https://github.com/opendatalab/OHR-Bench
+
+
</p>
+
+
+
+
+ + ☆ MedTet: An Online Motion Model for 4D Heart Reconstruction + + +
+ We present a novel approach to reconstruction of 3D cardiac motion from
+sparse intraoperative data. While existing methods can accurately reconstruct
+3D organ geometries from full 3D volumetric imaging, they cannot be used during
+surgical interventions where usually limited observed data, such as a few 2D
+frames or 1D signals, is available in real-time. We propose a versatile
+framework for reconstructing 3D motion from such partial data. It discretizes
+the 3D space into a deformable tetrahedral grid with signed distance values,
+providing implicit unlimited resolution while maintaining explicit control over
+motion dynamics. Given an initial 3D model reconstructed from pre-operative
+full volumetric data, our system, equipped with a universal observation
+encoder, can reconstruct coherent 3D cardiac motion from full 3D volumes, a few
+2D MRI slices or even 1D signals. Extensive experiments on cardiac intervention
+scenarios demonstrate our ability to generate plausible and anatomically
+consistent 3D motion reconstructions from various sparse real-time
+observations, highlighting its potential for multimodal cardiac imaging. Our
+code and model will be made available at https://github.com/Scalsol/MedTet.
+
+
</p>
+
+
+
+
+ + ☆ Copy-Move Forgery Detection and Question Answering for Remote Sensing + Image + + +
+ This paper introduces the task of Remote Sensing Copy-Move Question Answering +(RSCMQA). Unlike traditional Remote Sensing Visual Question Answering (RSVQA), +RSCMQA focuses on interpreting complex tampering scenarios and inferring +relationships between objects. Based on the practical needs of national defense +security and land resource monitoring, we have developed an accurate and +comprehensive global dataset for remote sensing image copy-move question +answering, named RS-CMQA-2.1M. These images were collected from 29 different +regions across 14 countries. Additionally, we have refined a balanced dataset, +RS-CMQA-B, to address the long-standing issue of long-tail data in the remote +sensing field. Furthermore, we propose a region-discriminative guided +multimodal CMQA model, which enhances the accuracy of answering questions about +tampered images by leveraging prompt about the differences and connections +between the source and tampered domains. Extensive experiments demonstrate that +our method provides a stronger benchmark for RS-CMQA compared to general VQA +and RSVQA models. Our dataset and code are available at +https://github.com/shenyedepisa/RSCMQA. + +
+
+ comment: 7 figs, 7 tables +
+
+
+
+
+ + ☆ Remote Sensing Temporal Vision-Language Models: A Comprehensive Survey + + +
+ Temporal image analysis in remote sensing has traditionally centered on +change detection, which identifies regions of change between images captured at +different times. However, change detection remains limited by its focus on +visual-level interpretation, often lacking contextual or descriptive +information. The rise of Vision-Language Models (VLMs) has introduced a new +dimension to remote sensing temporal image analysis by integrating visual +information with natural language, creating an avenue for advanced +interpretation of temporal image changes. Remote Sensing Temporal VLMs +(RSTVLMs) allow for dynamic interactions, generating descriptive captions, +answering questions, and providing a richer semantic understanding of temporal +images. This temporal vision-language capability is particularly valuable for +complex remote sensing applications, where higher-level insights are crucial. +This paper comprehensively reviews the progress of RSTVLM research, with a +focus on the latest VLM applications for temporal image analysis. We categorize +and discuss core methodologies, datasets, and metrics, highlight recent +advances in temporal vision-language tasks, and outline key challenges and +future directions for research in this emerging field. This survey fills a +critical gap in the literature by providing an integrated overview of RSTVLM, +offering a foundation for further advancements in remote sensing temporal image +understanding. We will keep tracing related works at +\url{https://github.com/Chen-Yang-Liu/Awesome-RS-Temporal-VLM} + +
+
+
+
+
+ + ☆ Segmentation of Coronary Artery Stenosis in X-ray Angiography using + Mamba Models + + +
+ Coronary artery disease stands as one of the primary contributors to global +mortality rates. The automated identification of coronary artery stenosis from +X-ray images plays a critical role in the diagnostic process for coronary heart +disease. This task is challenging due to the complex structure of coronary +arteries, intrinsic noise in X-ray images, and the fact that stenotic coronary +arteries appear narrow and blurred in X-ray angiographies. This study employs +five different variants of the Mamba-based model and one variant of the Swin +Transformer-based model, primarily based on the U-Net architecture, for the +localization of stenosis in Coronary artery disease. Our best results showed an +F1 score of 68.79% for the U-Mamba BOT model, representing an 11.8% improvement +over the semi-supervised approach. + +
+
+
+
+
+
+            ☆ SJTU: Spatial judgments in multimodal models towards unified segmentation
+            through coordinate detection
+
+
+
</p>
+ Despite advances in vision-language understanding, implementing image +segmentation within multimodal architectures remains a fundamental challenge in +modern artificial intelligence systems. Existing vision-language models, which +primarily rely on backbone architectures or CLIP-based embedding learning, +demonstrate inherent limitations in fine-grained spatial localization and +operational capabilities. This paper introduces SJTU: Spatial Judgments in +multimodal models - Towards Unified segmentation through coordinate detection, +a novel framework that leverages spatial coordinate understanding to bridge +vision-language interaction and precise segmentation, enabling accurate target +identification through natural language instructions. The framework proposes a +novel approach for integrating segmentation techniques with vision-language +models based on multimodal spatial inference. By leveraging normalized +coordinate detection for bounding boxes and translating it into actionable +segmentation outputs, we explore the possibility of integrating multimodal +spatial and language representations. Based on the proposed technical approach, +the framework demonstrates superior performance on various benchmark datasets +as well as accurate object segmentation. Results on the COCO 2017 dataset for +general object detection and Pascal VOC datasets for semantic segmentation +demonstrate the generalization capabilities of the framework. + +
+
+ comment: 15 pages, 3 figures +
+
+
+
+
+ + ☆ ShadowHack: Hacking Shadows via Luminance-Color Divide and Conquer + + +
+ Shadows introduce challenges such as reduced brightness, texture +deterioration, and color distortion in images, complicating a holistic +solution. This study presents \textbf{ShadowHack}, a divide-and-conquer +strategy that tackles these complexities by decomposing the original task into +luminance recovery and color remedy. To brighten shadow regions and repair the +corrupted textures in the luminance space, we customize LRNet, a U-shaped +network with a rectified outreach attention module, to enhance information +interaction and recalibrate contaminated attention maps. With luminance +recovered, CRNet then leverages cross-attention mechanisms to revive vibrant +colors, producing visually compelling results. Extensive experiments on +multiple datasets are conducted to demonstrate the superiority of ShadowHack +over existing state-of-the-art solutions both quantitatively and qualitatively, +highlighting the effectiveness of our design. Our code will be made publicly +available at https://github.com/lime-j/ShadowHack + +
+
+
+
+
+ + ☆ Unveiling Concept Attribution in Diffusion Models + + +
+ Diffusion models have shown remarkable abilities in generating realistic and +high-quality images from text prompts. However, a trained model remains +black-box; little do we know about the role of its components in exhibiting a +concept such as objects or styles. Recent works employ causal tracing to +localize layers storing knowledge in generative models without showing how +those layers contribute to the target concept. In this work, we approach the +model interpretability problem from a more general perspective and pose a +question: \textit{``How do model components work jointly to demonstrate +knowledge?''}. We adapt component attribution to decompose diffusion models, +unveiling how a component contributes to a concept. Our framework allows +effective model editing, in particular, we can erase a concept from diffusion +models by removing positive components while remaining knowledge of other +concepts. Surprisingly, we also show there exist components that contribute +negatively to a concept, which has not been discovered in the knowledge +localization approach. Experimental results confirm the role of positive and +negative components pinpointed by our framework, depicting a complete view of +interpreting generative models. Our code is available at +\url{https://github.com/mail-research/CAD-attribution4diffusion} + +
+
+
+
+
+ + ☆ LiDAR-based Registration against Georeferenced Models for Globally + Consistent Allocentric Maps + + +
+ Modern unmanned aerial vehicles (UAVs) are irreplaceable in search and rescue +(SAR) missions to obtain a situational overview or provide closeups without +endangering personnel. However, UAVs heavily rely on global navigation +satellite system (GNSS) for localization which works well in open spaces, but +the precision drastically degrades in the vicinity of buildings. These +inaccuracies hinder aggregation of diverse data from multiple sources in a +unified georeferenced frame for SAR operators. In contrast, CityGML models +provide approximate building shapes with accurate georeferenced poses. Besides, +LiDAR works best in the vicinity of 3D structures. Hence, we refine coarse GNSS +measurements by registering LiDAR maps against CityGML and digital elevation +map (DEM) models as a prior for allocentric mapping. An intuitive plausibility +score selects the best hypothesis based on occupancy using a 2D height map. +Afterwards, we integrate the registration results in a continuous-time +spline-based pose graph optimizer with LiDAR odometry and further sensing +modalities to obtain globally consistent, georeferenced trajectories and maps. +We evaluate the viability of our approach on multiple flights captured at two +distinct testing sites. Our method successfully reduced GNSS offset errors from +up-to 16 m to below 0.5 m on multiple flights. Furthermore, we obtain globally +consistent maps w.r.t. prior 3D geospatial models. + +
+
+ comment: Presented at IEEE International Symposium on Safety, Security, and + Rescue Robotics (SSRR), New York City, USA, November 2024 +
+
+
+
+
+ + ☆ Multimodal Remote Sensing Scene Classification Using VLMs and Dual-Cross + Attention Networks + + +
+ Remote sensing scene classification (RSSC) is a critical task with diverse +applications in land use and resource management. While unimodal image-based +approaches show promise, they often struggle with limitations such as high +intra-class variance and inter-class similarity. Incorporating textual +information can enhance classification by providing additional context and +semantic understanding, but manual text annotation is labor-intensive and +costly. In this work, we propose a novel RSSC framework that integrates text +descriptions generated by large vision-language models (VLMs) as an auxiliary +modality without incurring expensive manual annotation costs. To fully leverage +the latent complementarities between visual and textual data, we propose a dual +cross-attention-based network to fuse these modalities into a unified +representation. Extensive experiments with both quantitative and qualitative +evaluation across five RSSC datasets demonstrate that our framework +consistently outperforms baseline models. We also verify the effectiveness of +VLM-generated text descriptions compared to human-annotated descriptions. +Additionally, we design a zero-shot classification scenario to show that the +learned multimodal representation can be effectively utilized for unseen class +classification. This research opens new opportunities for leveraging textual +information in RSSC tasks and provides a promising multimodal fusion structure, +offering insights and inspiration for future studies. Code is available at: +https://github.com/CJR7/MultiAtt-RSSC + +
+
+
+
+
+ + ☆ WEM-GAN: Wavelet transform based facial expression manipulation + + +
+ Facial expression manipulation aims to change human facial expressions +without affecting face recognition. In order to transform the facial +expressions to target expressions, previous methods relied on expression labels +to guide the manipulation process. However, these methods failed to preserve +the details of facial features, which causes the weakening or the loss of +identity information in the output image. In our work, we propose WEM-GAN, in +short for wavelet-based expression manipulation GAN, which puts more efforts on +preserving the details of the original image in the editing process. Firstly, +we take advantage of the wavelet transform technique and combine it with our +generator with a U-net autoencoder backbone, in order to improve the +generator's ability to preserve more details of facial features. Secondly, we +also implement the high-frequency component discriminator, and use +high-frequency domain adversarial loss to further constrain the optimization of +our model, providing the generated face image with more abundant details. +Additionally, in order to narrow the gap between generated facial expressions +and target expressions, we use residual connections between encoder and +decoder, while also using relative action units (AUs) several times. Extensive +qualitative and quantitative experiments have demonstrated that our model +performs better in preserving identity features, editing capability, and image +generation quality on the AffectNet dataset. It also shows superior performance +in metrics such as Average Content Distance (ACD) and Expression Distance (ED). + +
+
+
+
+
+ + ☆ Towards Rich Emotions in 3D Avatars: A Text-to-3D Avatar Generation + Benchmark + + +
+ Producing emotionally dynamic 3D facial avatars with text derived from spoken +words (Emo3D) has been a pivotal research topic in 3D avatar generation. While +progress has been made in general-purpose 3D avatar generation, the exploration +of generating emotional 3D avatars remains scarce, primarily due to the +complexities of identifying and rendering rich emotions from spoken words. This +paper reexamines Emo3D generation and draws inspiration from human processes, +breaking down Emo3D into two cascading steps: Text-to-3D Expression Mapping +(T3DEM) and 3D Avatar Rendering (3DAR). T3DEM is the most crucial step in +determining the quality of Emo3D generation and encompasses three key +challenges: Expression Diversity, Emotion-Content Consistency, and Expression +Fluidity. To address these challenges, we introduce a novel benchmark to +advance research in Emo3D generation. First, we present EmoAva, a large-scale, +high-quality dataset for T3DEM, comprising 15,000 text-to-3D expression +mappings that characterize the aforementioned three challenges in Emo3D +generation. Furthermore, we develop various metrics to effectively evaluate +models against these identified challenges. Next, to effectively model the +consistency, diversity, and fluidity of human expressions in the T3DEM step, we +propose the Continuous Text-to-Expression Generator, which employs an +autoregressive Conditional Variational Autoencoder for expression code +generation, enhanced with Latent Temporal Attention and Expression-wise +Attention mechanisms. Finally, to further enhance the 3DAR step on rendering +higher-quality subtle expressions, we present the Globally-informed Gaussian +Avatar (GiGA) model. GiGA incorporates a global information mechanism into 3D +Gaussian representations, enabling the capture of subtle micro-expressions and +seamless transitions between emotional states. + +
+
+ comment: 18 pages, 14 figures. Project website: + https://github.com/WalkerMitty/EmoAva +
+
+
+
+
+ + ☆ ROVER: A Multi-Season Dataset for Visual SLAM + + +
+ Robust Simultaneous Localization and Mapping (SLAM) is a crucial enabler for +autonomous navigation in natural, unstructured environments such as parks and +gardens. However, these environments present unique challenges for SLAM due to +frequent seasonal changes, varying light conditions, and dense vegetation. +These factors often degrade the performance of visual SLAM algorithms +originally developed for structured urban environments. To address this gap, we +present ROVER, a comprehensive benchmark dataset tailored for evaluating visual +SLAM algorithms under diverse environmental conditions and spatial +configurations. We captured the dataset with a robotic platform equipped with +monocular, stereo, and RGB-D cameras, as well as inertial sensors. It covers 39 +recordings across five outdoor locations, collected through all seasons and +various lighting scenarios, i.e., day, dusk, and night with and without +external lighting. With this novel dataset, we evaluate several traditional and +deep learning-based SLAM methods and study their performance in diverse +challenging conditions. The results demonstrate that while stereo-inertial and +RGB-D configurations generally perform better under favorable lighting and +moderate vegetation, most SLAM systems perform poorly in low-light and +high-vegetation scenarios, particularly during summer and autumn. Our analysis +highlights the need for improved adaptability in visual SLAM algorithms for +outdoor applications, as current systems struggle with dynamic environmental +factors affecting scale, feature extraction, and trajectory consistency. This +dataset provides a solid foundation for advancing visual SLAM research in +real-world, natural environments, fostering the development of more resilient +SLAM systems for long-term outdoor localization and mapping. The dataset and +the code of the benchmark are available under +https://iis-esslingen.github.io/rover. + +
+
+ comment: 17 pages, 7 figures, 11 tables +
+
+
+
+
+ + ☆ RelayGS: Reconstructing Dynamic Scenes with Large-Scale and Complex + Motions via Relay Gaussians + + +
+ Reconstructing dynamic scenes with large-scale and complex motions remains a +significant challenge. Recent techniques like Neural Radiance Fields and 3D +Gaussian Splatting (3DGS) have shown promise but still struggle with scenes +involving substantial movement. This paper proposes RelayGS, a novel method +based on 3DGS, specifically designed to represent and reconstruct highly +dynamic scenes. Our RelayGS learns a complete 4D representation with canonical +3D Gaussians and a compact motion field, consisting of three stages. First, we +learn a fundamental 3DGS from all frames, ignoring temporal scene variations, +and use a learnable mask to separate the highly dynamic foreground from the +minimally moving background. Second, we replicate multiple copies of the +decoupled foreground Gaussians from the first stage, each corresponding to a +temporal segment, and optimize them using pseudo-views constructed from +multiple frames within each segment. These Gaussians, termed Relay Gaussians, +act as explicit relay nodes, simplifying and breaking down large-scale motion +trajectories into smaller, manageable segments. Finally, we jointly learn the +scene's temporal motion and refine the canonical Gaussians learned from the +first two stages. We conduct thorough experiments on two dynamic scene datasets +featuring large and complex motions, where our RelayGS outperforms +state-of-the-arts by more than 1 dB in PSNR, and successfully reconstructs +real-world basketball game scenes in a much more complete and coherent manner, +whereas previous methods usually struggle to capture the complex motion of +players. Code will be publicly available at https://github.com/gqk/RelayGS + +
+
+ comment: Technical Report. GitHub: https://github.com/gqk/RelayGS +
+
+
+
+
+ + ☆ OODFace: Benchmarking Robustness of Face Recognition under Common + Corruptions and Appearance Variations + + +
+ With the rise of deep learning, facial recognition technology has seen +extensive research and rapid development. Although facial recognition is +considered a mature technology, we find that existing open-source models and +commercial algorithms lack robustness in certain real-world Out-of-Distribution +(OOD) scenarios, raising concerns about the reliability of these systems. In +this paper, we introduce OODFace, which explores the OOD challenges faced by +facial recognition models from two perspectives: common corruptions and +appearance variations. We systematically design 30 OOD scenarios across 9 major +categories tailored for facial recognition. By simulating these challenges on +public datasets, we establish three robustness benchmarks: LFW-C/V, CFP-FP-C/V, +and YTF-C/V. We then conduct extensive experiments on 19 different facial +recognition models and 3 commercial APIs, along with extended experiments on +face masks, Vision-Language Models (VLMs), and defense strategies to assess +their robustness. Based on the results, we draw several key insights, +highlighting the vulnerability of facial recognition systems to OOD data and +suggesting possible solutions. Additionally, we offer a unified toolkit that +includes all corruption and variation types, easily extendable to other +datasets. We hope that our benchmarks and findings can provide guidance for +future improvements in facial recognition model robustness. + +
+
+
+
+
+ + ☆ BYE: Build Your Encoder with One Sequence of Exploration Data for + Long-Term Dynamic Scene Understanding + + +
+ Dynamic scene understanding remains a persistent challenge in robotic +applications. Early dynamic mapping methods focused on mitigating the negative +influence of short-term dynamic objects on camera motion estimation by masking +or tracking specific categories, which often fall short in adapting to +long-term scene changes. Recent efforts address object association in long-term +dynamic environments using neural networks trained on synthetic datasets, but +they still rely on predefined object shapes and categories. Other methods +incorporate visual, geometric, or semantic heuristics for the association but +often lack robustness. In this work, we introduce BYE, a class-agnostic, +per-scene point cloud encoder that removes the need for predefined categories, +shape priors, or extensive association datasets. Trained on only a single +sequence of exploration data, BYE can efficiently perform object association in +dynamically changing scenes. We further propose an ensembling scheme combining +the semantic strengths of Vision Language Models (VLMs) with the scene-specific +expertise of BYE, achieving a 7% improvement and a 95% success rate in object +association tasks. Code and dataset are available at +https://byencoder.github.io. + +
+
+
+
+
+ + ☆ Resonance: Learning to Predict Social-Aware Pedestrian Trajectories as + Co-Vibrations + + +
+ Learning to forecast the trajectories of intelligent agents like pedestrians +has caught more researchers' attention. Despite researchers' efforts, it +remains a challenge to accurately account for social interactions among agents +when forecasting, and in particular, to simulate such social modifications to +future trajectories in an explainable and decoupled way. Inspired by the +resonance phenomenon of vibration systems, we propose the Resonance (short for +Re) model to forecast pedestrian trajectories as co-vibrations, and regard that +social interactions are associated with spectral properties of agents' +trajectories. It forecasts future trajectories as three distinct vibration +terms to represent agents' future plans from different perspectives in a +decoupled way. Also, agents' social interactions and how they modify scheduled +trajectories will be considered in a resonance-like manner by learning the +similarities of their trajectory spectrums. Experiments on multiple datasets, +whether pedestrian or vehicle, have verified the usefulness of our method both +quantitatively and qualitatively. + +
+
+
+
+
+ + ☆ Multi-scale and Multi-path Cascaded Convolutional Network for Semantic + Segmentation of Colorectal Polyps + + +
+ Colorectal polyps are structural abnormalities of the gastrointestinal tract +that can potentially become cancerous in some cases. The study introduces a +novel framework for colorectal polyp segmentation named the Multi-Scale and +Multi-Path Cascaded Convolution Network (MMCC-Net), aimed at addressing the +limitations of existing models, such as inadequate spatial dependence +representation and the absence of multi-level feature integration during the +decoding stage by integrating multi-scale and multi-path cascaded convolutional +techniques and enhances feature aggregation through dual attention modules, +skip connections, and a feature enhancer. MMCC-Net achieves superior +performance in identifying polyp areas at the pixel level. The Proposed +MMCC-Net was tested across six public datasets and compared against eight SOTA +models to demonstrate its efficiency in polyp segmentation. The MMCC-Net's +performance shows Dice scores with confidence intervals ranging between (77.08, +77.56) and (94.19, 94.71) and Mean Intersection over Union (MIoU) scores with +confidence intervals ranging from (72.20, 73.00) to (89.69, 90.53) on the six +databases. These results highlight the model's potential as a powerful tool for +accurate and efficient polyp segmentation, contributing to early detection and +prevention strategies in colorectal cancer. + +
+
+
+
+
+ + ☆ TimeWalker: Personalized Neural Space for Lifelong Head Avatars + + +
+ We present TimeWalker, a novel framework that models realistic, full-scale 3D +head avatars of a person on lifelong scale. Unlike current human head avatar +pipelines that capture identity at the momentary level(e.g., instant +photography or short videos), TimeWalker constructs a person's comprehensive +identity from unstructured data collection over his/her various life stages, +offering a paradigm to achieve full reconstruction and animation of that person +at different moments of life. At the heart of TimeWalker's success is a novel +neural parametric model that learns personalized representation with the +disentanglement of shape, expression, and appearance across ages. Central to +our methodology are the concepts of two aspects: (1) We track back to the +principle of modeling a person's identity in an additive combination of average +head representation in the canonical space, and moment-specific head attribute +representations driven from a set of neural head basis. To learn the set of +head basis that could represent the comprehensive head variations in a compact +manner, we propose a Dynamic Neural Basis-Blending Module (Dynamo). It +dynamically adjusts the number and blend weights of neural head bases, +according to both shared and specific traits of the target person over ages. +(2) Dynamic 2D Gaussian Splatting (DNA-2DGS), an extension of Gaussian +splatting representation, to model head motion deformations like facial +expressions without losing the realism of rendering and reconstruction. +DNA-2DGS includes a set of controllable 2D oriented planar Gaussian disks that +utilize the priors from parametric model, and move/rotate with the change of +expression. Through extensive experimental evaluations, we show TimeWalker's +ability to reconstruct and animate avatars across decoupled dimensions with +realistic rendering effects, demonstrating a way to achieve personalized 'time +traveling' in a breeze. + +
+
+ comment: Project Page: https://timewalker2024.github.io/timewalker.github.io/ + , Video: https://www.youtube.com/watch?v=x8cpOVMY_ko +
+
+
+
+
+ + ☆ It Takes Two: Real-time Co-Speech Two-person's Interaction Generation + via Reactive Auto-regressive Diffusion Model + + +
+ Conversational scenarios are very common in real-world settings, yet existing +co-speech motion synthesis approaches often fall short in these contexts, where +one person's audio and gestures will influence the other's responses. +Additionally, most existing methods rely on offline sequence-to-sequence +frameworks, which are unsuitable for online applications. In this work, we +introduce an audio-driven, auto-regressive system designed to synthesize +dynamic movements for two characters during a conversation. At the core of our +approach is a diffusion-based full-body motion synthesis model, which is +conditioned on the past states of both characters, speech audio, and a +task-oriented motion trajectory input, allowing for flexible spatial control. +To enhance the model's ability to learn diverse interactions, we have enriched +existing two-person conversational motion datasets with more dynamic and +interactive motions. We evaluate our system through multiple experiments to +show it outperforms across a variety of tasks, including single and two-person +co-speech motion generation, as well as interactive motion generation. To the +best of our knowledge, this is the first system capable of generating +interactive full-body motions for two characters from speech in an online +manner. + +
+
+ comment: 15 pages, 10 figures +
+
+
+
+
+ + ☆ VISTA: A Panoramic View of Neural Representations + + +
+ We present VISTA (Visualization of Internal States and Their Associations), a +novel pipeline for visually exploring and interpreting neural network +representations. VISTA addresses the challenge of analyzing vast +multidimensional spaces in modern machine learning models by mapping +representations into a semantic 2D space. The resulting collages visually +reveal patterns and relationships within internal representations. We +demonstrate VISTA's utility by applying it to sparse autoencoder latents +uncovering new properties and interpretations. We review the VISTA methodology, +present findings from our case study ( https://got.drib.net/latents/ ), and +discuss implications for neural network interpretability across various domains +of machine learning. + +
+
+
+
+
+ + ☆ 3D Face Reconstruction From Radar Images + + +
+ The 3D reconstruction of faces gains wide attention in computer vision and is +used in many fields of application, for example, animation, virtual reality, +and even forensics. This work is motivated by monitoring patients in sleep +laboratories. Due to their unique characteristics, sensors from the radar +domain have advantages compared to optical sensors, namely penetration of +electrically non-conductive materials and independence of light. These +advantages of radar signals unlock new applications and require adaptation of +3D reconstruction frameworks. We propose a novel model-based method for 3D +reconstruction from radar images. We generate a dataset of synthetic radar +images with a physics-based but non-differentiable radar renderer. This dataset +is used to train a CNN-based encoder to estimate the parameters of a 3D +morphable face model. Whilst the encoder alone already leads to strong +reconstructions of synthetic data, we extend our reconstruction in an +Analysis-by-Synthesis fashion to a model-based autoencoder. This is enabled by +learning the rendering process in the decoder, which acts as an object-specific +differentiable radar renderer. Subsequently, the combination of both network +parts is trained to minimize both, the loss of the parameters and the loss of +the resulting reconstructed radar image. This leads to the additional benefit, +that at test time the parameters can be further optimized by finetuning the +autoencoder unsupervised on the image loss. We evaluated our framework on +generated synthetic face images as well as on real radar images with 3D ground +truth of four individuals. + +
+
+
+
+
+ + ☆ RG-SAN: Rule-Guided Spatial Awareness Network for End-to-End 3D + Referring Expression Segmentation NeurIPS 2024 + + +
+ 3D Referring Expression Segmentation (3D-RES) aims to segment 3D objects by +correlating referring expressions with point clouds. However, traditional +approaches frequently encounter issues like over-segmentation or +mis-segmentation, due to insufficient emphasis on spatial information of +instances. In this paper, we introduce a Rule-Guided Spatial Awareness Network +(RG-SAN) by utilizing solely the spatial information of the target instance for +supervision. This approach enables the network to accurately depict the spatial +relationships among all entities described in the text, thus enhancing the +reasoning capabilities. The RG-SAN consists of the Text-driven Localization +Module (TLM) and the Rule-guided Weak Supervision (RWS) strategy. The TLM +initially locates all mentioned instances and iteratively refines their +positional information. The RWS strategy, acknowledging that only target +objects have supervised positional information, employs dependency tree rules +to precisely guide the core instance's positioning. Extensive testing on the +ScanRefer benchmark has shown that RG-SAN not only establishes new performance +benchmarks, with an mIoU increase of 5.1 points, but also exhibits significant +improvements in robustness when processing descriptions with spatial ambiguity. +All codes are available at https://github.com/sosppxo/RG-SAN. + +
+
+ comment: Accepted by NeurIPS 2024 (Oral), Code: + https://github.com/sosppxo/RG-SAN +
+
+
+
+
+ + ☆ OMENN: One Matrix to Explain Neural Networks + + +
+ Deep Learning (DL) models are often black boxes, making their decision-making +processes difficult to interpret. This lack of transparency has driven +advancements in eXplainable Artificial Intelligence (XAI), a field dedicated to +clarifying the reasoning behind DL model predictions. Among these, +attribution-based methods such as LRP and GradCAM are widely used, though they +rely on approximations that can be imprecise. + To address these limitations, we introduce One Matrix to Explain Neural +Networks (OMENN), a novel post-hoc method that represents a neural network as a +single, interpretable matrix for each specific input. This matrix is +constructed through a series of linear transformations that represent the +processing of the input by each successive layer in the neural network. As a +result, OMENN provides locally precise, attribution-based explanations of the +input across various modern models, including ViTs and CNNs. We present a +theoretical analysis of OMENN based on dynamic linearity property and validate +its effectiveness with extensive tests on two XAI benchmarks, demonstrating +that OMENN is competitive with state-of-the-art methods. + +
+
+ comment: Under review, code will be released after acceptance +
+
+
+
+
+ + ☆ Who Walks With You Matters: Perceiving Social Interactions with Groups + for Pedestrian Trajectory Prediction CVPR 2025 + + +
+ Understanding and anticipating human movement has become more critical and
+challenging in diverse applications such as autonomous driving and
+surveillance. The complex interactions brought by different relations between
+agents are a crucial reason that poses challenges to this task. Researchers
+have put much effort into designing a system using rule-based or data-based
+models to extract and validate the patterns between pedestrian trajectories and
+these interactions, which has not been adequately addressed yet. Inspired by
+how humans perceive social interactions with different levels of relations to
+themselves, this work proposes the GrouP ConCeption (short for GPCC) model
+composed of the Group method, which categorizes nearby agents into either group
+members or non-group members based on a long-term distance kernel function, and
+the Conception module, which perceives both visual and acoustic information
+surrounding the target agent. Evaluated across multiple datasets, the GPCC
+model demonstrates significant improvements in trajectory prediction accuracy,
+validating its effectiveness in modeling both social and individual dynamics.
+The qualitative analysis also indicates that the GPCC framework successfully
+leverages grouping and perception cues in a human-like, intuitive manner to
+validate the proposed model's explainability in pedestrian trajectory
+forecasting.
+
+
+ comment: 15 pages, 10 figures, submitted to CVPR 2025 +
+
+
+
+
+ + ☆ Bio-inspired visual relative localization for large swarms of UAVs + + +
+ We propose a new approach to visual perception for relative localization of +agents within large-scale swarms of UAVs. Inspired by biological perception +utilized by schools of sardines, swarms of bees, and other large groups of +animals capable of moving in a decentralized yet coherent manner, our method +does not rely on detecting individual neighbors by each agent and estimating +their relative position, but rather we propose to regress a neighbor density +over distance. This allows for a more accurate distance estimation as well as +better scalability with respect to the number of neighbors. Additionally, a +novel swarm control algorithm is proposed to make it compatible with the new +relative localization method. We provide a thorough evaluation of the presented +methods and demonstrate that the regressing approach to distance estimation is +more robust to varying relative pose of the targets and that it is suitable to +be used as the main source of relative localization for swarm stabilization. + +
+
+
+
+
+ + ☆ Single-Shot Metric Depth from Focused Plenoptic Cameras ICRA 2025 + + +
+ Metric depth estimation from visual sensors is crucial for robots to +perceive, navigate, and interact with their environment. Traditional range +imaging setups, such as stereo or structured light cameras, face hassles +including calibration, occlusions, and hardware demands, with accuracy limited +by the baseline between cameras. Single- and multi-view monocular depth offers +a more compact alternative, but is constrained by the unobservability of the +metric scale. Light field imaging provides a promising solution for estimating +metric depth by using a unique lens configuration through a single device. +However, its application to single-view dense metric depth is under-addressed +mainly due to the technology's high cost, the lack of public benchmarks, and +proprietary geometrical models and software. + Our work explores the potential of focused plenoptic cameras for dense metric +depth. We propose a novel pipeline that predicts metric depth from a single +plenoptic camera shot by first generating a sparse metric point cloud using +machine learning, which is then used to scale and align a dense relative depth +map regressed by a foundation depth model, resulting in dense metric depth. To +validate it, we curated the Light Field & Stereo Image Dataset (LFS) of +real-world light field images with stereo depth labels, filling a current gap +in existing resources. Experimental results show that our pipeline produces +accurate metric depth predictions, laying a solid groundwork for future +research in this field. + +
+
+ comment: 8 pages (6 for text + 2 for references), 6 figures, 2 tables. + Submitted to IEEE ICRA 2025 +
+
+
+
+
+ + ☆ Active Negative Loss: A Robust Framework for Learning with Noisy Labels + + +
+ Deep supervised learning has achieved remarkable success across a wide range +of tasks, yet it remains susceptible to overfitting when confronted with noisy +labels. To address this issue, noise-robust loss functions offer an effective +solution for enhancing learning in the presence of label noise. In this work, +we systematically investigate the limitation of the recently proposed Active +Passive Loss (APL), which employs Mean Absolute Error (MAE) as its passive loss +function. Despite the robustness brought by MAE, one of its key drawbacks is +that it pays equal attention to clean and noisy samples; this feature slows +down convergence and potentially makes training difficult, particularly in +large-scale datasets. To overcome these challenges, we introduce a novel loss +function class, termed Normalized Negative Loss Functions (NNLFs), which serve +as passive loss functions within the APL framework. NNLFs effectively address +the limitations of MAE by concentrating more on memorized clean samples. By +replacing MAE in APL with our proposed NNLFs, we enhance APL and present a new +framework called Active Negative Loss (ANL). Moreover, in non-symmetric noise +scenarios, we propose an entropy-based regularization technique to mitigate the +vulnerability to the label imbalance. Extensive experiments demonstrate that +the new loss functions adopted by our ANL framework can achieve better or +comparable performance to state-of-the-art methods across various label noise +types and in image segmentation tasks. The source code is available at: +https://github.com/Virusdoll/Active-Negative-Loss. + +
+
+ comment: This work has been submitted to the IEEE for possible publication +
+
+
+
+
+ + ☆ Trajectory-based Road Autolabeling with Lidar-Camera Fusion in Winter + Conditions + + +
+ Robust road segmentation in all road conditions is required for safe +autonomous driving and advanced driver assistance systems. Supervised deep +learning methods provide accurate road segmentation in the domain of their +training data but cannot be trusted in out-of-distribution scenarios. Including +the whole distribution in the trainset is challenging as each sample must be +labeled by hand. Trajectory-based self-supervised methods offer a potential +solution as they can learn from the traversed route without manual labels. +However, existing trajectory-based methods use learning schemes that rely only +on the camera or only on the lidar. In this paper, trajectory-based learning is +implemented jointly with lidar and camera for increased performance. Our method +outperforms recent standalone camera- and lidar-based methods when evaluated +with a challenging winter driving dataset including countryside and suburb +driving scenes. The source code is available at +https://github.com/eerik98/lidar-camera-road-autolabeling.git + +
+
+
+
+
+ + ☆ ScImage: How Good Are Multimodal Large Language Models at Scientific + Text-to-Image Generation? + + +
+ Multimodal large language models (LLMs) have demonstrated impressive +capabilities in generating high-quality images from textual instructions. +However, their performance in generating scientific images--a critical +application for accelerating scientific progress--remains underexplored. In +this work, we address this gap by introducing ScImage, a benchmark designed to +evaluate the multimodal capabilities of LLMs in generating scientific images +from textual descriptions. ScImage assesses three key dimensions of +understanding: spatial, numeric, and attribute comprehension, as well as their +combinations, focusing on the relationships between scientific objects (e.g., +squares, circles). We evaluate five models, GPT-4o, Llama, AutomaTikZ, Dall-E, +and StableDiffusion, using two modes of output generation: code-based outputs +(Python, TikZ) and direct raster image generation. Additionally, we examine +four different input languages: English, German, Farsi, and Chinese. Our +evaluation, conducted with 11 scientists across three criteria (correctness, +relevance, and scientific accuracy), reveals that while GPT-4o produces outputs +of decent quality for simpler prompts involving individual dimensions such as +spatial, numeric, or attribute understanding in isolation, all models face +challenges in this task, especially for more complex prompts. + +
+
+
+
+
+ + ☆ GenMix: Effective Data Augmentation with Generative Diffusion Model + Image Editing + + +
+ Data augmentation is widely used to enhance generalization in visual +classification tasks. However, traditional methods struggle when source and +target domains differ, as in domain adaptation, due to their inability to +address domain gaps. This paper introduces GenMix, a generalizable +prompt-guided generative data augmentation approach that enhances both +in-domain and cross-domain image classification. Our technique leverages image +editing to generate augmented images based on custom conditional prompts, +designed specifically for each problem type. By blending portions of the input +image with its edited generative counterpart and incorporating fractal +patterns, our approach mitigates unrealistic images and label ambiguity, +improving the performance and adversarial robustness of the resulting models. +Efficacy of our method is established with extensive experiments on eight +public datasets for general and fine-grained classification, in both in-domain +and cross-domain settings. Additionally, we demonstrate performance +improvements for self-supervised learning, learning with data scarcity, and +adversarial robustness. As compared to the existing state-of-the-art methods, +our technique achieves stronger performance across the board. + +
+
+ comment: https://diffusemix.github.io/ +
+
+
+
+
+ + ☆ Realistic Surgical Simulation from Monocular Videos + + +
+ This paper tackles the challenge of automatically performing realistic +surgical simulations from readily available surgical videos. Recent efforts +have successfully integrated physically grounded dynamics within 3D Gaussians +to perform high-fidelity simulations in well-reconstructed simulation +environments from static scenes. However, they struggle with the geometric +inconsistency in reconstructing simulation environments and unrealistic +physical deformations in simulations of soft tissues when it comes to dynamic +and complex surgical processes. In this paper, we propose SurgiSim, a novel +automatic simulation system to overcome these limitations. To build a surgical +simulation environment, we maintain a canonical 3D scene composed of 3D +Gaussians coupled with a deformation field to represent a dynamic surgical +scene. This process involves a multi-stage optimization with trajectory and +anisotropic regularization, enhancing the geometry consistency of the canonical +scene, which serves as the simulation environment. To achieve realistic +physical simulations in this environment, we implement a Visco-Elastic +deformation model based on the Maxwell model, effectively restoring the complex +deformations of tissues. Additionally, we infer the physical parameters of +tissues by minimizing the discrepancies between the input video and simulation +results guided by estimated tissue motion, ensuring realistic simulation +outcomes. Experiments on various surgical scenarios and interactions +demonstrate SurgiSim's ability to perform realistic simulation of soft tissues +among surgical procedures, showing its enormous potential for enhancing +surgical training, planning, and robotic surgery systems. The project page is +at https://namaenashibot.github.io/SurgiSim/. + +
+
+
+
+
+ + ☆ Dual Exposure Stereo for Extended Dynamic Range 3D Imaging + + +
+ Achieving robust stereo 3D imaging under diverse illumination conditions is +an important however challenging task, due to the limited dynamic ranges (DRs) +of cameras, which are significantly smaller than real world DR. As a result, +the accuracy of existing stereo depth estimation methods is often compromised +by under- or over-exposed images. Here, we introduce dual-exposure stereo for +extended dynamic range 3D imaging. We develop automatic dual-exposure control +method that adjusts the dual exposures, diverging them when the scene DR +exceeds the camera DR, thereby providing information about broader DR. From the +captured dual-exposure stereo images, we estimate depth using motion-aware +dual-exposure stereo network. To validate our method, we develop a robot-vision +system, collect stereo video datasets, and generate a synthetic dataset. Our +method outperforms other exposure control methods. + +
+
+
+
+
+ + ☆ UniForm: A Reuse Attention Mechanism Optimized for Efficient Vision + Transformers on Edge Devices + + +
+ Transformer-based architectures have demonstrated remarkable success across +various domains, but their deployment on edge devices remains challenging due +to high memory and computational demands. In this paper, we introduce a novel +Reuse Attention mechanism, tailored for efficient memory access and +computational optimization, enabling seamless operation on resource-constrained +platforms without compromising performance. Unlike traditional multi-head +attention (MHA), which redundantly computes separate attention matrices for +each head, Reuse Attention consolidates these computations into a shared +attention matrix, significantly reducing memory overhead and computational +complexity. Comprehensive experiments on ImageNet-1K and downstream tasks show +that the proposed UniForm models leveraging Reuse Attention achieve +state-of-the-art imagenet classification accuracy while outperforming existing +attention mechanisms, such as Linear Attention and Flash Attention, in +inference speed and memory scalability. Notably, UniForm-l achieves a 76.7% +Top-1 accuracy on ImageNet-1K with 21.8ms inference time on edge devices like +the Jetson AGX Orin, representing up to a 5x speedup over competing benchmark +methods. These results demonstrate the versatility of Reuse Attention across +high-performance GPUs and edge platforms, paving the way for broader real-time +applications + +
+
+ comment: 13 Pages, 8 Tables, 7 Figures +
+
+
+
+
+ + ☆ Amodal Depth Anything: Amodal Depth Estimation in the Wild + + +
+ Amodal depth estimation aims to predict the depth of occluded (invisible) +parts of objects in a scene. This task addresses the question of whether models +can effectively perceive the geometry of occluded regions based on visible +cues. Prior methods primarily rely on synthetic datasets and focus on metric +depth estimation, limiting their generalization to real-world settings due to +domain shifts and scalability challenges. In this paper, we propose a novel +formulation of amodal depth estimation in the wild, focusing on relative depth +prediction to improve model generalization across diverse natural images. We +introduce a new large-scale dataset, Amodal Depth In the Wild (ADIW), created +using a scalable pipeline that leverages segmentation datasets and compositing +techniques. Depth maps are generated using large pre-trained depth models, and +a scale-and-shift alignment strategy is employed to refine and blend depth +predictions, ensuring consistency in ground-truth annotations. To tackle the +amodal depth task, we present two complementary frameworks: Amodal-DAV2, a +deterministic model based on Depth Anything V2, and Amodal-DepthFM, a +generative model that integrates conditional flow matching principles. Our +proposed frameworks effectively leverage the capabilities of large pre-trained +models with minimal modifications to achieve high-quality amodal depth +predictions. Experiments validate our design choices, demonstrating the +flexibility of our models in generating diverse, plausible depth structures for +occluded regions. Our method achieves a 69.5% improvement in accuracy over the +previous SoTA on the ADIW dataset. + +
+
+
+
+
+ + ☆ SimuScope: Realistic Endoscopic Synthetic Dataset Generation through + Surgical Simulation and Diffusion Models WACV + + +
+ Computer-assisted surgical (CAS) systems enhance surgical execution and +outcomes by providing advanced support to surgeons. These systems often rely on +deep learning models trained on complex, challenging-to-annotate data. While +synthetic data generation can address these challenges, enhancing the realism +of such data is crucial. This work introduces a multi-stage pipeline for +generating realistic synthetic data, featuring a fully-fledged surgical +simulator that automatically produces all necessary annotations for modern CAS +systems. This simulator generates a wide set of annotations that surpass those +available in public synthetic datasets. Additionally, it offers a more complex +and realistic simulation of surgical interactions, including the dynamics +between surgical instruments and deformable anatomical environments, +outperforming existing approaches. To further bridge the visual gap between +synthetic and real data, we propose a lightweight and flexible image-to-image +translation method based on Stable Diffusion (SD) and Low-Rank Adaptation +(LoRA). This method leverages a limited amount of annotated data, enables +efficient training, and maintains the integrity of annotations generated by our +simulator. The proposed pipeline is experimentally validated and can translate +synthetic images into images with real-world characteristics, which can +generalize to real-world context, thereby improving both training and CAS +guidance. The code and the dataset are available at +https://github.com/SanoScience/SimuScope. + +
+
+ comment: Accepted to IEEE/CVF Winter Conference on Applications of Computer + Vision (WACV) 2025 +
+
+
+
+
+ + ☆ Controlling the Latent Diffusion Model for Generative Image Shadow + Removal via Residual Generation + + +
+ Large-scale generative models have achieved remarkable advancements in
+various visual tasks, yet their application to shadow removal in images remains
+challenging. These models often generate diverse, realistic details without
+adequate focus on fidelity, failing to meet the crucial requirements of shadow
+removal, which necessitates precise preservation of image content. In contrast
+to prior approaches that aimed to regenerate shadow-free images from scratch,
+this paper utilizes diffusion models to generate and refine image residuals.
+This strategy fully uses the inherent detailed information within shadowed
+images, resulting in a more efficient and faithful reconstruction of
+shadow-free content. Additionally, to prevent the accumulation of errors during
+the generation process, a cross-timestep self-enhancement training strategy is
+proposed. This strategy leverages the network itself to augment the training
+data, not only increasing the volume of data but also enabling the network to
+dynamically correct its generation trajectory, ensuring a more accurate and
+robust output. In addition, to address the loss of original details in the
+process of image encoding and decoding of large generative models, a
+content-preserved encoder-decoder structure is designed with a control
+mechanism and multi-scale skip connections to achieve high-fidelity shadow-free
+image reconstruction. Experimental results demonstrate that the proposed method
+can reproduce high-quality results based on a large latent diffusion prior and
+faithfully preserve the original contents in shadow regions.
+
+
+ comment: 13pages, 10 figures +
+
+
+
+
+ + ☆ HumanRig: Learning Automatic Rigging for Humanoid Character in a Large + Scale Dataset + + +
+ With the rapid evolution of 3D generation algorithms, the cost of producing +3D humanoid character models has plummeted, yet the field is impeded by the +lack of a comprehensive dataset for automatic rigging, which is a pivotal step +in character animation. Addressing this gap, we present HumanRig, the first +large-scale dataset specifically designed for 3D humanoid character rigging, +encompassing 11,434 meticulously curated T-posed meshes adhered to a uniform +skeleton topology. Capitalizing on this dataset, we introduce an innovative, +data-driven automatic rigging framework, which overcomes the limitations of +GNN-based methods in handling complex AI-generated meshes. Our approach +integrates a Prior-Guided Skeleton Estimator (PGSE) module, which uses 2D +skeleton joints to provide a preliminary 3D skeleton, and a Mesh-Skeleton +Mutual Attention Network (MSMAN) that fuses skeleton features with 3D mesh +features extracted by a U-shaped point transformer. This enables a +coarse-to-fine 3D skeleton joint regression and a robust skinning estimation, +surpassing previous methods in quality and versatility. This work not only +remedies the dataset deficiency in rigging research but also propels the +animation industry towards more efficient and automated character rigging +pipelines. + +
+
+ comment: Website: https://github.com/c8241998/HumanRig +
+
+
+
+
+ + ☆ LoCo: Low-Contrast-Enhanced Contrastive Learning for Semi-Supervised + Endoscopic Image Segmentation + + +
+ The segmentation of endoscopic images plays a vital role in computer-aided +diagnosis and treatment. The advancements in deep learning have led to the +employment of numerous models for endoscopic tumor segmentation, achieving +promising segmentation performance. Despite recent advancements, precise +segmentation remains challenging due to limited annotations and the issue of +low contrast. To address these issues, we propose a novel semi-supervised +segmentation framework termed LoCo via low-contrast-enhanced contrastive +learning (LCC). This innovative approach effectively harnesses the vast amounts +of unlabeled data available for endoscopic image segmentation, improving both +accuracy and robustness in the segmentation process. Specifically, LCC +incorporates two advanced strategies to enhance the distinctiveness of +low-contrast pixels: inter-class contrast enhancement (ICE) and boundary +contrast enhancement (BCE), enabling models to segment low-contrast pixels +among malignant tumors, benign tumors, and normal tissues. Additionally, a +confidence-based dynamic filter (CDF) is designed for pseudo-label selection, +enhancing the utilization of generated pseudo-labels for unlabeled data with a +specific focus on minority classes. Extensive experiments conducted on two +public datasets, as well as a large proprietary dataset collected over three +years, demonstrate that LoCo achieves state-of-the-art results, significantly +outperforming previous methods. The source code of LoCo is available at the URL +of https://github.com/AnoK3111/LoCo. + +
+
+
+
+
+ + ☆ Noisy Ostracods: A Fine-Grained, Imbalanced Real-World Dataset for + Benchmarking Robust Machine Learning and Label Correction Methods + + +
+ We present the Noisy Ostracods, a noisy dataset for genus and species
+classification of crustacean ostracods with specialists' annotations. Over the
+71466 specimens collected, 5.58% of them are estimated to be noisy (possibly
+problematic) at genus level. The dataset is created to address a real-world
+challenge: creating a clean fine-grained taxonomy dataset. The Noisy Ostracods
+dataset has diverse noises from multiple sources. Firstly, the noise is
+open-set, including new classes discovered during curation that were not part
+of the original annotation. The dataset has pseudo-classes, where annotators
+misclassified samples that should belong to an existing class into a new
+pseudo-class. The Noisy Ostracods dataset is highly imbalanced with an imbalance
+factor $\rho$ = 22429. This presents a unique challenge for robust machine
+learning methods, as existing approaches have not been extensively evaluated on
+fine-grained classification tasks with such diverse real-world noise. Initial
+experiments using current robust learning techniques have not yielded
+significant performance improvements on the Noisy Ostracods dataset compared to
+cross-entropy training on the raw, noisy data. On the other hand, noise
+detection methods have underperformed in error hit rate compared to naive
+cross-validation ensembling for identifying problematic labels. These findings
+suggest that the fine-grained, imbalanced nature, and complex noise
+characteristics of the dataset present considerable challenges for existing
+noise-robust algorithms. By openly releasing the Noisy Ostracods dataset, our
+goal is to encourage further research into the development of noise-resilient
+machine learning methods capable of effectively handling diverse, real-world
+noise in fine-grained classification tasks. The dataset, along with its
+evaluation protocols, can be accessed at
+https://github.com/H-Jamieu/Noisy_ostracods.
+
+
+ comment: Initial submit +
+
+
+
+
+ + ☆ Active Learning via Classifier Impact and Greedy Selection for + Interactive Image Retrieval + + +
+ Active Learning (AL) is a user-interactive approach aimed at reducing +annotation costs by selecting the most crucial examples to label. Although AL +has been extensively studied for image classification tasks, the specific +scenario of interactive image retrieval has received relatively little +attention. This scenario presents unique characteristics, including an open-set +and class-imbalanced binary classification, starting with very few labeled +samples. We introduce a novel batch-mode Active Learning framework named GAL +(Greedy Active Learning) that better copes with this application. It +incorporates a new acquisition function for sample selection that measures the +impact of each unlabeled sample on the classifier. We further embed this +strategy in a greedy selection approach, better exploiting the samples within +each batch. We evaluate our framework with both linear (SVM) and non-linear +MLP/Gaussian Process classifiers. For the Gaussian Process case, we show a +theoretical guarantee on the greedy approximation. Finally, we assess our +performance for the interactive content-based image retrieval task on several +benchmarks and demonstrate its superiority over existing approaches and common +baselines. Code is available at https://github.com/barleah/GreedyAL. + +
+
+ comment: Accepted to Transactions on Machine Learning Research (TMLR) +
+
+
+
+
+ + ☆ Partial Non-rigid Deformations and interpolations of Human Body Surfaces + + +
+ Non-rigid shape deformations pose significant challenges, and most existing +methods struggle to handle partial deformations effectively. We present Partial +Non-rigid Deformations and interpolations of the human body Surfaces (PaNDAS), +a new method to learn local and global deformations of 3D surface meshes by +building on recent deep models. Unlike previous approaches, our method enables +restricting deformations to specific parts of the shape in a versatile way and +allows for mixing and combining various poses from the database, all while not +requiring any optimization at inference time. We demonstrate that the proposed +framework can be used to generate new shapes, interpolate between parts of +shapes, and perform other shape manipulation tasks with state-of-the-art +accuracy and greater locality across various types of human surface data. Code +and data will be made available soon. + +
+
+
+
+
+ + ☆ Initial Study On Improving Segmentation By Combining Preoperative CT And + Intraoperative CBCT Using Synthetic Data + + +
+ Computer-Assisted Interventions enable clinicians to perform precise, +minimally invasive procedures, often relying on advanced imaging methods. +Cone-beam computed tomography (CBCT) can be used to facilitate +computer-assisted interventions, despite often suffering from artifacts that +pose challenges for accurate interpretation. While the degraded image quality +can affect image analysis, the availability of high quality, preoperative scans +offers potential for improvements. Here we consider a setting where +preoperative CT and intraoperative CBCT scans are available, however, the +alignment (registration) between the scans is imperfect to simulate a real +world scenario. We propose a multimodal learning method that fuses roughly +aligned CBCT and CT scans and investigate the effect on segmentation +performance. For this experiment we use synthetically generated data containing +real CT and synthetic CBCT volumes with corresponding voxel annotations. We +show that this fusion setup improves segmentation performance in $18$ out of +$20$ investigated setups. + +
+
+ comment: Accepted at BVM 2025. arXiv admin note: text overlap with + arXiv:2406.11650 +
+
+
+
+
+ + Viewpoint Consistency in 3D Generation via Attention and CLIP Guidance + + +
+ Despite recent advances in text-to-3D generation techniques, current methods +often suffer from geometric inconsistencies, commonly referred to as the Janus +Problem. This paper identifies the root cause of the Janus Problem: viewpoint +generation bias in diffusion models, which creates a significant gap between +the actual generated viewpoint and the expected one required for optimizing the +3D model. To address this issue, we propose a tuning-free approach called the +Attention and CLIP Guidance (ACG) mechanism. ACG enhances desired viewpoints by +adaptively controlling cross-attention maps, employs CLIP-based view-text +similarities to filter out erroneous viewpoints, and uses a coarse-to-fine +optimization strategy with staged prompts to progressively refine 3D +generation. Extensive experiments demonstrate that our method significantly +reduces the Janus Problem without compromising generation speed, establishing +ACG as an efficient, plug-and-play component for existing text-to-3D +frameworks. + +
+
+
+
+
+ + ☆ AH-OCDA: Amplitude-based Curriculum Learning and Hopfield Segmentation + Model for Open Compound Domain Adaptation WACV 2025 + + +
+ Open compound domain adaptation (OCDA) is a practical domain adaptation +problem that consists of a source domain, target compound domain, and unseen +open domain. In this problem, the absence of domain labels and pixel-level +segmentation labels for both compound and open domains poses challenges to the +direct application of existing domain adaptation and generalization methods. To +address this issue, we propose Amplitude-based curriculum learning and a +Hopfield segmentation model for Open Compound Domain Adaptation (AH-OCDA). Our +method comprises two complementary components: 1) amplitude-based curriculum +learning and 2) Hopfield segmentation model. Without prior knowledge of target +domains within the compound domains, amplitude-based curriculum learning +gradually induces the semantic segmentation model to adapt from the near-source +compound domain to the far-source compound domain by ranking unlabeled compound +domain images through Fast Fourier Transform (FFT). Additionally, the Hopfield +segmentation model maps segmentation feature distributions from arbitrary +domains to the feature distributions of the source domain. AH-OCDA achieves +state-of-the-art performance on two OCDA benchmarks and extended open domains, +demonstrating its adaptability to continuously changing compound domains and +unseen open domains. + +
+
+ comment: WACV 2025 +
+
+
+
+
+ + ☆ PCIM: Learning Pixel Attributions via Pixel-wise Channel Isolation + Mixing in High Content Imaging + + +
+ Deep Neural Networks (DNNs) have shown remarkable success in various computer +vision tasks. However, their black-box nature often leads to difficulty in +interpreting their decisions, creating an unfilled need for methods to explain +the decisions, and ultimately forming a barrier to their wide acceptance +especially in biomedical applications. This work introduces a novel method, +Pixel-wise Channel Isolation Mixing (PCIM), to calculate pixel attribution +maps, highlighting the image parts most crucial for a classification decision +but without the need to extract internal network states or gradients. Unlike +existing methods, PCIM treats each pixel as a distinct input channel and trains +a blending layer to mix these pixels, reflecting specific classifications. This +unique approach allows the generation of pixel attribution maps for each image, +but agnostic to the choice of the underlying classification network. Benchmark +testing on three application relevant, diverse high content Imaging datasets +show state-of-the-art performance, particularly for model fidelity and +localization ability in both, fluorescence and bright field High Content +Imaging. PCIM contributes as a unique and effective method for creating +pixel-level attribution maps from arbitrary DNNs, enabling interpretability and +trust. + +
+
+
+
+
+ + ☆ Sustainable Self-evolution Adversarial Training + + +
+ With the wide application of deep neural network models in various computer +vision tasks, there has been a proliferation of adversarial example generation +strategies aimed at deeply exploring model security. However, existing +adversarial training defense models, which rely on single or limited types of +attacks under a one-time learning process, struggle to adapt to the dynamic and +evolving nature of attack methods. Therefore, to achieve defense performance +improvements for models in long-term applications, we propose a novel +Sustainable Self-Evolution Adversarial Training (SSEAT) framework. +Specifically, we introduce a continual adversarial defense pipeline to realize +learning from various kinds of adversarial examples across multiple stages. +Additionally, to address the issue of model catastrophic forgetting caused by +continual learning from ongoing novel attacks, we propose an adversarial data +replay module to better select more diverse and key relearning data. +Furthermore, we design a consistency regularization strategy to encourage +current defense models to learn more from previously trained ones, guiding them +to retain more past knowledge and maintain accuracy on clean samples. Extensive +experiments have been conducted to verify the efficacy of the proposed SSEAT +defense method, which demonstrates superior defense performance and +classification accuracy compared to competitors. + +
+
+ comment: Accepted to ACMMM 2024 +
+
+
+
+
+ + ☆ GSGTrack: Gaussian Splatting-Guided Object Pose Tracking from RGB Videos + + +
+ Tracking the 6DoF pose of unknown objects in monocular RGB video sequences is +crucial for robotic manipulation. However, existing approaches typically rely +on accurate depth information, which is non-trivial to obtain in real-world +scenarios. Although depth estimation algorithms can be employed, geometric +inaccuracy can lead to failures in RGBD-based pose tracking methods. To address +this challenge, we introduce GSGTrack, a novel RGB-based pose tracking +framework that jointly optimizes geometry and pose. Specifically, we adopt 3D +Gaussian Splatting to create an optimizable 3D representation, which is learned +simultaneously with a graph-based geometry optimization to capture the object's +appearance features and refine its geometry. However, the joint optimization +process is susceptible to perturbations from noisy pose and geometry data. +Thus, we propose an object silhouette loss to address the issue of pixel-wise +loss being overly sensitive to pose noise during tracking. To mitigate the +geometric ambiguities caused by inaccurate depth information, we propose a +geometry-consistent image pair selection strategy, which filters out +low-confidence pairs and ensures robust geometric optimization. Extensive +experiments on the OnePose and HO3D datasets demonstrate the effectiveness of +GSGTrack in both 6DoF pose tracking and object reconstruction. + +
+
+
+
+
+ + ☆ Diabetic Retinopathy Classification from Retinal Images using Machine + Learning Approaches + + +
+ Diabetic Retinopathy is one of the most familiar diseases and is a diabetes +complication that affects eyes. Initially, diabetic retinopathy may cause no +symptoms or only mild vision problems. Eventually, it can cause blindness. So +early detection of symptoms could help to avoid blindness. In this paper, we +present some experiments on some features of diabetic retinopathy, like +properties of exudates, properties of blood vessels and properties of +microaneurysm. Using the features, we can classify healthy, mild +non-proliferative, moderate non-proliferative, severe non-proliferative and +proliferative stages of DR. Support Vector Machine, Random Forest and Naive +Bayes classifiers are used to classify the stages. Finally, Random Forest is +found to be the best for higher accuracy, sensitivity and specificity of 76.5%, +77.2% and 93.3% respectively. + +
+
+ comment: 5 pages, 9 figures, 2 tables. International Conference on Advanced + Engineering, Technology and Applications (ICAETA-2021), Istanbul, Turkey +
+
+
+
+
+ + ☆ Composing Open-domain Vision with RAG for Ocean Monitoring and + Conservation NeurIPS 2024 + + +
+ Climate change's destruction of marine biodiversity is threatening +communities and economies around the world which rely on healthy oceans for +their livelihoods. The challenge of applying computer vision to niche, +real-world domains such as ocean conservation lies in the dynamic and diverse +environments where traditional top-down learning struggle with long-tailed +distributions, generalization, and domain transfer. Scalable species +identification for ocean monitoring is particularly difficult due to the need +to adapt models to new environments and identify rare or unseen species. To +overcome these limitations, we propose leveraging bottom-up, open-domain +learning frameworks as a resilient, scalable solution for image and video +analysis in marine applications. Our preliminary demonstration uses pretrained +vision-language models (VLMs) combined with retrieval-augmented generation +(RAG) as grounding, leaving the door open for numerous architectural, training +and engineering optimizations. We validate this approach through a preliminary +application in classifying fish from video onboard fishing vessels, +demonstrating impressive emergent retrieval and prediction capabilities without +domain-specific training or knowledge of the task itself. + +
+
+ comment: Accepted to Climate Change AI Workshop at NeurIPS 2024. 9 pages, 6 + figures, 1 table +
+
+
+
+
+ + ☆ Diffusion Implicit Policy for Unpaired Scene-aware Motion Synthesis + + +
+ Human motion generation is a long-standing problem, and scene-aware motion
+synthesis has been widely researched recently due to its numerous applications.
+Prevailing methods rely heavily on paired motion-scene data whose quantity is
+limited. Meanwhile, it is difficult to generalize to diverse scenes when
+trained only on a few specific ones. Thus, we propose a unified framework,
+termed Diffusion Implicit Policy (DIP), for scene-aware motion synthesis, where
+paired motion-scene data are no longer necessary. In this framework, we
+disentangle human-scene interaction from motion synthesis during training and
+then introduce an interaction-based implicit policy into motion diffusion
+during inference. Synthesized motion can be derived through iterative diffusion
+denoising and implicit policy optimization, thus motion naturalness and
+interaction plausibility can be maintained simultaneously. The proposed
+implicit policy optimizes the intermediate noised motion in a GAN Inversion
+manner to maintain motion continuity and control keyframe poses through the
+ControlNet branch and motion inpainting. For long-term motion synthesis, we
+introduce motion blending for stable transitions between multiple sub-tasks,
+where motions are fused in rotation power space and translation linear space.
+The proposed method is evaluated on synthesized scenes with ShapeNet furniture,
+and real scenes from PROX and Replica. Results show that our framework presents
+better motion naturalness and interaction plausibility than cutting-edge
+methods. This also indicates the feasibility of utilizing the DIP for motion
+synthesis in more general tasks and versatile scenes.
+https://jingyugong.github.io/DiffusionImplicitPolicy/
+
+
+
+
+
+ + ☆ VideoGen-of-Thought: A Collaborative Framework for Multi-Shot Video + Generation + + +
+ Current video generation models excel at generating short clips but still +struggle with creating multi-shot, movie-like videos. Existing models trained +on large-scale data on the back of rich computational resources are +unsurprisingly inadequate for maintaining a logical storyline and visual +consistency across multiple shots of a cohesive script since they are often +trained with a single-shot objective. To this end, we propose +VideoGen-of-Thought (VGoT), a collaborative and training-free architecture +designed specifically for multi-shot video generation. VGoT is designed with +three goals in mind as follows. Multi-Shot Video Generation: We divide the +video generation process into a structured, modular sequence, including (1) +Script Generation, which translates a curt story into detailed prompts for each +shot; (2) Keyframe Generation, responsible for creating visually consistent +keyframes faithful to character portrayals; and (3) Shot-Level Video +Generation, which transforms information from scripts and keyframes into shots; +(4) Smoothing Mechanism that ensures a consistent multi-shot output. Reasonable +Narrative Design: Inspired by cinematic scriptwriting, our prompt generation +approach spans five key domains, ensuring logical consistency, character +development, and narrative flow across the entire video. Cross-Shot +Consistency: We ensure temporal and identity consistency by leveraging +identity-preserving (IP) embeddings across shots, which are automatically +created from the narrative. Additionally, we incorporate a cross-shot smoothing +mechanism, which integrates a reset boundary that effectively combines latent +features from adjacent shots, resulting in smooth transitions and maintaining +visual coherence throughout the video. Our experiments demonstrate that VGoT +surpasses existing video generation methods in producing high-quality, +coherent, multi-shot videos. + +
+
+ comment: Webpage: https://cheliosoops.github.io/VGoT +
+
+
+
+
+ + ☆ ProbPose: A Probabilistic Approach to 2D Human Pose Estimation + + +
+ Current Human Pose Estimation methods have achieved significant improvements. +However, state-of-the-art models ignore out-of-image keypoints and use +uncalibrated heatmaps as keypoint location representations. To address these +limitations, we propose ProbPose, which predicts for each keypoint: a +calibrated probability of keypoint presence at each location in the activation +window, the probability of being outside of it, and its predicted visibility. +To address the lack of evaluation protocols for out-of-image keypoints, we +introduce the CropCOCO dataset and the Extended OKS (Ex-OKS) metric, which +extends OKS to out-of-image points. Tested on COCO, CropCOCO, and OCHuman, +ProbPose shows significant gains in out-of-image keypoint localization while +also improving in-image localization through data augmentation. Additionally, +the model improves robustness along the edges of the bounding box and offers +better flexibility in keypoint evaluation. The code and models are available on +https://mirapurkrabek.github.io/ProbPose/ for research purposes. + +
+
+ comment: Code: https://mirapurkrabek.github.io/ProbPose/ +
+
+
+
+
+ + ☆ Vision Transformers for Weakly-Supervised Microorganism Enumeration + + +
+ Microorganism enumeration is an essential task in many applications, such as +assessing contamination levels or ensuring health standards when evaluating +surface cleanliness. However, it's traditionally performed by human-supervised +methods that often require manual counting, making it tedious and +time-consuming. Previous research suggests automating this task using computer +vision and machine learning methods, primarily through instance segmentation or +density estimation techniques. This study conducts a comparative analysis of +vision transformers (ViTs) for weakly-supervised counting in microorganism +enumeration, contrasting them with traditional architectures such as ResNet and +investigating ViT-based models such as TransCrowd. We trained different +versions of ViTs as the architectural backbone for feature extraction using +four microbiology datasets to determine potential new approaches for total +microorganism enumeration in images. Results indicate that while ResNets +perform better overall, ViTs performance demonstrates competent results across +all datasets, opening up promising lines of research in microorganism +enumeration. This comparative study contributes to the field of microbial image +analysis by presenting innovative approaches to the recurring challenge of +microorganism enumeration and by highlighting the capabilities of ViTs in the +task of regression counting. + +
+
+ comment: 8 pages, 3 figures, 3 tables, conference +
+
+
+
+
+ + ☆ Multi-robot autonomous 3D reconstruction using Gaussian splatting with + Semantic guidance + + +
+ Implicit neural representations and 3D Gaussian splatting (3DGS) have shown +great potential for scene reconstruction. Recent studies have expanded their +applications in autonomous reconstruction through task assignment methods. +However, these methods are mainly limited to single robot, and rapid +reconstruction of large-scale scenes remains challenging. Additionally, +task-driven planning based on surface uncertainty is prone to being trapped in +local optima. To this end, we propose the first 3DGS-based centralized +multi-robot autonomous 3D reconstruction framework. To further reduce time cost +of task generation and improve reconstruction quality, we integrate online +open-vocabulary semantic segmentation with surface uncertainty of 3DGS, +focusing view sampling on regions with high instance uncertainty. Finally, we +develop a multi-robot collaboration strategy with mode and task assignments +improving reconstruction quality while ensuring planning efficiency. Our method +demonstrates the highest reconstruction quality among all planning methods and +superior planning efficiency compared to existing multi-robot methods. We +deploy our method on multiple robots, and results show that it can effectively +plan view paths and reconstruct scenes with high quality. + +
+
+
+
+
+ + ☆ SparseLGS: Sparse View Language Embedded Gaussian Splatting + + +
+ Recently, several studies have combined Gaussian Splatting to obtain scene +representations with language embeddings for open-vocabulary 3D scene +understanding. While these methods perform well, they essentially require very +dense multi-view inputs, limiting their applicability in real-world scenarios. +In this work, we propose SparseLGS to address the challenge of 3D scene +understanding with pose-free and sparse view input images. Our method leverages +a learning-based dense stereo model to handle pose-free and sparse inputs, and +a three-step region matching approach to address the multi-view semantic +inconsistency problem, which is especially important for sparse inputs. +Different from directly learning high-dimensional CLIP features, we extract +low-dimensional information and build bijections to avoid excessive learning +and storage costs. We introduce a reconstruction loss during semantic training +to improve Gaussian positions and shapes. To the best of our knowledge, we are +the first to address the 3D semantic field problem with sparse pose-free +inputs. Experimental results show that SparseLGS achieves comparable quality +when reconstructing semantic fields with fewer inputs (3-4 views) compared to +previous SOTA methods with dense input. Besides, when using the same sparse +input, SparseLGS leads significantly in quality and heavily improves the +computation speed (5$\times$ speedup). Project page: {\tt\small +\url{https://ustc3dv.github.io/SparseLGS}} + +
+
+ comment: Project Page: https://ustc3dv.github.io/SparseLGS +
+
+
+
+
+ + ☆ U-Net in Medical Image Segmentation: A Review of Its Applications Across + Modalities + + +
+ Medical imaging is essential in healthcare to provide key insights into +patient anatomy and pathology, aiding in diagnosis and treatment. Non-invasive +techniques such as X-ray, Magnetic Resonance Imaging (MRI), Computed Tomography +(CT), and Ultrasound (US), capture detailed images of organs, tissues, and +abnormalities. Effective analysis of these images requires precise segmentation +to delineate regions of interest (ROI), such as organs or lesions. Traditional +segmentation methods, relying on manual feature-extraction, are labor-intensive +and vary across experts. Recent advancements in Artificial Intelligence (AI) +and Deep Learning (DL), particularly convolutional models such as U-Net and its +variants (U-Net++ and U-Net 3+), have transformed medical image segmentation +(MIS) by automating the process and enhancing accuracy. These models enable +efficient, precise pixel-wise classification across various imaging modalities, +overcoming the limitations of manual segmentation. This review explores various +medical imaging techniques, examines the U-Net architectures and their +adaptations, and discusses their application across different modalities. It +also identifies common challenges in MIS and proposes potential solutions. + +
+
+
+
+
+ + ☆ Fast LiDAR Data Generation with Rectified Flows + + +
+ Building LiDAR generative models holds promise as powerful data priors for
+restoration, scene manipulation, and scalable simulation in autonomous mobile
+robots. In recent years, approaches using diffusion models have emerged,
+significantly improving training stability and generation quality. Despite the
+success of diffusion models, generating high-quality samples requires numerous
+iterations of running neural networks, and the increasing computational cost
+can pose a barrier to robotics applications. To address this challenge, this
+paper presents R2Flow, a fast and high-fidelity generative model for LiDAR
+data. Our method is based on rectified flows that learn straight trajectories,
+simulating data generation with much fewer sampling steps against diffusion
+models. We also propose an efficient Transformer-based model architecture for
+processing the image representation of LiDAR range and reflectance
+measurements. Our experiments on the unconditional generation of the KITTI-360
+dataset demonstrate the effectiveness of our approach in terms of both
+efficiency and quality.
+
+
+
+
+
+ + ☆ Cross-Attention Head Position Patterns Can Align with Human Visual + Concepts in Text-to-Image Generative Models + + +
+ Recent text-to-image diffusion models leverage cross-attention layers, which +have been effectively utilized to enhance a range of visual generative tasks. +However, our understanding of cross-attention layers remains somewhat limited. +In this study, we present a method for constructing Head Relevance Vectors +(HRVs) that align with useful visual concepts. An HRV for a given visual +concept is a vector with a length equal to the total number of cross-attention +heads, where each element represents the importance of the corresponding head +for the given visual concept. We develop and employ an ordered weakening +analysis to demonstrate the effectiveness of HRVs as interpretable features. To +demonstrate the utility of HRVs, we propose concept strengthening and concept +adjusting methods and apply them to enhance three visual generative tasks. We +show that misinterpretations of polysemous words in image generation can be +corrected in most cases, five challenging attributes in image editing can be +successfully modified, and catastrophic neglect in multi-concept generation can +be mitigated. Overall, our work provides an advancement in understanding +cross-attention layers and introduces new approaches for fine-controlling these +layers at the head level. + +
+
+
+
+
+ + ☆ CubeFormer: A Simple yet Effective Baseline for Lightweight Image + Super-Resolution + + +
+ Lightweight image super-resolution (SR) methods aim at increasing the
+resolution and restoring the details of an image using a lightweight neural
+network. However, current lightweight SR methods still suffer from inferior
+performance and unpleasant details. Our analysis reveals that these methods are
+hindered by constrained feature diversity, which adversely impacts feature
+representation and detail recovery. To respond to this issue, we propose a simple
+yet effective baseline called CubeFormer, designed to enhance feature richness
+by completing holistic information aggregation. To be specific, we introduce
+cube attention, which expands 2D attention to 3D space, facilitating exhaustive
+information interactions, further encouraging comprehensive information
+extraction and promoting feature variety. In addition, we inject block and grid
+sampling strategies to construct intra-cube transformer blocks (Intra-CTB) and
+inter-cube transformer blocks (Inter-CTB), which perform local and global
+modeling, respectively. Extensive experiments show that our CubeFormer achieves
+state-of-the-art performance on commonly used SR benchmarks. Our source code
+and models will be publicly available.
+
+
+
+
+
+ + ☆ How to Use Diffusion Priors under Sparse Views? + + +
+ Novel view synthesis under sparse views has been a long-term important +challenge in 3D reconstruction. Existing works mainly rely on introducing +external semantic or depth priors to supervise the optimization of 3D +representations. However, the diffusion model, as an external prior that can +directly provide visual supervision, has always underperformed in sparse-view +3D reconstruction using Score Distillation Sampling (SDS) due to the low +information entropy of sparse views compared to text, leading to optimization +challenges caused by mode deviation. To this end, we present a thorough +analysis of SDS from the mode-seeking perspective and propose Inline Prior +Guided Score Matching (IPSM), which leverages visual inline priors provided by +pose relationships between viewpoints to rectify the rendered image +distribution and decomposes the original optimization objective of SDS, thereby +offering effective diffusion visual guidance without any fine-tuning or +pre-training. Furthermore, we propose the IPSM-Gaussian pipeline, which adopts +3D Gaussian Splatting as the backbone and supplements depth and geometry +consistency regularization based on IPSM to further improve inline priors and +rectified distribution. Experimental results on different public datasets show +that our method achieves state-of-the-art reconstruction quality. The code is +released at https://github.com/iCVTEAM/IPSM. + +
+
+
+
+
+ + ☆ Unlocking Tuning-Free Few-Shot Adaptability in Visual Foundation Models + by Recycling Pre-Tuned LoRAs + + +
+ Large Language Models (LLMs) such as ChatGPT demonstrate strong few-shot +adaptability without requiring fine-tuning, positioning them ideal for +data-limited and real-time applications. However, this adaptability has not yet +been replicated in current Visual Foundation Models (VFMs), which require +explicit fine-tuning with sufficient tuning data. Besides, the +pretraining-finetuning paradigm has led to the surge of numerous task-specific +modular components, such as Low-Rank Adaptation (LoRA). For the first time, we +explore the potential of reusing diverse pre-tuned LoRAs without accessing +their original training data, to achieve tuning-free few-shot adaptation in +VFMs. Our framework, LoRA Recycle, distills a meta-LoRA from diverse pre-tuned +LoRAs with a meta-learning objective, using surrogate data generated inversely +from pre-tuned LoRAs themselves. The VFM, once equipped with the meta-LoRA, is +empowered to solve new few-shot tasks in a single forward pass, akin to the +in-context learning of LLMs. Additionally, we incorporate a double-efficient +mechanism tailored to our framework, significantly accelerating the +meta-training process while maintaining or even improving performance. +Extensive experiments across various few-shot classification benchmarks across +both in- and cross-domain scenarios demonstrate the superiority of our +framework. + +
+
+
+
+
+ + ☆ GIST: Towards Photorealistic Style Transfer via Multiscale Geometric + Representations + + +
+ State-of-the-art Style Transfer methods often leverage pre-trained encoders +optimized for discriminative tasks, which may not be ideal for image synthesis. +This can result in significant artifacts and loss of photorealism. Motivated by +the ability of multiscale geometric image representations to capture +fine-grained details and global structure, we propose GIST: Geometric-based +Image Style Transfer, a novel Style Transfer technique that exploits the +geometric properties of content and style images. GIST replaces the standard +Neural Style Transfer autoencoding framework with a multiscale image expansion, +preserving scene details without the need for post-processing or training. Our +method matches multiresolution and multidirectional representations such as +Wavelets and Contourlets by solving an optimal transport problem, leading to an +efficient texture transferring. Experiments show that GIST is on-par or +outperforms recent photorealistic Style Transfer approaches while significantly +reducing the processing time with no model training. + +
+
+
+
+
+ + ☆ CC-OCR: A Comprehensive and Challenging OCR Benchmark for Evaluating + Large Multimodal Models in Literacy + + +
+ Large Multimodal Models (LMMs) have demonstrated impressive performance on
+recognizing document images with natural language instructions. However, it
+remains unclear to what extent LMMs possess capabilities in literacy with rich
+structure and fine-grained visual challenges. The current landscape lacks a
+comprehensive benchmark to effectively measure the literate capabilities of
+LMMs. Existing benchmarks are often limited by narrow scenarios and specified
+tasks. To this end, we introduce CC-OCR, a comprehensive benchmark that
+possesses a diverse range of scenarios, tasks, and challenges. CC-OCR comprises
+four OCR-centric tracks: multi-scene text reading, multilingual text reading,
+document parsing, and key information extraction. It includes 39 subsets with
+7,058 fully annotated images, of which 41% are sourced from real applications,
+being released for the first time. Furthermore, we evaluate nine prominent LMMs
+and reveal both the strengths and weaknesses of these models, particularly in
+text grounding, multi-orientation, and hallucination of repetition. CC-OCR aims
+to comprehensively evaluate the capabilities of LMMs on OCR-centered tasks,
+driving advancement in LMMs.
+
+
+
+ comment: 11 pages, 4 figures; The code and data will be publicly available as + soon as possible +
+
+
+
+
+ + ☆ 3D representation in 512-Byte:Variational tokenizer is the key for + autoregressive 3D generation + + +
+ Autoregressive transformers have revolutionized high-fidelity image +generation. One crucial ingredient lies in the tokenizer, which compresses +high-resolution image patches into manageable discrete tokens with a scanning +or hierarchical order suitable for large language models. Extending these +tokenizers to 3D generation, however, presents a significant challenge: unlike +image patches that naturally exhibit spatial sequence and multi-scale +relationships, 3D data lacks an inherent order, making it difficult to compress +into fewer tokens while preserving structural details. To address this, we +introduce the Variational Tokenizer (VAT), which transforms unordered 3D data +into compact latent tokens with an implicit hierarchy, suited for efficient and +high-fidelity coarse-to-fine autoregressive modeling. VAT begins with an +in-context transformer, which compress numerous unordered 3D features into a +reduced token set with minimal information loss. This latent space is then +mapped to a Gaussian distribution for residual quantization, with token counts +progressively increasing across scales. In this way, tokens at different scales +naturally establish the interconnections by allocating themselves into +different subspaces within the same Gaussian distribution, facilitating +discrete modeling of token relationships across scales. During the decoding +phase, a high-resolution triplane is utilized to convert these compact latent +tokens into detailed 3D shapes. Extensive experiments demonstrate that VAT +enables scalable and efficient 3D generation, outperforming existing methods in +quality, efficiency, and generalization. Remarkably, VAT achieves up to a 250x +compression, reducing a 1MB mesh to just 3.9KB with a 96% F-score, and can +further compress to 256 int8 tokens, achieving a 2000x reduction while +maintaining a 92% F-score. + +
+
+ comment: 22 pages, 21 figures +
+
+
+
+
+ + ☆ Transformer-Metric Loss for CNN-Based Face Recognition + + +
+ In deep learning, the loss function plays a crucial role in optimizing the +network. Many recent innovations in loss techniques have been made, and various +margin-based angular loss functions (metric loss) have been designed +particularly for face recognition. The concept of transformers is already +well-researched and applied in many facets of machine vision. This paper +presents a technique for loss evaluation that uses a transformer network as an +additive loss in the face recognition domain. The standard metric loss function +typically takes the final embedding of the main CNN backbone as its input. +Here, we employ a transformer-metric loss, a combined approach that integrates +both transformer-loss and metric-loss. This research intends to analyze the +transformer behavior on the convolution output when the CNN outcome is arranged +in a sequential vector. The transformer encoder takes input from the contextual +vectors obtained from the final convolution layer of the network. With this +technique, we use transformer loss with various base metric-loss functions to +evaluate the effect of the combined loss functions. We observe that such a +configuration allows the network to achieve SoTA results on various validation +datasets with some limitations. This research expands the role of transformers +in the machine vision domain and opens new possibilities for exploring +transformers as a loss function. + +
+
+ comment: Face Recognition using Transformer Loss +
+
+
+
+
+ + ☆ Cascaded Multi-Scale Attention for Enhanced Multi-Scale Feature + Extraction and Interaction with Low-Resolution Images + + +
+ In real-world applications of image recognition tasks, such as human pose +estimation, cameras often capture objects, like human bodies, at low +resolutions. This scenario poses a challenge in extracting and leveraging +multi-scale features, which is often essential for precise inference. To +address this challenge, we propose a new attention mechanism, named cascaded +multi-scale attention (CMSA), tailored for use in CNN-ViT hybrid architectures, +to handle low-resolution inputs effectively. The design of CMSA enables the +extraction and seamless integration of features across various scales without +necessitating the downsampling of the input image or feature maps. This is +achieved through a novel combination of grouped multi-head self-attention +mechanisms with window-based local attention and cascaded fusion of multi-scale +features over different scales. This architecture allows for the effective +handling of features across different scales, enhancing the model's ability to +perform tasks such as human pose estimation, head pose estimation, and more +with low-resolution images. Our experimental results show that the proposed +method outperforms existing state-of-the-art methods in these areas with fewer +parameters, showcasing its potential for broad application in real-world +scenarios where capturing high-resolution images is not feasible. Code is +available at https://github.com/xyongLu/CMSA. + +
+
+ comment: 9 pages, 4 figures, 5 tables. The paper is under consideration at + Computer Vision and Image Understanding +
+
+
+
+
+ + ☆ LayoutVLM: Differentiable Optimization of 3D Layout via Vision-Language + Models + + +
+ Open-universe 3D layout generation arranges unlabeled 3D assets conditioned +on language instruction. Large language models (LLMs) struggle with generating +physically plausible 3D scenes and adherence to input instructions, +particularly in cluttered scenes. We introduce LayoutVLM, a framework and scene +layout representation that exploits the semantic knowledge of Vision-Language +Models (VLMs) and supports differentiable optimization to ensure physical +plausibility. LayoutVLM employs VLMs to generate two mutually reinforcing +representations from visually marked images, and a self-consistent decoding +process to improve VLMs spatial planning. Our experiments show that LayoutVLM +addresses the limitations of existing LLM and constraint-based approaches, +producing physically plausible 3D layouts better aligned with the semantic +intent of input language instructions. We also demonstrate that fine-tuning +VLMs with the proposed scene layout representation extracted from existing +scene datasets can improve performance. + +
+
+ comment: project website: https://ai.stanford.edu/~sunfanyun/layoutvlm/ +
+
+
+
+
+ + ☆ VideoICL: Confidence-based Iterative In-context Learning for + Out-of-Distribution Video Understanding + + +
+ Recent advancements in video large multimodal models (LMMs) have +significantly improved their video understanding and reasoning capabilities. +However, their performance drops on out-of-distribution (OOD) tasks that are +underrepresented in training data. Traditional methods like fine-tuning on OOD +datasets are impractical due to high computational costs. While In-context +learning (ICL) with demonstration examples has shown promising generalization +performance in language tasks and image-language tasks without fine-tuning, +applying ICL to video-language tasks faces challenges due to the limited +context length in Video LMMs, as videos require longer token lengths. To +address these issues, we propose VideoICL, a novel video in-context learning +framework for OOD tasks that introduces a similarity-based relevant example +selection strategy and a confidence-based iterative inference approach. This +allows to select the most relevant examples and rank them based on similarity, +to be used for inference. If the generated response has low confidence, our +framework selects new examples and performs inference again, iteratively +refining the results until a high-confidence response is obtained. This +approach improves OOD video understanding performance by extending effective +context length without incurring high costs. The experimental results on +multiple benchmarks demonstrate significant performance gains, especially in +domain-specific scenarios, laying the groundwork for broader video +comprehension applications. Code will be released at +https://github.com/KangsanKim07/VideoICL + +
+
+
+
+
+ + ☆ Anatomically-Grounded Fact Checking of Automated Chest X-ray Reports + + +
+ With the emergence of large-scale vision-language models, realistic radiology
+reports may be generated using only medical images as input guided by simple
+prompts. However, their practical utility has been limited due to the factual
+errors in their description of findings. In this paper, we propose a novel
+model for explainable fact-checking that identifies errors in findings and
+their locations indicated through the reports. Specifically, we analyze the
+types of errors made by automated reporting methods and derive a new synthetic
+dataset of images paired with real and fake descriptions of findings and their
+locations from a ground truth dataset. A new multi-label cross-modal
+contrastive regression network is then trained on this dataset. We evaluate
+the resulting fact-checking model and its utility in correcting reports
+generated by several SOTA automated reporting tools on a variety of benchmark
+datasets with results pointing to over 40\% improvement in report quality
+through such error detection and correction.
+
+
+
+
+
+
+ + ☆ VISCO: Benchmarking Fine-Grained Critique and Correction Towards + Self-Improvement in Visual Reasoning + + +
+ The ability of large vision-language models (LVLMs) to critique and correct +their reasoning is an essential building block towards their self-improvement. +However, a systematic analysis of such capabilities in LVLMs is still lacking. +We propose VISCO, the first benchmark to extensively analyze the fine-grained +critique and correction capabilities of LVLMs. Compared to existing work that +uses a single scalar value to critique the entire reasoning [4], VISCO features +dense and fine-grained critique, requiring LVLMs to evaluate the correctness of +each step in the chain-of-thought and provide natural language explanations to +support their judgments. Extensive evaluation of 24 LVLMs demonstrates that +human-written critiques significantly enhance the performance after correction, +showcasing the potential of the self-improvement strategy. However, the +model-generated critiques are less helpful and sometimes detrimental to the +performance, suggesting that critique is the crucial bottleneck. We identified +three common patterns in critique failures: failure to critique visual +perception, reluctance to "say no", and exaggerated assumption of error +propagation. To address these issues, we propose an effective LookBack strategy +that revisits the image to verify each piece of information in the initial +reasoning. LookBack significantly improves critique and correction performance +by up to 13.5%. + +
+
+ comment: Project: https://visco-benchmark.github.io/ +
+
+
+
+
+ + ☆ Underload: Defending against Latency Attacks for Object Detectors on + Edge Devices + + +
+ Object detection is a fundamental enabler for many real-time downstream +applications such as autonomous driving, augmented reality and supply chain +management. However, the algorithmic backbone of neural networks is brittle to +imperceptible perturbations in the system inputs, which were generally known as +misclassifying attacks. By targeting the real-time processing capability, a new +class of latency attacks are reported recently. They exploit new attack +surfaces in object detectors by creating a computational bottleneck in the +post-processing module, that leads to cascading failure and puts the real-time +downstream tasks at risks. In this work, we take an initial attempt to defend +against this attack via background-attentive adversarial training that is also +cognizant of the underlying hardware capabilities. We first draw system-level +connections between latency attack and hardware capacity across heterogeneous +GPU devices. Based on the particular adversarial behaviors, we utilize +objectness loss as a proxy and build background attention into the adversarial +training pipeline, and achieve a reasonable balance between clean and robust +accuracy. The extensive experiments demonstrate the defense effectiveness of +restoring real-time processing capability from $13$ FPS to $43$ FPS on Jetson +Orin NX, with a better trade-off between the clean and robust accuracy. + +
+
+
+
+
+ + ☆ Generative Photography: Scene-Consistent Camera Control for Realistic + Text-to-Image Synthesis + + +
+ Image generation today can produce somewhat realistic images from text +prompts. However, if one asks the generator to synthesize a particular camera +setting such as creating different fields of view using a 24mm lens versus a +70mm lens, the generator will not be able to interpret and generate +scene-consistent images. This limitation not only hinders the adoption of +generative tools in photography applications but also exemplifies a broader +issue of bridging the gap between the data-driven models and the physical +world. In this paper, we introduce the concept of Generative Photography, a +framework designed to control camera intrinsic settings during content +generation. The core innovation of this work are the concepts of Dimensionality +Lifting and Contrastive Camera Learning, which achieve continuous and +consistent transitions for different camera settings. Experimental results show +that our method produces significantly more scene-consistent photorealistic +images than state-of-the-art models such as Stable Diffusion 3 and FLUX. + +
+
+ comment: Project page: https://generative-photography.github.io/project/ +
+
+
+
+
+ + ☆ Agri-LLaVA: Knowledge-Infused Large Multimodal Assistant on Agricultural + Pests and Diseases + + +
+ In the general domain, large multimodal models (LMMs) have achieved +significant advancements, yet challenges persist in applying them to specific +fields, especially agriculture. As the backbone of the global economy, +agriculture confronts numerous challenges, with pests and diseases being +particularly concerning due to their complexity, variability, rapid spread, and +high resistance. This paper specifically addresses these issues. We construct +the first multimodal instruction-following dataset in the agricultural domain, +covering over 221 types of pests and diseases with approximately 400,000 data +entries. This dataset aims to explore and address the unique challenges in pest +and disease control. Based on this dataset, we propose a knowledge-infused +training method to develop Agri-LLaVA, an agricultural multimodal conversation +system. To accelerate progress in this field and inspire more researchers to +engage, we design a diverse and challenging evaluation benchmark for +agricultural pests and diseases. Experimental results demonstrate that +Agri-LLaVA excels in agricultural multimodal conversation and visual +understanding, providing new insights and approaches to address agricultural +pests and diseases. By open-sourcing our dataset and model, we aim to promote +research and development in LMMs within the agricultural domain and make +significant contributions to tackle the challenges of agricultural pests and +diseases. All resources can be found at https://github.com/Kki2Eve/Agri-LLaVA. + +
+
+
+
+
+ + ☆ Personalized Multimodal Large Language Models: A Survey + + +
+ Multimodal Large Language Models (MLLMs) have become increasingly important +due to their state-of-the-art performance and ability to integrate multiple +data modalities, such as text, images, and audio, to perform complex tasks with +high accuracy. This paper presents a comprehensive survey on personalized +multimodal large language models, focusing on their architecture, training +methods, and applications. We propose an intuitive taxonomy for categorizing +the techniques used to personalize MLLMs to individual users, and discuss the +techniques accordingly. Furthermore, we discuss how such techniques can be +combined or adapted when appropriate, highlighting their advantages and +underlying rationale. We also provide a succinct summary of personalization +tasks investigated in existing research, along with the evaluation metrics +commonly used. Additionally, we summarize the datasets that are useful for +benchmarking personalized MLLMs. Finally, we outline critical open challenges. +This survey aims to serve as a valuable resource for researchers and +practitioners seeking to understand and advance the development of personalized +multimodal large language models. + +
+
+
+
+
+ + ☆ WSI-LLaVA: A Multimodal Large Language Model for Whole Slide Image + + +
+ Recent advancements in computational pathology have produced patch-level +Multi-modal Large Language Models (MLLMs), but these models are limited by +their inability to analyze whole slide images (WSIs) comprehensively and their +tendency to bypass crucial morphological features that pathologists rely on for +diagnosis. To address these challenges, we first introduce WSI-Bench, a +large-scale morphology-aware benchmark containing 180k VQA pairs from 9,850 +WSIs across 30 cancer types, designed to evaluate MLLMs' understanding of +morphological characteristics crucial for accurate diagnosis. Building upon +this benchmark, we present WSI-LLaVA, a novel framework for gigapixel WSI +understanding that employs a three-stage training approach: WSI-text alignment, +feature space alignment, and task-specific instruction tuning. To better assess +model performance in pathological contexts, we develop two specialized WSI +metrics: WSI-Precision and WSI-Relevance. Experimental results demonstrate that +WSI-LLaVA outperforms existing models across all capability dimensions, with a +significant improvement in morphological analysis, establishing a clear +correlation between morphological understanding and diagnostic accuracy. + +
+
+ comment: 38 pages, 22 figures, 35 tables +
+
+
+
+
+ + ☆ SparseGrasp: Robotic Grasping via 3D Semantic Gaussian Splatting from + Sparse Multi-View RGB Images + + +
+ Language-guided robotic grasping is a rapidly advancing field where robots
+are instructed using human language to grasp specific objects. However,
+existing methods often depend on dense camera views and struggle to quickly
+update scenes, limiting their effectiveness in changeable environments.
+ In contrast, we propose SparseGrasp, a novel open-vocabulary robotic grasping
+system that operates efficiently with sparse-view RGB images and handles scene
+updates quickly. Our system builds upon and significantly enhances existing
+computer vision modules in robotic learning. Specifically, SparseGrasp utilizes
+DUSt3R to generate a dense point cloud as the initialization for 3D Gaussian
+Splatting (3DGS), maintaining high fidelity even under sparse supervision.
+Importantly, SparseGrasp incorporates semantic awareness from recent vision
+foundation models. To further improve processing efficiency, we repurpose
+Principal Component Analysis (PCA) to compress features from 2D models.
+Additionally, we introduce a novel render-and-compare strategy that ensures
+rapid scene updates, enabling multi-turn grasping in changeable environments.
+ Experimental results show that SparseGrasp significantly outperforms
+state-of-the-art methods in terms of both speed and adaptability, providing a
+robust solution for multi-turn grasping in changeable environments.
+
+
+
+
+
+
+ + ☆ GSOT3D: Towards Generic 3D Single Object Tracking in the Wild + + +
+ In this paper, we present a novel benchmark, GSOT3D, that aims at
+facilitating development of generic 3D single object tracking (SOT) in the
+wild. Specifically, GSOT3D offers 620 sequences with 123K frames, and covers a
+wide selection of 54 object categories. Each sequence is offered with multiple
+modalities, including the point cloud (PC), RGB image, and depth. This allows
+GSOT3D to support various 3D tracking tasks, such as single-modal 3D SOT on PC
+and multi-modal 3D SOT on RGB-PC or RGB-D, and thus greatly broadens research
+directions for 3D object tracking. To provide high-quality per-frame 3D
+annotations, all sequences are labeled manually with multiple rounds of
+meticulous inspection and refinement. To our best knowledge, GSOT3D is the
+largest benchmark dedicated to various generic 3D object tracking tasks. To
+understand how existing 3D trackers perform and to provide comparisons for
+future research on GSOT3D, we assess eight representative point cloud-based
+tracking models. Our evaluation results exhibit that these models heavily
+degrade on GSOT3D, and more efforts are required for robust and generic 3D
+object tracking. Besides, to encourage future research, we present a simple yet
+effective generic 3D tracker, named PROT3D, that localizes the target object
+via a progressive spatial-temporal network and outperforms all current
+solutions by a large margin. By releasing GSOT3D, we expect to advance further
+3D tracking in future research and applications. Our benchmark and model as
+well as the evaluation results will be publicly released at our webpage
+https://github.com/ailovejinx/GSOT3D.
+
+
+
+ comment: 14 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ Towards Neuro-Symbolic Video Understanding ECCV + + +
+ The unprecedented surge in video data production in recent years necessitates +efficient tools to extract meaningful frames from videos for downstream tasks. +Long-term temporal reasoning is a key desideratum for frame retrieval systems. +While state-of-the-art foundation models, like VideoLLaMA and ViCLIP, are +proficient in short-term semantic understanding, they surprisingly fail at +long-term reasoning across frames. A key reason for this failure is that they +intertwine per-frame perception and temporal reasoning into a single deep +network. Hence, decoupling but co-designing semantic understanding and temporal +reasoning is essential for efficient scene identification. We propose a system +that leverages vision-language models for semantic understanding of individual +frames but effectively reasons about the long-term evolution of events using +state machines and temporal logic (TL) formulae that inherently capture memory. +Our TL-based reasoning improves the F1 score of complex event identification by +9-15% compared to benchmarks that use GPT4 for reasoning on state-of-the-art +self-driving datasets such as Waymo and NuScenes. + +
+
+ comment: Accepted by The European Conference on Computer Vision (ECCV) 2024 +
+
+
+
+
+ + ♻ ☆ Neuro-Symbolic Evaluation of Text-to-Video Models using Formal + Verification + + +
+ Recent advancements in text-to-video models such as Sora, Gen-3, MovieGen, +and CogVideoX are pushing the boundaries of synthetic video generation, with +adoption seen in fields like robotics, autonomous driving, and entertainment. +As these models become prevalent, various metrics and benchmarks have emerged +to evaluate the quality of the generated videos. However, these metrics +emphasize visual quality and smoothness, neglecting temporal fidelity and +text-to-video alignment, which are crucial for safety-critical applications. To +address this gap, we introduce NeuS-V, a novel synthetic video evaluation +metric that rigorously assesses text-to-video alignment using neuro-symbolic +formal verification techniques. Our approach first converts the prompt into a +formally defined Temporal Logic (TL) specification and translates the generated +video into an automaton representation. Then, it evaluates the text-to-video +alignment by formally checking the video automaton against the TL +specification. Furthermore, we present a dataset of temporally extended prompts +to evaluate state-of-the-art video generation models against our benchmark. We +find that NeuS-V demonstrates a higher correlation by over 5x with human +evaluations when compared to existing metrics. Our evaluation further reveals +that current video generation models perform poorly on these temporally complex +prompts, highlighting the need for future work in improving text-to-video +generation capabilities. + +
+
+
+
+
+ + ♻ ☆ Diffusion Models with Anisotropic Gaussian Splatting for Image + Inpainting + + +
+ Image inpainting is a fundamental task in computer vision, aiming to restore +missing or corrupted regions in images realistically. While recent deep +learning approaches have significantly advanced the state-of-the-art, +challenges remain in maintaining structural continuity and generating coherent +textures, particularly in large missing areas. Diffusion models have shown +promise in generating high-fidelity images but often lack the structural +guidance necessary for realistic inpainting. We propose a novel inpainting +method that combines diffusion models with anisotropic Gaussian splatting to +capture both local structures and global context effectively. By modeling +missing regions using anisotropic Gaussian functions that adapt to local image +gradients, our approach provides structural guidance to the diffusion-based +inpainting network. The Gaussian splat maps are integrated into the diffusion +process, enhancing the model's ability to generate high-fidelity and +structurally coherent inpainting results. Extensive experiments demonstrate +that our method outperforms state-of-the-art techniques, producing visually +plausible results with enhanced structural integrity and texture realism. + +
+
+
+
+
+ + ♻ ☆ Switti: Designing Scale-Wise Transformers for Text-to-Image Synthesis + + +
+ This work presents Switti, a scale-wise transformer for text-to-image +generation. Starting from existing next-scale prediction AR models, we first +explore them for T2I generation and propose architectural modifications to +improve their convergence and overall performance. We then observe that +self-attention maps of our pretrained scale-wise AR model exhibit weak +dependence on preceding scales. Based on this insight, we propose a non-AR +counterpart facilitating ~11% faster sampling and lower memory usage while also +achieving slightly better generation quality. Furthermore, we reveal that +classifier-free guidance at high-resolution scales is often unnecessary and can +even degrade performance. By disabling guidance at these scales, we achieve an +additional sampling acceleration of ~20% and improve the generation of +fine-grained details. Extensive human preference studies and automated +evaluations show that Switti outperforms existing T2I AR models and competes +with state-of-the-art T2I diffusion models while being up to 7 times faster. + +
+
+ comment: 19 pages, 21 figures +
+
+
+
+
+ + ♻ ☆ D-MiSo: Editing Dynamic 3D Scenes using Multi-Gaussians Soup + + +
+ Over the past years, we have observed an abundance of approaches for modeling
+dynamic 3D scenes using Gaussian Splatting (GS). Such solutions use GS to
+represent the scene's structure and the neural network to model dynamics. Such
+approaches allow fast rendering and extracting each element of such a dynamic
+scene. However, modifying such objects over time is challenging. SC-GS (Sparse
+Controlled Gaussian Splatting) enhanced with Deformed Control Points partially
+solves this issue. However, this approach necessitates selecting elements that
+need to be kept fixed, as well as centroids that should be adjusted throughout
+editing. Moreover, this task poses additional difficulties regarding the
+reproducibility of such editing. To address this, we propose Dynamic
+Multi-Gaussian Soup (D-MiSo), which allows us to model the mesh-inspired
+representation of dynamic GS. Additionally, we propose a strategy of linking
+parameterized Gaussian splats, forming a Triangle Soup with the estimated mesh.
+Consequently, we can separately construct new trajectories for the 3D objects
+composing the scene. Thus, we can make the scene's dynamic editable over time
+or while maintaining partial dynamics.
+
+
+
+
+
+
+ + ♻ ☆ STRIDE: Single-video based Temporally Continuous Occlusion Robust 3D + Pose Estimation WACV + + +
+ The capability to accurately estimate 3D human poses is crucial for diverse +fields such as action recognition, gait recognition, and virtual/augmented +reality. However, a persistent and significant challenge within this field is +the accurate prediction of human poses under conditions of severe occlusion. +Traditional image-based estimators struggle with heavy occlusions due to a lack +of temporal context, resulting in inconsistent predictions. While video-based +models benefit from processing temporal data, they encounter limitations when +faced with prolonged occlusions that extend over multiple frames. This +challenge arises because these models struggle to generalize beyond their +training datasets, and the variety of occlusions is hard to capture in the +training data. Addressing these challenges, we propose STRIDE (Single-video +based TempoRally contInuous occlusion Robust 3D Pose Estimation), a novel +Test-Time Training (TTT) approach to fit a human motion prior for each video. +This approach specifically handles occlusions that were not encountered during +the model's training. By employing STRIDE, we can refine a sequence of noisy +initial pose estimates into accurate, temporally coherent poses during test +time, effectively overcoming the limitations of prior methods. Our framework +demonstrates flexibility by being model-agnostic, allowing us to use any +off-the-shelf 3D pose estimation method for improving robustness and temporal +consistency. We validate STRIDE's efficacy through comprehensive experiments on +challenging datasets like Occluded Human3.6M, Human3.6M, and OCMotion, where it +not only outperforms existing single-image and video-based pose estimation +models but also showcases superior handling of substantial occlusions, +achieving fast, robust, accurate, and temporally consistent 3D pose estimates. +Code is made publicly available at https://github.com/take2rohit/stride + +
+
+ comment: Paper accepted at IEEE/CVF Winter Conference on Applications of + Computer Vision (WACV)-2025 +
+
+
+
+
+ + ♻ ☆ Go beyond End-to-End Training: Boosting Greedy Local Learning with + Context Supply + + +
+ Traditional end-to-end (E2E) training of deep networks necessitates storing +intermediate activations for back-propagation, resulting in a large memory +footprint on GPUs and restricted model parallelization. As an alternative, +greedy local learning partitions the network into gradient-isolated modules and +trains supervisely based on local preliminary losses, thereby providing +asynchronous and parallel training methods that substantially reduce memory +cost. However, empirical experiments reveal that as the number of segmentations +of the gradient-isolated module increases, the performance of the local +learning scheme degrades substantially, severely limiting its expansibility. To +avoid this issue, we theoretically analyze the greedy local learning from the +standpoint of information theory and propose a ContSup scheme, which +incorporates context supply between isolated modules to compensate for +information loss. Experiments on benchmark datasets (i.e. CIFAR, SVHN, STL-10) +achieve SOTA results and indicate that our proposed method can significantly +improve the performance of greedy local learning with minimal memory and +computational overhead, allowing for the boost of the number of isolated +modules. Our codes are available at https://github.com/Tab-ct/ContSup. + +
+
+ comment: 9 figures, 12 tables +
+
+
+
+
+ + ♻ ☆ Decoupling Dark Knowledge via Block-wise Logit Distillation for + Feature-level Alignment + + +
+ Knowledge Distillation (KD), a learning manner with a larger teacher network +guiding a smaller student network, transfers dark knowledge from the teacher to +the student via logits or intermediate features, with the aim of producing a +well-performed lightweight model. Notably, many subsequent feature-based KD +methods outperformed the earliest logit-based KD method and iteratively +generated numerous state-of-the-art distillation methods. Nevertheless, recent +work has uncovered the potential of the logit-based method, bringing the simple +KD form based on logits back into the limelight. Features or logits? They +partially implement the KD with entirely distinct perspectives; therefore, +choosing between logits and features is not straightforward. This paper +provides a unified perspective of feature alignment in order to obtain a better +comprehension of their fundamental distinction. Inheriting the design +philosophy and insights of feature-based and logit-based methods, we introduce +a block-wise logit distillation framework to apply implicit logit-based feature +alignment by gradually replacing teacher's blocks as intermediate +stepping-stone models to bridge the gap between the student and the teacher. +Our method obtains comparable or superior results to state-of-the-art +distillation methods. This paper demonstrates the great potential of combining +logit and features, and we hope it will inspire future research to revisit KD +from a higher vantage point. + +
+
+
+
+
+ + ♻ ☆ Denoising: A Powerful Building-Block for Imaging, Inverse Problems, and + Machine Learning + + +
+ Denoising, the process of reducing random fluctuations in a signal to +emphasize essential patterns, has been a fundamental problem of interest since +the dawn of modern scientific inquiry. Recent denoising techniques, +particularly in imaging, have achieved remarkable success, nearing theoretical +limits by some measures. Yet, despite tens of thousands of research papers, the +wide-ranging applications of denoising beyond noise removal have not been fully +recognized. This is partly due to the vast and diverse literature, making a +clear overview challenging. + This paper aims to address this gap. We present a clarifying perspective on +denoisers, their structure, and desired properties. We emphasize the increasing +importance of denoising and showcase its evolution into an essential building +block for complex tasks in imaging, inverse problems, and machine learning. +Despite its long history, the community continues to uncover unexpected and +groundbreaking uses for denoising, further solidifying its place as a +cornerstone of scientific and engineering practice. + +
+
+
+
+
+ + ♻ ☆ LumiNet: Latent Intrinsics Meets Diffusion Models for Indoor Scene + Relighting + + +
+ We introduce LumiNet, a novel architecture that leverages generative models +and latent intrinsic representations for effective lighting transfer. Given a +source image and a target lighting image, LumiNet synthesizes a relit version +of the source scene that captures the target's lighting. Our approach makes two +key contributions: a data curation strategy from the StyleGAN-based relighting +model for our training, and a modified diffusion-based ControlNet that +processes both latent intrinsic properties from the source image and latent +extrinsic properties from the target image. We further improve lighting +transfer through a learned adaptor (MLP) that injects the target's latent +extrinsic properties via cross-attention and fine-tuning. + Unlike traditional ControlNet, which generates images with conditional maps +from a single scene, LumiNet processes latent representations from two +different images - preserving geometry and albedo from the source while +transferring lighting characteristics from the target. Experiments demonstrate +that our method successfully transfers complex lighting phenomena including +specular highlights and indirect illumination across scenes with varying +spatial layouts and materials, outperforming existing approaches on challenging +indoor scenes using only images as input. + +
+
+ comment: Project page: https://luminet-relight.github.io +
+
+
+
+
+ + ♻ ☆ dc-GAN: Dual-Conditioned GAN for Face Demorphing From a Single Morph + + +
+ A facial morph is an image created by combining two face images pertaining to +two distinct identities. Face demorphing inverts the process and tries to +recover the original images constituting a facial morph. While morph attack +detection (MAD) techniques can be used to flag morph images, they do not +divulge any visual information about the faces used to create them. Demorphing +helps address this problem. Existing demorphing techniques are either very +restrictive (assume identities during testing) or produce feeble outputs (both +outputs look very similar). In this paper, we overcome these issues by +proposing dc-GAN, a novel GAN-based demorphing method conditioned on the morph +images. Our method overcomes morph-replication and produces high quality +reconstructions of the bonafide images used to create the morphs. Moreover, our +method is highly generalizable across demorphing paradigms +(differential/reference-free). We conduct experiments on AMSL, FRLL-Morphs and +MorDiff datasets to showcase the efficacy of our method. + +
+
+
+
+
+ + ♻ ☆ Tomographic SAR Reconstruction for Forest Height Estimation + + +
+ Tree height estimation serves as an important proxy for biomass estimation in +ecological and forestry applications. While traditional methods such as +photogrammetry and Light Detection and Ranging (LiDAR) offer accurate height +measurements, their application on a global scale is often cost-prohibitive and +logistically challenging. In contrast, remote sensing techniques, particularly +3D tomographic reconstruction from Synthetic Aperture Radar (SAR) imagery, +provide a scalable solution for global height estimation. SAR images have been +used in earth observation contexts due to their ability to work in all +weathers, unobscured by clouds. In this study, we use deep learning to estimate +forest canopy height directly from 2D Single Look Complex (SLC) images, a +derivative of SAR. Our method attempts to bypass traditional tomographic signal +processing, potentially reducing latency from SAR capture to end product. We +also quantify the impact of varying numbers of SLC images on height estimation +accuracy, aiming to inform future satellite operations and optimize data +collection strategies. Compared to full tomographic processing combined with +deep learning, our minimal method (partial processing + deep learning) falls +short, with an error 16-21\% higher, highlighting the continuing relevance of +geometric signal processing. + +
+
+
+
+
+ + ♻ ☆ Comparative Analysis of Resource-Efficient CNN Architectures for Brain + Tumor Classification + + +
+ Accurate brain tumor classification in MRI images is critical for timely
+diagnosis and treatment planning. While deep learning models like ResNet-18,
+VGG-16 have shown high accuracy, they often come with increased complexity and
+computational demands. This study presents a comparative analysis of effective
+yet simple Convolutional Neural Network (CNN) architecture and pre-trained
+ResNet18, and VGG16 model for brain tumor classification using two publicly
+available datasets: Br35H:: Brain Tumor Detection 2020 and Brain Tumor MRI
+Dataset. The custom CNN architecture, despite its lower complexity,
+demonstrates competitive performance with the pre-trained ResNet18 and VGG16
+models. In binary classification tasks, the custom CNN achieved an accuracy of
+98.67% on the Br35H dataset and 99.62% on the Brain Tumor MRI Dataset. For
+multi-class classification, the custom CNN, with a slight architectural
+modification, achieved an accuracy of 98.09% on the Brain Tumor MRI Dataset.
+Comparatively, ResNet18 and VGG16 maintained high performance levels, but the
+custom CNNs provided a more computationally efficient alternative.
+Additionally, the custom CNNs were evaluated using few-shot learning (0, 5, 10,
+15, 20, 40, and 80 shots) to assess their robustness, achieving notable
+accuracy improvements with increased shots. This study highlights the potential
+of well-designed, less complex CNN architectures as effective and
+computationally efficient alternatives to deeper, pre-trained models for
+medical imaging tasks, including brain tumor classification. This study
+underscores the potential of custom CNNs in medical imaging tasks and
+encourages further exploration in this direction.
+
+
+
+ comment: A revised and extended version of this paper has been accepted at the + 27th International Conference on Computer and Information Technology (ICCIT + 2024). It spans 8 pages and includes 6 figures +
+
+
+
+
+ + ♻ ☆ Grid-augmented vision: A simple yet effective approach for enhanced + spatial understanding in multi-modal agents + + +
+ Recent advances in multimodal models have demonstrated impressive +capabilities in object recognition and scene understanding. However, these +models often struggle with precise spatial localization - a critical capability +for real-world applications. Inspired by how humans use grid-based references +like chess boards and maps, we propose introducing explicit visual position +encoding through a simple grid overlay approach. By adding a 9x9 black grid +pattern onto input images, our method provides visual spatial guidance +analogous to how positional encoding works in transformers, but in an explicit, +visual form. + Experiments on the COCO 2017 dataset demonstrate that our grid-based approach +achieves significant improvements in localization accuracy, with a 107.4% +increase in IoU (from 0.27 to 0.56) and a 194.4% improvement in GIoU (from 0.18 +to 0.53) compared to baseline performance. Through attention visualization +analysis, we show how this visual position encoding helps models better ground +spatial relationships. Our method's simplicity and effectiveness make it +particularly valuable for applications requiring accurate spatial reasoning, +such as robotic manipulation, medical imaging, and autonomous navigation. + +
+
+ comment: 14 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ Dynamic-LLaVA: Efficient Multimodal Large Language Models via Dynamic + Vision-language Context Sparsification + + +
+ Multimodal Large Language Models (MLLMs) have achieved remarkable success in +vision understanding, reasoning, and interaction. However, the inference +computation and memory increase progressively with the generation of output +tokens during decoding, directly affecting the efficacy of MLLMs. Existing +methods attempt to reduce the vision context redundancy to achieve efficient +MLLMs. Unfortunately, the efficiency benefits of the vision context reduction +in the prefill stage gradually diminish during the decoding stage. To address +this problem, we proposed a dynamic vision-language context sparsification +framework Dynamic-LLaVA, which dynamically reduces the redundancy of vision +context in the prefill stage and decreases the memory and computation overhead +of the generated language context during decoding. Dynamic-LLaVA designs a +tailored sparsification inference scheme for different inference modes, i.e., +prefill, decoding with and without KV cache, to achieve efficient inference of +MLLMs. In practice, Dynamic-LLaVA can reduce computation consumption by +$\sim$75\% in the prefill stage. Meanwhile, throughout the entire generation +process of MLLMs, Dynamic-LLaVA reduces the $\sim$50\% computation consumption +under decoding without KV cache, while saving $\sim$50\% GPU memory overhead +when decoding with KV cache, due to the vision-language context sparsification. +Extensive experiments also demonstrate that Dynamic-LLaVA achieves efficient +inference for MLLMs with negligible understanding and generation ability +degradation or even performance gains compared to the full-context inference +baselines. Code is available at https://github.com/Osilly/dynamic_llava . + +
+
+ comment: Code is available at https://github.com/Osilly/dynamic_llava +
+
+
+
+
+ + ♻ ☆ Collaborative Feature-Logits Contrastive Learning for Open-Set + Semi-Supervised Object Detection + + +
+ Current Semi-Supervised Object Detection (SSOD) methods enhance detector +performance by leveraging large amounts of unlabeled data, assuming that both +labeled and unlabeled data share the same label space. However, in open-set +scenarios, the unlabeled dataset contains both in-distribution (ID) classes and +out-of-distribution (OOD) classes. Applying semi-supervised detectors in such +settings can lead to misclassifying OOD class as ID classes. To alleviate this +issue, we propose a simple yet effective method, termed Collaborative +Feature-Logits Detector (CFL-Detector). Specifically, we introduce a +feature-level clustering method using contrastive loss to clarify vector +boundaries in the feature space and highlight class differences. Additionally, +by optimizing the logits-level uncertainty classification loss, the model +enhances its ability to effectively distinguish between ID and OOD classes. +Extensive experiments demonstrate that our method achieves state-of-the-art +performance compared to existing methods. + +
+
+
+
+
+ + ♻ ☆ Multi-Class Abnormality Classification Task in Video Capsule Endoscopy + + +
+ In this work for Capsule Vision Challenge 2024, we addressed the challenge of
+multiclass anomaly classification in video capsule Endoscopy (VCE)[1] with a
+variety of deep learning models, ranging from custom CNNs to advanced
+transformer architectures. The purpose is to correctly classify diverse
+gastrointestinal disorders, which is critical for increasing diagnostic
+efficiency in clinical settings. We started with a baseline CNN model and
+improved performance with ResNet[2] for better feature extraction, followed by
+Vision Transformer (ViT)[3] to capture global dependencies. We further improve
+the results by using Multiscale Vision Transformer (MViT)[4] for improved
+hierarchical feature extraction, while Dual Attention Vision Transformer
+(DaViT) [5] delivered best results by combining spatial and channel attention
+methods. Our best balanced accuracy on validation set [6] was 0.8592 and Mean
+AUC was 0.9932. This methodology enabled us to improve model accuracy across a
+wide range of criteria, greatly surpassing all other methods. Additionally, our
+team capsule commandos achieved 7th place ranking with a test set[7]
+performance of Mean AUC: 0.7314 and balanced accuracy: 0.3235
+
+
+
+ comment: Submission for Video Capsule Endoscopy Challenge +
+
+
+
+
+ + ♻ ☆ DPE-Net: Dual-Parallel Encoder Based Network for Semantic Segmentation + of Polyps + + +
+ In medical imaging, efficient segmentation of colon polyps plays a pivotal +role in minimally invasive solutions for colorectal cancer. This study +introduces a novel approach employing two parallel encoder branches within a +network for polyp segmentation. One branch of the encoder incorporates the dual +convolution blocks that have the capability to maintain feature information +over increased depths, and the other block embraces the single convolution +block with the addition of the previous layer's feature, offering diversity in +feature extraction within the encoder, combining them before transpose layers +with a depth-wise concatenation operation. Our model demonstrated superior +performance, surpassing several established deep-learning architectures on the +Kvasir and CVC-ClinicDB datasets, achieved a Dice score of 0.919, a mIoU of +0.866 for the Kvasir dataset, and a Dice score of 0.931 and a mIoU of 0.891 for +the CVC-ClinicDB. The visual and quantitative results highlight the efficacy of +our model, potentially setting a new model in medical image segmentation. + +
+
+
+
+
+ + ♻ ☆ HoloDrive: Holistic 2D-3D Multi-Modal Street Scene Generation for + Autonomous Driving + + +
+ Generative models have significantly improved the generation and prediction +quality on either camera images or LiDAR point clouds for autonomous driving. +However, a real-world autonomous driving system uses multiple kinds of input +modality, usually cameras and LiDARs, where they contain complementary +information for generation, while existing generation methods ignore this +crucial feature, resulting in the generated results only covering separate 2D +or 3D information. In order to fill the gap in 2D-3D multi-modal joint +generation for autonomous driving, in this paper, we propose our framework, +\emph{HoloDrive}, to jointly generate the camera images and LiDAR point clouds. +We employ BEV-to-Camera and Camera-to-BEV transform modules between +heterogeneous generative models, and introduce a depth prediction branch in the +2D generative model to disambiguate the un-projecting from image space to BEV +space, then extend the method to predict the future by adding temporal +structure and carefully designed progressive training. Further, we conduct +experiments on single frame generation and world model benchmarks, and +demonstrate our method leads to significant performance gains over SOTA methods +in terms of generation metrics. + +
+
+
+
+
+ + ♻ ☆ Spiking GS: Towards High-Accuracy and Low-Cost Surface Reconstruction + via Spiking Neuron-based Gaussian Splatting + + +
+ 3D Gaussian Splatting is capable of reconstructing 3D scenes in minutes. +Despite recent advances in improving surface reconstruction accuracy, the +reconstructed results still exhibit bias and suffer from inefficiency in +storage and training. This paper provides a different observation on the cause +of the inefficiency and the reconstruction bias, which is attributed to the +integration of the low-opacity parts (LOPs) of the generated Gaussians. We show +that LOPs consist of Gaussians with overall low-opacity (LOGs) and the +low-opacity tails (LOTs) of Gaussians. We propose Spiking GS to reduce such two +types of LOPs by integrating spiking neurons into the Gaussian Splatting +pipeline. Specifically, we introduce global and local full-precision +integrate-and-fire spiking neurons to the opacity and representation function +of flattened 3D Gaussians, respectively. Furthermore, we enhance the density +control strategy with spiking neurons' thresholds and a new criterion on the +scale of Gaussians. Our method can represent more accurate reconstructed +surfaces at a lower cost. The supplementary material and code are available at +https://github.com/zju-bmi-lab/SpikingGS. + +
+
+
+
+
+ + ♻ ☆ PolarBEVDet: Exploring Polar Representation for Multi-View 3D Object + Detection in Bird's-Eye-View + + +
+ Recently, LSS-based multi-view 3D object detection provides an economical and
+deployment-friendly solution for autonomous driving. However, all the existing
+LSS-based methods transform multi-view image features into a Cartesian
+Bird's-Eye-View(BEV) representation, which does not take into account the
+non-uniform image information distribution and hardly exploits the view
+symmetry. In this paper, in order to adapt the image information distribution
+and preserve the view symmetry by regular convolution, we propose to employ the
+polar BEV representation to substitute the Cartesian BEV representation. To
+achieve this, we elaborately tailor three modules: a polar view transformer to
+generate the polar BEV representation, a polar temporal fusion module for
+fusing historical polar BEV features and a polar detection head to predict the
+polar-parameterized representation of the object. In addition, we design a 2D
+auxiliary detection head and a spatial attention enhancement module to improve
+the quality of feature extraction in perspective view and BEV, respectively.
+Finally, we integrate the above improvements into a novel multi-view 3D object
+detector, PolarBEVDet. Experiments on nuScenes show that PolarBEVDet achieves
+superior performance. The code is available at
+https://github.com/Yzichen/PolarBEVDet.git.
+
+
+
+ comment: 11 pages, 6 figures. This work has been submitted to the IEEE for + possible publication +
+
+
+
+
+ + ♻ ☆ SegNet4D: Efficient Instance-Aware 4D Semantic Segmentation for LiDAR + Point Cloud + + +
+ 4D LiDAR semantic segmentation, also referred to as multi-scan semantic +segmentation, plays a crucial role in enhancing the environmental understanding +capabilities of autonomous vehicles or robots. It classifies the semantic +category of each LiDAR measurement point and detects whether it is dynamic, a +critical ability for tasks like obstacle avoidance and autonomous navigation. +Existing approaches often rely on computationally heavy 4D convolutions or +recursive networks, which result in poor real-time performance, making them +unsuitable for online robotics and autonomous driving applications. In this +paper, we introduce SegNet4D, a novel real-time 4D semantic segmentation +network offering both efficiency and strong semantic understanding. SegNet4D +addresses 4D segmentation as two tasks: single-scan semantic segmentation and +moving object segmentation, each tackled by a separate network head. Both +results are combined in a motion-semantic fusion module to achieve +comprehensive 4D segmentation. Additionally, instance information is extracted +from the current scan and exploited for instance-wise segmentation consistency. +Our approach surpasses state-of-the-art in both multi-scan semantic +segmentation and moving object segmentation while offering greater efficiency, +enabling real-time operation. Besides, its effectiveness and efficiency have +also been validated on a real-world unmanned ground platform. Our code will be +released at https://github.com/nubot-nudt/SegNet4D. + +
+
+ comment: 10 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Facial Expression Recognition with Controlled Privacy Preservation and + Feature Compensation WACV2025 + + +
+ Facial expression recognition (FER) systems raise significant privacy +concerns due to the potential exposure of sensitive identity information. This +paper presents a study on removing identity information while preserving FER +capabilities. Drawing on the observation that low-frequency components +predominantly contain identity information and high-frequency components +capture expression, we propose a novel two-stream framework that applies +privacy enhancement to each component separately. We introduce a controlled +privacy enhancement mechanism to optimize performance and a feature compensator +to enhance task-relevant features without compromising privacy. Furthermore, we +propose a novel privacy-utility trade-off, providing a quantifiable measure of +privacy preservation efficacy in closed-set FER tasks. Extensive experiments on +the benchmark CREMA-D dataset demonstrate that our framework achieves 78.84% +recognition accuracy with a privacy (facial identity) leakage ratio of only +2.01%, highlighting its potential for secure and reliable video-based FER +applications. + +
+
+ comment: WACV2025 accepted +
+
+
+
+
+ + ♻ ☆ Efficient Concertormer for Image Deblurring and Beyond + + +
+ The Transformer architecture has achieved remarkable success in natural +language processing and high-level vision tasks over the past few years. +However, the inherent complexity of self-attention is quadratic to the size of +the image, leading to unaffordable computational costs for high-resolution +vision tasks. In this paper, we introduce Concertormer, featuring a novel +Concerto Self-Attention (CSA) mechanism designed for image deblurring. The +proposed CSA divides self-attention into two distinct components: one +emphasizes generally global and another concentrates on specifically local +correspondence. By retaining partial information in additional dimensions +independent from the self-attention calculations, our method effectively +captures global contextual representations with complexity linear to the image +size. To effectively leverage the additional dimensions, we present a +Cross-Dimensional Communication module, which linearly combines attention maps +and thus enhances expressiveness. Moreover, we amalgamate the two-staged +Transformer design into a single stage using the proposed gated-dconv MLP +architecture. While our primary objective is single-image motion deblurring, +extensive quantitative and qualitative evaluations demonstrate that our +approach performs favorably against the state-of-the-art methods in other +tasks, such as deraining and deblurring with JPEG artifacts. The source codes +and trained models will be made available to the public. + +
+
+
+
+
+ + ♻ ☆ Enhancing joint automatic chest X-ray diagnosis and clinical visual + attention prediction with multi-stage cooperative learning + + +
+ Purpose: As visual inspection is an inherent process during radiological +screening, the associated eye gaze data can provide valuable insights into +relevant clinical decisions. As deep learning has become the state-of-the-art +for computer-assisted diagnosis, integrating human behavior, such as eye gaze +data, into these systems is instrumental to help align machine predictions with +clinical diagnostic criteria, thus enhancing the quality of automatic +radiological diagnosis. Methods: We propose a novel deep learning framework for +joint disease diagnosis and prediction of corresponding clinical visual +attention maps for chest X-ray scans. Specifically, we introduce a new +dual-encoder multi-task UNet, which leverages both a DenseNet201 backbone and a +Residual and Squeeze-and-Excitation block-based encoder to extract diverse +features for visual attention map prediction, and a multi-scale feature-fusion +classifier to perform disease classification. To tackle the issue of +asynchronous training schedules of individual tasks in multi-task learning, we +proposed a multi-stage cooperative learning strategy, with contrastive learning +for feature encoder pretraining to boost performance. Results: Our proposed +method is shown to significantly outperform existing techniques for chest X-ray +diagnosis (AUC=0.93) and the quality of visual attention map prediction +(Correlation coefficient=0.58). Conclusion: Benefiting from the proposed +multi-task multi-stage cooperative learning, our technique demonstrates the +benefit of integrating clinicians' eye gaze into clinical AI systems to boost +performance and potentially explainability. + +
+
+
+
+
+ + ♻ ☆ TFS-NeRF: Template-Free NeRF for Semantic 3D Reconstruction of Dynamic + Scene NeurIPS 2024 + + +
+ Despite advancements in Neural Implicit models for 3D surface reconstruction, +handling dynamic environments with interactions between arbitrary rigid, +non-rigid, or deformable entities remains challenging. The generic +reconstruction methods adaptable to such dynamic scenes often require +additional inputs like depth or optical flow or rely on pre-trained image +features for reasonable outcomes. These methods typically use latent codes to +capture frame-by-frame deformations. Another set of dynamic scene +reconstruction methods, are entity-specific, mostly focusing on humans, and +relies on template models. In contrast, some template-free methods bypass these +requirements and adopt traditional LBS (Linear Blend Skinning) weights for a +detailed representation of deformable object motions, although they involve +complex optimizations leading to lengthy training times. To this end, as a +remedy, this paper introduces TFS-NeRF, a template-free 3D semantic NeRF for +dynamic scenes captured from sparse or single-view RGB videos, featuring +interactions among two entities and more time-efficient than other LBS-based +approaches. Our framework uses an Invertible Neural Network (INN) for LBS +prediction, simplifying the training process. By disentangling the motions of +interacting entities and optimizing per-entity skinning weights, our method +efficiently generates accurate, semantically separable geometries. Extensive +experiments demonstrate that our approach produces high-quality reconstructions +of both deformable and non-deformable objects in complex interactions, with +improved training efficiency compared to existing methods. + +
+
+ comment: Accepted in NeurIPS 2024 https://github.com/sbsws88/TFS-NeRF +
+
+
+
+
+ + ♻ ☆ SceneFactor: Factored Latent 3D Diffusion for Controllable 3D Scene + Generation + + +
+ We present SceneFactor, a diffusion-based approach for large-scale 3D scene +generation that enables controllable generation and effortless editing. +SceneFactor enables text-guided 3D scene synthesis through our factored +diffusion formulation, leveraging latent semantic and geometric manifolds for +generation of arbitrary-sized 3D scenes. While text input enables easy, +controllable generation, text guidance remains imprecise for intuitive, +localized editing and manipulation of the generated 3D scenes. Our factored +semantic diffusion generates a proxy semantic space composed of semantic 3D +boxes that enables controllable editing of generated scenes by adding, +removing, changing the size of the semantic 3D proxy boxes that guides +high-fidelity, consistent 3D geometric editing. Extensive experiments +demonstrate that our approach enables high-fidelity 3D scene synthesis with +effective controllable editing through our factored diffusion approach. + +
+
+ comment: 21 pages, 12 figures; https://alexeybokhovkin.github.io/scenefactor/ +
+
+
+
+
+ + ♻ ☆ Enabling DBSCAN for Very Large-Scale High-Dimensional Spaces + + +
+ DBSCAN is one of the most important non-parametric unsupervised data analysis +tools. By applying DBSCAN to a dataset, two key analytical results can be +obtained: (1) clustering data points based on density distribution and (2) +identifying outliers in the dataset. However, the time complexity of the DBSCAN +algorithm is $O(n^2 \beta)$, where $n$ is the number of data points and $\beta += O(D)$, with $D$ representing the dimensionality of the data space. As a +result, DBSCAN becomes computationally infeasible when both $n$ and $D$ are +large. In this paper, we propose a DBSCAN method based on spectral data +compression, capable of efficiently processing datasets with a large number of +data points ($n$) and high dimensionality ($D$). By preserving only the most +critical structural information during the compression process, our method +effectively removes substantial redundancy and noise. Consequently, the +solution quality of DBSCAN is significantly improved, enabling more accurate +and reliable results. + +
+
+
+
+
+ + ♻ ☆ A Good Foundation is Worth Many Labels: Label-Efficient Panoptic + Segmentation + + +
+ A key challenge for the widespread application of learning-based models for +robotic perception is to significantly reduce the required amount of annotated +training data while achieving accurate predictions. This is essential not only +to decrease operating costs but also to speed up deployment time. In this work, +we address this challenge for PAnoptic SegmenTation with fEw Labels (PASTEL) by +exploiting the groundwork paved by visual foundation models. We leverage +descriptive image features from such a model to train two lightweight network +heads for semantic segmentation and object boundary detection, using very few +annotated training samples. We then merge their predictions via a novel fusion +module that yields panoptic maps based on normalized cut. To further enhance +the performance, we utilize self-training on unlabeled images selected by a +feature-driven similarity scheme. We underline the relevance of our approach by +employing PASTEL to important robot perception use cases from autonomous +driving and agricultural robotics. In extensive experiments, we demonstrate +that PASTEL significantly outperforms previous methods for label-efficient +segmentation even when using fewer annotations. The code of our work is +publicly available at http://pastel.cs.uni-freiburg.de. + +
+
+
+
+
+ + ♻ ☆ Monocular Lane Detection Based on Deep Learning: A Survey + + +
+ Lane detection plays an important role in autonomous driving perception +systems. As deep learning algorithms gain popularity, monocular lane detection +methods based on them have demonstrated superior performance and emerged as a +key research direction in autonomous driving perception. The core designs of +these algorithmic frameworks can be summarized as follows: (1) Task paradigm, +focusing on lane instance-level discrimination; (2) Lane modeling, representing +lanes as a set of learnable parameters in the neural network; (3) Global +context supplementation, enhancing inference on the obscure lanes; (4) +Perspective effect elimination, providing accurate 3D lanes for downstream +applications. From these perspectives, this paper presents a comprehensive +overview of existing methods, encompassing both the increasingly mature 2D lane +detection approaches and the developing 3D lane detection works. Besides, this +paper compares the performance of mainstream methods on different benchmarks +and investigates their inference speed under a unified setting for fair +comparison. Moreover, we present some extended works on lane detection, +including multi-task perception, video lane detection, online high-definition +map construction, and lane topology reasoning, to offer readers a comprehensive +roadmap for the evolution of lane detection. Finally, we point out some +potential future research directions in this field. We exhaustively collect the +papers and codes of existing works at +https://github.com/Core9724/Awesome-Lane-Detection and will keep tracing the +research. + +
+
+
+
+
+ + ♻ ☆ KP-RED: Exploiting Semantic Keypoints for Joint 3D Shape Retrieval and + Deformation CVPR 2024 + + +
+ In this paper, we present KP-RED, a unified KeyPoint-driven REtrieval and +Deformation framework that takes object scans as input and jointly retrieves +and deforms the most geometrically similar CAD models from a pre-processed +database to tightly match the target. Unlike existing dense matching based +methods that typically struggle with noisy partial scans, we propose to +leverage category-consistent sparse keypoints to naturally handle both full and +partial object scans. Specifically, we first employ a lightweight retrieval +module to establish a keypoint-based embedding space, measuring the similarity +among objects by dynamically aggregating deformation-aware local-global +features around extracted keypoints. Objects that are close in the embedding +space are considered similar in geometry. Then we introduce the neural +cage-based deformation module that estimates the influence vector of each +keypoint upon cage vertices inside its local support region to control the +deformation of the retrieved shape. Extensive experiments on the synthetic +dataset PartNet and the real-world dataset Scan2CAD demonstrate that KP-RED +surpasses existing state-of-the-art approaches by a large margin. Codes and +trained models are released on https://github.com/lolrudy/KP-RED. + +
+
+ comment: Accepted by CVPR 2024. We identified an error in our baseline + experiments, re-ran them, and updated the results without impacting the + paper's conclusions. We apologize for the oversight and appreciate your + understanding +
+
+
+
+
+ + ♻ ☆ ASTM :Autonomous Smart Traffic Management System Using Artificial + Intelligence CNN and LSTM + + +
+ In the modern world, the development of Artificial Intelligence (AI) has +contributed to improvements in various areas, including automation, computer +vision, fraud detection, and more. AI can be leveraged to enhance the +efficiency of Autonomous Smart Traffic Management (ASTM) systems and reduce +traffic congestion rates. This paper presents an Autonomous Smart Traffic +Management (STM) system that uses AI to improve traffic flow rates. The system +employs the YOLO V5 Convolutional Neural Network to detect vehicles in traffic +management images. Additionally, it predicts the number of vehicles for the +next 12 hours using a Recurrent Neural Network with Long Short-Term Memory +(RNN-LSTM). The Smart Traffic Management Cycle Length Analysis manages the +traffic cycle length based on these vehicle predictions, aided by AI. From the +results of the RNN-LSTM model for predicting vehicle numbers over the next 12 +hours, we observe that the model predicts traffic with a Mean Squared Error +(MSE) of 4.521 vehicles and a Root Mean Squared Error (RMSE) of 2.232 vehicles. +After simulating the STM system in the CARLA simulation environment, we found +that the Traffic Management Congestion Flow Rate with ASTM (21 vehicles per +minute) is 50\% higher than the rate without STM (around 15 vehicles per +minute). Additionally, the Traffic Management Vehicle Pass Delay with STM (5 +seconds per vehicle) is 70\% lower than without STM (around 12 seconds per +vehicle). These results demonstrate that the STM system using AI can increase +traffic flow by 50\% and reduce vehicle pass delays by 70\%. + +
+
+ comment: In process to IEEE Intelligent Vehicle Symposium 2025 +
+
+
+
+
+ + ♻ ☆ OpenHumanVid: A Large-Scale High-Quality Dataset for Enhancing + Human-Centric Video Generation + + +
+ Recent advancements in visual generation technologies have markedly increased +the scale and availability of video datasets, which are crucial for training +effective video generation models. However, a significant lack of high-quality, +human-centric video datasets presents a challenge to progress in this field. To +bridge this gap, we introduce OpenHumanVid, a large-scale and high-quality +human-centric video dataset characterized by precise and detailed captions that +encompass both human appearance and motion states, along with supplementary +human motion conditions, including skeleton sequences and speech audio. To +validate the efficacy of this dataset and the associated training strategies, +we propose an extension of existing classical diffusion transformer +architectures and conduct further pretraining of our models on the proposed +dataset. Our findings yield two critical insights: First, the incorporation of +a large-scale, high-quality dataset substantially enhances evaluation metrics +for generated human videos while preserving performance in general video +generation tasks. Second, the effective alignment of text with human +appearance, human motion, and facial motion is essential for producing +high-quality video outputs. Based on these insights and corresponding +methodologies, the straightforward extended network trained on the proposed +dataset demonstrates an obvious improvement in the generation of human-centric +videos. Project page https://fudan-generative-vision.github.io/OpenHumanVid + +
+
+ comment: 11 pages, 8 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Learning from Reduced Labels for Long-Tailed Data + + +
+ Long-tailed data is prevalent in real-world classification tasks and heavily
+relies on supervised information, which makes the annotation process
+exceptionally labor-intensive and time-consuming. Unfortunately, despite being
+a common approach to mitigate labeling costs, existing weakly supervised
+learning methods struggle to adequately preserve supervised information for
+tail samples, resulting in a decline in accuracy for the tail classes. To
+alleviate this problem, we introduce a novel weakly supervised labeling setting
+called Reduced Label. The proposed labeling setting not only avoids the decline
+of supervised information for the tail samples, but also decreases the labeling
+costs associated with long-tailed data. Additionally, we propose a
+straightforward and highly efficient unbiased framework with strong theoretical
+guarantees to learn from these Reduced Labels. Extensive experiments conducted
+on benchmark datasets including ImageNet validate the effectiveness of our
+approach, surpassing the performance of state-of-the-art weakly supervised
+methods.
+
+
+
+ comment: 11 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ PriorPath: Coarse-To-Fine Approach for Controlled De-Novo Pathology + Semantic Masks Generation + + +
+ Incorporating artificial intelligence (AI) into digital pathology offers +promising prospects for automating and enhancing tasks such as image analysis +and diagnostic processes. However, the diversity of tissue samples and the +necessity for meticulous image labeling often result in biased datasets, +constraining the applicability of algorithms trained on them. To harness +synthetic histopathological images to cope with this challenge, it is essential +not only to produce photorealistic images but also to be able to exert control +over the cellular characteristics they depict. Previous studies used methods to +generate, from random noise, semantic masks that captured the spatial +distribution of the tissue. These masks were then used as a prior for +conditional generative approaches to produce photorealistic histopathological +images. However, as with many other generative models, this solution exhibits +mode collapse as the model fails to capture the full diversity of the +underlying data distribution. In this work, we present a pipeline, coined +PriorPath, that generates detailed, realistic, semantic masks derived from +coarse-grained images delineating tissue regions. This approach enables control +over the spatial arrangement of the generated masks and, consequently, the +resulting synthetic images. We demonstrated the efficacy of our method across +three cancer types, skin, prostate, and lung, showcasing PriorPath's capability +to cover the semantic mask space and to provide better similarity to real masks +compared to previous methods. Our approach allows for specifying desired tissue +distributions and obtaining both photorealistic masks and images within a +single platform, thus providing a state-of-the-art, controllable solution for +generating histopathological images to facilitate AI for computational +pathology. + +
+
+
+
+
+ + ♻ ☆ Take Your Steps: Hierarchically Efficient Pulmonary Disease Screening + via CT Volume Compression + + +
+ Deep learning models are widely used to process Computed Tomography (CT) data +in the automated screening of pulmonary diseases, significantly reducing the +workload of physicians. However, the three-dimensional nature of CT volumes +involves an excessive number of voxels, which significantly increases the +complexity of model processing. Previous screening approaches often overlook +this issue, which undoubtedly reduces screening efficiency. Towards efficient +and effective screening, we design a hierarchical approach to reduce the +computational cost of pulmonary disease screening. The new approach +re-organizes the screening workflows into three steps. First, we propose a +Computed Tomography Volume Compression (CTVC) method to select a small slice +subset that comprehensively represents the whole CT volume. Second, the +selected CT slices are used to detect pulmonary diseases coarsely via a +lightweight classification model. Third, an uncertainty measurement strategy is +applied to identify samples with low diagnostic confidence, which are +re-detected by radiologists. Experiments on two public pulmonary disease +datasets demonstrate that our approach achieves comparable accuracy and recall +while reducing the time by 50%-70% compared with the counterparts using full CT +volumes. Besides, we also found that our approach outperforms previous +cutting-edge CTVC methods in retaining important indications after compression. + +
+
+ comment: Under Review +
+
+
+
+
+ + ♻ ☆ SpaGBOL: Spatial-Graph-Based Orientated Localisation + + +
+ Cross-View Geo-Localisation within urban regions is challenging in part due +to the lack of geo-spatial structuring within current datasets and techniques. +We propose utilising graph representations to model sequences of local +observations and the connectivity of the target location. Modelling as a graph +enables generating previously unseen sequences by sampling with new parameter +configurations. To leverage this newly available information, we propose a +GNN-based architecture, producing spatially strong embeddings and improving +discriminability over isolated image embeddings. We outline SpaGBOL, +introducing three novel contributions. 1) The first graph-structured dataset +for Cross-View Geo-Localisation, containing multiple streetview images per node +to improve generalisation. 2) Introducing GNNs to the problem, we develop the +first system that exploits the correlation between node proximity and feature +similarity. 3) Leveraging the unique properties of the graph representation - +we demonstrate a novel retrieval filtering approach based on neighbourhood +bearings. SpaGBOL achieves state-of-the-art accuracies on the unseen test graph +- with relative Top-1 retrieval improvements on previous techniques of 11%, and +50% when filtering with Bearing Vector Matching on the SpaGBOL dataset. + +
+
+
+
+
+ + ♻ ☆ Phase-Informed Tool Segmentation for Manual Small-Incision Cataract + Surgery + + +
+ Cataract surgery is the most common surgical procedure globally, with a +disproportionately higher burden in developing countries. While automated +surgical video analysis has been explored in general surgery, its application +to ophthalmic procedures remains limited. Existing works primarily focus on +Phaco cataract surgery, an expensive technique not accessible in regions where +cataract treatment is most needed. In contrast, Manual Small-Incision Cataract +Surgery (MSICS) is the preferred low-cost, faster alternative in high-volume +settings and for challenging cases. However, no dataset exists for MSICS. To +address this gap, we introduce Sankara-MSICS, the first comprehensive dataset +containing 53 surgical videos annotated for 18 surgical phases and 3,527 frames +with 13 surgical tools at the pixel level. We benchmark this dataset on +state-of-the-art models and present ToolSeg, a novel framework that enhances +tool segmentation by introducing a phase-conditional decoder and a simple yet +effective semi-supervised setup leveraging pseudo-labels from foundation +models. Our approach significantly improves segmentation performance, achieving +a $23.77\%$ to $38.10\%$ increase in mean Dice scores, with a notable boost for +tools that are less prevalent and small. Furthermore, we demonstrate that +ToolSeg generalizes to other surgical settings, showcasing its effectiveness on +the CaDIS dataset. + +
+
+
+
+
+ + ♻ ☆ VISION-XL: High Definition Video Inverse Problem Solver using Latent + Image Diffusion Models + + +
+ In this paper, we propose a novel framework for solving high-definition video +inverse problems using latent image diffusion models. Building on recent +advancements in spatio-temporal optimization for video inverse problems using +image diffusion models, our approach leverages latent-space diffusion models to +achieve enhanced video quality and resolution. To address the high +computational demands of processing high-resolution frames, we introduce a +pseudo-batch consistent sampling strategy, allowing efficient operation on a +single GPU. Additionally, to improve temporal consistency, we present +batch-consistent inversion, an initialization technique that incorporates +informative latents from the measurement frame. By integrating with SDXL, our +framework achieves state-of-the-art video reconstruction across a wide range of +spatio-temporal inverse problems, including complex combinations of frame +averaging and various spatial degradations, such as deblurring, +super-resolution, and inpainting. Unlike previous methods, our approach +supports multiple aspect ratios (landscape, vertical, and square) and delivers +HD-resolution reconstructions (exceeding 1280x720) in under 2.5 minutes on a +single NVIDIA 4090 GPU. + +
+
+ comment: Project page: https://vision-xl.github.io/ +
+
+
+
+
+ + ♻ ☆ Jailbreak Large Vision-Language Models Through Multi-Modal Linkage + + +
+ With the significant advancement of Large Vision-Language Models (VLMs), +concerns about their potential misuse and abuse have grown rapidly. Previous +studies have highlighted VLMs' vulnerability to jailbreak attacks, where +carefully crafted inputs can lead the model to produce content that violates +ethical and legal standards. However, existing methods struggle against +state-of-the-art VLMs like GPT-4o, due to the over-exposure of harmful content +and lack of stealthy malicious guidance. In this work, we propose a novel +jailbreak attack framework: Multi-Modal Linkage (MML) Attack. Drawing +inspiration from cryptography, MML utilizes an encryption-decryption process +across text and image modalities to mitigate over-exposure of malicious +information. To align the model's output with malicious intent covertly, MML +employs a technique called "evil alignment", framing the attack within a video +game production scenario. Comprehensive experiments demonstrate MML's +effectiveness. Specifically, MML jailbreaks GPT-4o with attack success rates of +97.80% on SafeBench, 98.81% on MM-SafeBench and 99.07% on HADES-Dataset. Our +code is available at https://github.com/wangyu-ovo/MML + +
+
+
+
+
+ + ♻ ☆ Embedded Prompt Tuning: Towards Enhanced Calibration of Pretrained + Models for Medical Images + + +
+ Foundation models pre-trained on large-scale data have been widely witnessed +to achieve success in various natural imaging downstream tasks. +Parameter-efficient fine-tuning (PEFT) methods aim to adapt foundation models +to new domains by updating only a small portion of parameters in order to +reduce computational overhead. However, the effectiveness of these PEFT +methods, especially in cross-domain few-shot scenarios, e.g., medical image +analysis, has not been fully explored. In this work, we facilitate the study of +the performance of PEFT when adapting foundation models to medical image +classification tasks. Furthermore, to alleviate the limitations of prompt +introducing ways and approximation capabilities on Transformer architectures of +mainstream prompt tuning methods, we propose the Embedded Prompt Tuning (EPT) +method by embedding prompt tokens into the expanded channels. We also find that +there are anomalies in the feature space distribution of foundation models +during pre-training process, and prompt tuning can help mitigate this negative +impact. To explain this phenomenon, we also introduce a novel perspective to +understand prompt tuning: Prompt tuning is a distribution calibrator. And we +support it by analyzing patch-wise scaling and feature separation operations +contained in EPT. Our experiments show that EPT outperforms several +state-of-the-art fine-tuning methods by a significant margin on few-shot +medical image classification tasks, and completes the fine-tuning process +within highly competitive time, indicating EPT is an effective PEFT method. The +source code is available at github.com/zuwenqiang/EPT. + +
+
+
+
+
+ + ♻ ☆ CamoFA: A Learnable Fourier-based Augmentation for Camouflage + Segmentation WACV 2025 + + +
+ Camouflaged object detection (COD) and camouflaged instance segmentation +(CIS) aim to recognize and segment objects that are blended into their +surroundings, respectively. While several deep neural network models have been +proposed to tackle those tasks, augmentation methods for COD and CIS have not +been thoroughly explored. Augmentation strategies can help improve models' +performance by increasing the size and diversity of the training data and +exposing the model to a wider range of variations in the data. Besides, we aim +to automatically learn transformations that help to reveal the underlying +structure of camouflaged objects and allow the model to learn to better +identify and segment camouflaged objects. To achieve this, we propose a +learnable augmentation method in the frequency domain for COD and CIS via the +Fourier transform approach, dubbed CamoFA. Our method leverages a conditional +generative adversarial network and cross-attention mechanism to generate a +reference image and an adaptive hybrid swapping with parameters to mix the +low-frequency component of the reference image and the high-frequency component +of the input image. This approach aims to make camouflaged objects more visible +for detection and segmentation models. Without bells and whistles, our proposed +augmentation method boosts the performance of camouflaged object detectors and +instance segmenters by large margins. + +
+
+ comment: Accepted to WACV 2025 +
+
+
+
+
+ + ♻ ☆ Multi-Granularity Video Object Segmentation + + +
+ Current benchmarks for video segmentation are limited to annotating only
+salient objects (i.e., foreground instances). Despite their impressive
+architectural designs, previous works trained on these benchmarks have
+struggled to adapt to real-world scenarios. Thus, developing a new video
+segmentation dataset aimed at tracking multi-granularity segmentation targets
+in the video scene is necessary. In this work, we aim to generate a
+multi-granularity video segmentation dataset that is annotated for both salient
+and non-salient masks. To achieve this, we propose a large-scale, densely
+annotated multi-granularity video object segmentation (MUG-VOS) dataset that
+includes various types and granularities of mask annotations. We automatically
+collected a training set that assists in tracking both salient and non-salient
+objects, and we also curated a human-annotated test set for reliable
+evaluation. In addition, we present a memory-based mask propagation model
+(MMPM), trained and evaluated on the MUG-VOS dataset, which leads to the best
+performance among the existing video object segmentation methods and SAM-based
+video segmentation methods. Project page is available at
+https://cvlab-kaist.github.io/MUG-VOS.
+
+
+
+ comment: Project Page: https://cvlab-kaist.github.io/MUG-VOS +
+
+
+
+
+ + ♻ ☆ CFPNet: Improving Lightweight ToF Depth Completion via Cross-zone + Feature Propagation 3DV 2025 + + +
+ Depth completion using lightweight time-of-flight (ToF) depth sensors is +attractive due to their low cost. However, lightweight ToF sensors usually have +a limited field of view (FOV) compared with cameras. Thus, only pixels in the +zone area of the image can be associated with depth signals. Previous methods +fail to propagate depth features from the zone area to the outside-zone area +effectively, thus suffering from degraded depth completion performance outside +the zone. To this end, this paper proposes the CFPNet to achieve cross-zone +feature propagation from the zone area to the outside-zone area with two novel +modules. The first is a direct-attention-based propagation module (DAPM), which +enforces direct cross-zone feature acquisition. The second is a +large-kernel-based propagation module (LKPM), which realizes cross-zone feature +propagation by utilizing convolution layers with kernel sizes up to 31. CFPNet +achieves state-of-the-art (SOTA) depth completion performance by combining +these two modules properly, as verified by extensive experimental results on +the ZJU-L5 dataset. The code is available at +https://github.com/denyingmxd/CFPNet. + +
+
+ comment: Accepted by 3DV 2025 +
+
+
+
+
+ + ♻ ☆ Towards Cross-View-Consistent Self-Supervised Surround Depth Estimation IROS2024 + + +
+ Depth estimation is a cornerstone for autonomous driving, yet acquiring +per-pixel depth ground truth for supervised learning is challenging. +Self-Supervised Surround Depth Estimation (SSSDE) from consecutive images +offers an economical alternative. While previous SSSDE methods have proposed +different mechanisms to fuse information across images, few of them explicitly +consider the cross-view constraints, leading to inferior performance, +particularly in overlapping regions. This paper proposes an efficient and +consistent pose estimation design and two loss functions to enhance cross-view +consistency for SSSDE. For pose estimation, we propose to use only front-view +images to reduce training memory and sustain pose estimation consistency. The +first loss function is the dense depth consistency loss, which penalizes the +difference between predicted depths in overlapping regions. The second one is +the multi-view reconstruction consistency loss, which aims to maintain +consistency between reconstruction from spatial and spatial-temporal contexts. +Additionally, we introduce a novel flipping augmentation to improve the +performance further. Our techniques enable a simple neural model to achieve +state-of-the-art performance on the DDAD and nuScenes datasets. Last but not +least, our proposed techniques can be easily applied to other methods. The code +is available at https://github.com/denyingmxd/CVCDepth. + +
+
+ comment: Accepted by IROS2024 +
+
+
+
+
+ + ♻ ☆ GFreeDet: Exploiting Gaussian Splatting and Foundation Models for + Model-free Unseen Object Detection in the BOP Challenge 2024 + + +
+ In this report, we provide the technical details of the submitted method +GFreeDet, which exploits Gaussian splatting and vision Foundation models for +the model-free unseen object Detection track in the BOP 2024 Challenge. + +
+
+
+
+
+ + ♻ ☆ Concept Replacer: Replacing Sensitive Concepts in Diffusion Models via + Precision Localization + + +
+ As large-scale diffusion models continue to advance, they excel at producing +high-quality images but often generate unwanted content, such as sexually +explicit or violent content. Existing methods for concept removal generally +guide the image generation process but can unintentionally modify unrelated +regions, leading to inconsistencies with the original model. We propose a novel +approach for targeted concept replacing in diffusion models, enabling specific +concepts to be removed without affecting non-target areas. Our method +introduces a dedicated concept localizer for precisely identifying the target +concept during the denoising process, trained with few-shot learning to require +minimal labeled data. Within the identified region, we introduce a +training-free Dual Prompts Cross-Attention (DPCA) module to substitute the +target concept, ensuring minimal disruption to surrounding content. We evaluate +our method on concept localization precision and replacement efficiency. +Experimental results demonstrate that our method achieves superior precision in +localizing target concepts and performs coherent concept replacement with +minimal impact on non-target areas, outperforming existing approaches. + +
+
+
+
+
+ + ♻ ☆ An Empirical Study of Mamba-based Pedestrian Attribute Recognition + + +
+ Current strong pedestrian attribute recognition models are developed based on +Transformer networks, which are computationally heavy. Recently proposed models +with linear complexity (e.g., Mamba) have garnered significant attention and +have achieved a good balance between accuracy and computational cost across a +variety of visual tasks. Relevant review articles also suggest that while these +models can perform well on some pedestrian attribute recognition datasets, they +are generally weaker than the corresponding Transformer models. To further tap +into the potential of the novel Mamba architecture for PAR tasks, this paper +designs and adapts Mamba into two typical PAR frameworks, i.e., the text-image +fusion approach and pure vision Mamba multi-label recognition framework. It is +found that interacting with attribute tags as additional input does not always +lead to an improvement, specifically, Vim can be enhanced, but VMamba cannot. +This paper further designs various hybrid Mamba-Transformer variants and +conducts thorough experimental validations. These experimental results indicate +that simply enhancing Mamba with a Transformer does not always lead to +performance improvements but yields better results under certain settings. We +hope this empirical study can further inspire research in Mamba for PAR, and +even extend into the domain of multi-label recognition, through the design of +these network structures and comprehensive experimentation. The source code of +this work will be released at \url{https://github.com/Event-AHU/OpenPAR} + +
+
+ comment: In Peer Review +
+
+
+
+
+ + ♻ ☆ PassionSR: Post-Training Quantization with Adaptive Scale in One-Step + Diffusion based Image Super-Resolution + + +
+ Diffusion-based image super-resolution (SR) models have shown superior
+performance at the cost of multiple denoising steps. However, even though the
+denoising step has been reduced to one, they require high computational costs
+and storage requirements, making it difficult for deployment on hardware
+devices. To address these issues, we propose a novel post-training quantization
+approach with adaptive scale in one-step diffusion (OSD) image SR, PassionSR.
+First, we simplify the OSD model to two core components, UNet and Variational
+Autoencoder (VAE) by removing the CLIPEncoder. Secondly, we propose Learnable
+Boundary Quantizer (LBQ) and Learnable Equivalent Transformation (LET) to
+optimize the quantization process and manipulate activation distributions for
+better quantization. Finally, we design a Distributed Quantization Calibration
+(DQC) strategy that stabilizes the training of quantized parameters for rapid
+convergence. Comprehensive experiments demonstrate that PassionSR with 8-bit
+and 6-bit obtains comparable visual results with the full-precision model.
+Moreover, our PassionSR achieves significant advantages over recent leading
+low-bit quantization methods for image SR. Our code will be at
+https://github.com/libozhu03/PassionSR.
+

+
+ comment: https://github.com/libozhu03/PassionSR +
+
+
+
+
+ + ♻ ☆ Adaptive Rank, Reduced Forgetting: Knowledge Retention in Continual + Learning Vision-Language Models with Dynamic Rank-Selective LoRA + + +
+ We investigate whether the pre-trained knowledge of vision-language models +(VLMs), such as CLIP, can be retained or even enhanced during continual +learning (CL) while absorbing knowledge from a data stream. Existing methods +often rely on additional reference data, isolated components for distribution +or domain predictions, leading to high training costs, increased inference +complexity, and limited improvement potential for pre-trained models. To +address these challenges, we first comprehensively analyze the effects of +parameter update locations and ranks on downstream adaptation and knowledge +retention. Based on these insights, we propose Dynamic Rank-Selective Low Rank +Adaptation (LoRA), a universal and efficient CL approach that adaptively +assigns ranks to LoRA modules based on their relevance to the current data. +Unlike prior methods, our approach continually enhances the pre-trained VLM by +retaining both the pre-trained knowledge and the knowledge acquired during CL. +Our approach eliminates the need for explicit domain or distribution prediction +and additional reference data, enabling seamless integration of new tasks while +preserving pre-trained capabilities. It also maintains the original +architecture and deployment pipeline of the pre-trained model without incurring +any additional inference overhead. Extensive experiments and analyses +demonstrate that our method outperforms state-of-the-art approaches in +continually absorbing knowledge of downstream tasks while retaining pre-trained +knowledge. + +
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ DyMO: Training-Free Diffusion Model Alignment with Dynamic + Multi-Objective Scheduling + + +
+ Text-to-image diffusion model alignment is critical for improving the +alignment between the generated images and human preferences. While +training-based methods are constrained by high computational costs and dataset +requirements, training-free alignment methods remain underexplored and are +often limited by inaccurate guidance. We propose a plug-and-play training-free +alignment method, DyMO, for aligning the generated images and human preferences +during inference. Apart from text-aware human preference scores, we introduce a +semantic alignment objective for enhancing the semantic alignment in the early +stages of diffusion, relying on the fact that the attention maps are effective +reflections of the semantics in noisy images. We propose dynamic scheduling of +multiple objectives and intermediate recurrent steps to reflect the +requirements at different steps. Experiments with diverse pre-trained diffusion +models and metrics demonstrate the effectiveness and robustness of the proposed +method. + +
+
+
+
+
+ + ♻ ☆ Exploring Frequency-Inspired Optimization in Transformer for Efficient + Single Image Super-Resolution + + +
+ Transformer-based methods have exhibited remarkable potential in single image +super-resolution (SISR) by effectively extracting long-range dependencies. +However, most of the current research in this area has prioritized the design +of transformer blocks to capture global information, while overlooking the +importance of incorporating high-frequency priors, which we believe could be +beneficial. In our study, we conducted a series of experiments and found that +transformer structures are more adept at capturing low-frequency information, +but have limited capacity in constructing high-frequency representations when +compared to their convolutional counterparts. Our proposed solution, the +cross-refinement adaptive feature modulation transformer (CRAFT), integrates +the strengths of both convolutional and transformer structures. It comprises +three key components: the high-frequency enhancement residual block (HFERB) for +extracting high-frequency information, the shift rectangle window attention +block (SRWAB) for capturing global information, and the hybrid fusion block +(HFB) for refining the global representation. To tackle the inherent +intricacies of transformer structures, we introduce a frequency-guided +post-training quantization (PTQ) method aimed at enhancing CRAFT's efficiency. +These strategies incorporate adaptive dual clipping and boundary refinement. To +further amplify the versatility of our proposed approach, we extend our PTQ +strategy to function as a general quantization method for transformer-based +SISR techniques. Our experimental findings showcase CRAFT's superiority over +current state-of-the-art methods, both in full-precision and quantization +scenarios. These results underscore the efficacy and universality of our PTQ +strategy. The source code is available at: +https://github.com/AVC2-UESTC/Frequency-Inspired-Optimization-for-EfficientSR.git. + +
+
+ comment: Extended CRAFT, accepted by TPAMI +
+
+
+
+
+ + ♻ ☆ From Seconds to Hours: Reviewing MultiModal Large Language Models on + Comprehensive Long Video Understanding + + +
+ The integration of Large Language Models (LLMs) with visual encoders has +recently shown promising performance in visual understanding tasks, leveraging +their inherent capability to comprehend and generate human-like text for visual +reasoning. Given the diverse nature of visual data, MultiModal Large Language +Models (MM-LLMs) exhibit variations in model designing and training for +understanding images, short videos, and long videos. Our paper focuses on the +substantial differences and unique challenges posed by long video understanding +compared to static image and short video understanding. Unlike static images, +short videos encompass sequential frames with both spatial and within-event +temporal information, while long videos consist of multiple events with +between-event and long-term temporal information. In this survey, we aim to +trace and summarize the advancements of MM-LLMs from image understanding to +long video understanding. We review the differences among various visual +understanding tasks and highlight the challenges in long video understanding, +including more fine-grained spatiotemporal details, dynamic events, and +long-term dependencies. We then provide a detailed summary of the advancements +in MM-LLMs in terms of model design and training methodologies for +understanding long videos. Finally, we compare the performance of existing +MM-LLMs on video understanding benchmarks of various lengths and discuss +potential future directions for MM-LLMs in long video understanding. + +
+
+ comment: 11 pages +
+
+
+
+
+ + ♻ ☆ TransFair: Transferring Fairness from Ocular Disease Classification to + Progression Prediction + + +
+ The use of artificial intelligence (AI) in automated disease classification +significantly reduces healthcare costs and improves the accessibility of +services. However, this transformation has given rise to concerns about the +fairness of AI, which disproportionately affects certain groups, particularly +patients from underprivileged populations. Recently, a number of methods and +large-scale datasets have been proposed to address group performance +disparities. Although these methods have shown effectiveness in disease +classification tasks, they may fall short in ensuring fair prediction of +disease progression, mainly because of limited longitudinal data with diverse +demographics available for training a robust and equitable prediction model. In +this paper, we introduce TransFair to enhance demographic fairness in +progression prediction for ocular diseases. TransFair aims to transfer a +fairness-enhanced disease classification model to the task of progression +prediction with fairness preserved. Specifically, we train a fair EfficientNet, +termed FairEN, equipped with a fairness-aware attention mechanism using +extensive data for ocular disease classification. Subsequently, this fair +classification model is adapted to a fair progression prediction model through +knowledge distillation, which aims to minimize the latent feature distances +between the classification and progression prediction models. We evaluate +FairEN and TransFair for fairness-enhanced ocular disease classification and +progression prediction using both two-dimensional (2D) and 3D retinal images. +Extensive experiments and comparisons with models with and without considering +fairness learning show that TransFair effectively enhances demographic equity +in predicting ocular disease progression. + +
+
+ comment: 12 pages +
+
+
+
+
+ + ♻ ☆ Unleashing In-context Learning of Autoregressive Models for Few-shot + Image Manipulation + + +
+ Text-guided image manipulation has experienced notable advancement in recent +years. In order to mitigate linguistic ambiguity, few-shot learning with visual +examples has been applied for instructions that are underrepresented in the +training set, or difficult to describe purely in language. However, learning +from visual prompts requires strong reasoning capability, which diffusion +models are struggling with. To address this issue, we introduce a novel +multi-modal autoregressive model, dubbed $\textbf{InstaManip}$, that can +$\textbf{insta}$ntly learn a new image $\textbf{manip}$ulation operation from +textual and visual guidance via in-context learning, and apply it to new query +images. Specifically, we propose an innovative group self-attention mechanism +to break down the in-context learning process into two separate stages -- +learning and applying, which simplifies the complex problem into two easier +tasks. We also introduce a relation regularization method to further +disentangle image transformation features from irrelevant contents in exemplar +images. Extensive experiments suggest that our method surpasses previous +few-shot image manipulation models by a notable margin ($\geq$19% in human +evaluation). We also find our model can be further boosted by increasing the +number or diversity of exemplar images. + +
+
+ comment: 18 pages, 16 figures, 5 tables +
+
+
+
+
+
+
+
+ + Artificial Intelligence 163 + +
+
+
+ + ☆ Scaling BERT Models for Turkish Automatic Punctuation and Capitalization + Correction + + +
+ This paper investigates the effectiveness of BERT based models for automated +punctuation and capitalization corrections in Turkish texts across five +distinct model sizes. The models are designated as Tiny, Mini, Small, Medium, +and Base. The design and capabilities of each model are tailored to address the +specific challenges of the Turkish language, with a focus on optimizing +performance while minimizing computational overhead. The study presents a +systematic comparison of the performance metrics precision, recall, and F1 +score of each model, offering insights into their applicability in diverse +operational contexts. The results demonstrate a significant improvement in text +readability and accuracy as model size increases, with the Base model achieving +the highest correction precision. This research provides a comprehensive guide +for selecting the appropriate model size based on specific user needs and +computational resources, establishing a framework for deploying these models in +real-world applications to enhance the quality of written Turkish. + +
+
+ comment: 2024 Innovations in Intelligent Systems and Applications Conference + (ASYU) +
+
+
+
+
+ + ☆ Taming Scalable Visual Tokenizer for Autoregressive Image Generation + + +
+ Existing vector quantization (VQ) methods struggle with scalability, largely +attributed to the instability of the codebook that undergoes partial updates +during training. The codebook is prone to collapse as utilization decreases, +due to the progressively widening distribution gap between non-activated codes +and visual features. To solve the problem, we propose Index Backpropagation +Quantization (IBQ), a new VQ method for the joint optimization of all codebook +embeddings and the visual encoder. Applying a straight-through estimator on the +one-hot categorical distribution between the encoded feature and codebook, all +codes are differentiable and maintain a consistent latent space with the visual +encoder. IBQ enables scalable training of visual tokenizers and, for the first +time, achieves a large-scale codebook ($2^{18}$) with high dimension ($256$) +and high utilization. Experiments on the standard ImageNet benchmark +demonstrate the scalability and superiority of IBQ, achieving competitive +results on both reconstruction ($1.00$ rFID) and autoregressive visual +generation ($2.05$ gFID). The code and models are available at +https://github.com/TencentARC/SEED-Voken. + +
+
+
+
+
+ + ☆ T-REG: Preference Optimization with Token-Level Reward Regularization + + +
+ Reinforcement learning from human feedback (RLHF) has been crucial in +aligning large language models (LLMs) with human values. Traditionally, RLHF +involves generating responses to a query and using a reward model to assign a +reward to the entire response. However, this approach faces challenges due to +its reliance on a single, sparse reward, which makes it challenging for the +model to identify which parts of the sequence contribute most significantly to +the final reward. Recent methods have attempted to address this limitation by +introducing token-level rewards. However, these methods often rely on either a +trained credit assignment model or AI annotators, raising concerns about the +quality and reliability of the rewards. In this paper, we propose token-level +reward regularization (T-REG), a novel approach that leverages both +sequence-level and token-level rewards for preference optimization. Harnessing +the self-refinement capabilities of LLMs, our method uses contrastive prompting +to enable LLMs to self-generate token-level rewards. These self-generated +rewards then act as reward regularization, guiding the model to more +effectively distribute sequence-level rewards across tokens. This facilitates +better token-level credit assignment and enhances alignment performance. +Experiments on the instruction following benchmarks, including Alpaca Eval 2 +and Arena-Hard, show that our method consistently outperforms baseline methods +by up to 3.8% and 4.4%, respectively. We will release the code and models at +https://github.com/wzhouad/T-REG. + +
+
+
+
+
+ + ☆ AniGS: Animatable Gaussian Avatar from a Single Image with Inconsistent + Gaussian Reconstruction + + +
+ Generating animatable human avatars from a single image is essential for +various digital human modeling applications. Existing 3D reconstruction methods +often struggle to capture fine details in animatable models, while generative +approaches for controllable animation, though avoiding explicit 3D modeling, +suffer from viewpoint inconsistencies in extreme poses and computational +inefficiencies. In this paper, we address these challenges by leveraging the +power of generative models to produce detailed multi-view canonical pose +images, which help resolve ambiguities in animatable human reconstruction. We +then propose a robust method for 3D reconstruction of inconsistent images, +enabling real-time rendering during inference. Specifically, we adapt a +transformer-based video generation model to generate multi-view canonical pose +images and normal maps, pretraining on a large-scale video dataset to improve +generalization. To handle view inconsistencies, we recast the reconstruction +problem as a 4D task and introduce an efficient 3D modeling approach using 4D +Gaussian Splatting. Experiments demonstrate that our method achieves +photorealistic, real-time animation of 3D human avatars from in-the-wild +images, showcasing its effectiveness and generalization capability. + +
+
+ comment: Project Page: https://lingtengqiu.github.io/2024/AniGS/ +
+
+
+
+
+ + ☆ The Asymptotic Behavior of Attention in Transformers + + +
+ A key component of transformers is the attention mechanism orchestrating how +each token influences the propagation of every other token through a +transformer. In this paper we provide a rigorous, mathematical analysis of the +asymptotic properties of attention in transformers. Although we present several +results based on different assumptions, all of them point to the same +conclusion, all tokens asymptotically converge to each other, a phenomenon that +has been empirically reported in the literature. Our findings are carefully +compared with existing theoretical results and illustrated by simulations and +experimental studies using the GPT-2 model. + +
+
+
+
+
+ + ☆ Adaptive Informed Deep Neural Networks for Power Flow Analysis + + +
+ This study introduces PINN4PF, an end-to-end deep learning architecture for +power flow (PF) analysis that effectively captures the nonlinear dynamics of +large-scale modern power systems. The proposed neural network (NN) architecture +consists of two important advancements in the training pipeline: (A) a +double-head feed-forward NN that aligns with PF analysis, including an +activation function that adjusts to active and reactive power consumption +patterns, and (B) a physics-based loss function that partially incorporates +power system topology information. The effectiveness of the proposed +architecture is illustrated through 4-bus, 15-bus, 290-bus, and 2224-bus test +systems and is evaluated against two baselines: a linear regression model (LR) +and a black-box NN (MLP). The comparison is based on (i) generalization +ability, (ii) robustness, (iii) impact of training dataset size on +generalization ability, (iv) accuracy in approximating derived PF quantities +(specifically line current, line active power, and line reactive power), and +(v) scalability. Results demonstrate that PINN4PF outperforms both baselines +across all test systems by up to two orders of magnitude not only in terms of +direct criteria, e.g., generalization ability but also in terms of +approximating derived physical quantities. + +
+
+ comment: 10 pages, 7 figures, 4 tables +
+
+
+
+
+ + ☆ Scaffold or Crutch? Examining College Students' Use and Views of + Generative AI Tools for STEM Education + + +
+ Developing problem-solving competency is central to Science, Technology, +Engineering, and Mathematics (STEM) education, yet translating this priority +into effective approaches to problem-solving instruction and assessment remain +a significant challenge. The recent proliferation of generative artificial +intelligence (genAI) tools like ChatGPT in higher education introduces new +considerations about how these tools can help or hinder students' development +of STEM problem-solving competency. Our research examines these considerations +by studying how and why college students use genAI tools in their STEM +coursework, focusing on their problem-solving support. We surveyed 40 STEM +college students from diverse U.S. institutions and 28 STEM faculty to +understand instructor perspectives on effective genAI tool use and guidance in +STEM courses. Our findings reveal high adoption rates and diverse applications +of genAI tools among STEM students. The most common use cases include finding +explanations, exploring related topics, summarizing readings, and helping with +problem-set questions. The primary motivation for using genAI tools was to save +time. Moreover, over half of student participants reported simply inputting +problems for AI to generate solutions, potentially bypassing their own +problem-solving processes. These findings indicate that despite high adoption +rates, students' current approaches to utilizing genAI tools often fall short +in enhancing their own STEM problem-solving competencies. The study also +explored students' and STEM instructors' perceptions of the benefits and risks +associated with using genAI tools in STEM education. Our findings provide +insights into how to guide students on appropriate genAI use in STEM courses +and how to design genAI-based tools to foster students' problem-solving +competency. + +
+
+
+
+
+ + ☆ QA-TOOLBOX: Conversational Question-Answering for process task guidance + in manufacturing + + +
+ In this work we explore utilizing LLMs for data augmentation for a
+manufacturing task guidance system. The dataset consists of representative
+samples of interactions with technicians working in an advanced manufacturing
+setting. The purpose of this work is to explore the task, data augmentation for
+the supported tasks and evaluating the performance of the existing LLMs. We
+observe that the task is complex, requiring understanding from procedure
+specification documents, actions and objects sequenced temporally. The dataset
+consists of 200,000+ question/answer pairs that refer to the spec document and
+are grounded in narrations and/or video demonstrations. We compared the
+performance of several popular open-sourced LLMs by developing a baseline using
+each LLM and then compared the responses in a reference-free setting using
+LLM-as-a-judge and compared the ratings with crowd-workers whilst validating
+the ratings with experts.
+

+
+
+
+
+ + ☆ Scaling Image Tokenizers with Grouped Spherical Quantization + + +
+ Vision tokenizers have gained a lot of traction due to their scalability
+and compactness; previous works depend on old-school GAN-based hyperparameters,
+biased comparisons, and a lack of comprehensive analysis of the scaling
+behaviours. To tackle those issues, we introduce Grouped Spherical Quantization
+(GSQ), featuring spherical codebook initialization and lookup regularization to
+constrain codebook latent to a spherical surface. Our empirical analysis of
+image tokenizer training strategies demonstrates that GSQ-GAN achieves superior
+reconstruction quality over state-of-the-art methods with fewer training
+iterations, providing a solid foundation for scaling studies. Building on this,
+we systematically examine the scaling behaviours of GSQ, specifically in latent
+dimensionality, codebook size, and compression ratios, and their impact on
+model performance. Our findings reveal distinct behaviours at high and low
+spatial compression levels, underscoring challenges in representing
+high-dimensional latent spaces. We show that GSQ can restructure
+high-dimensional latent into compact, low-dimensional spaces, thus enabling
+efficient scaling with improved quality. As a result, GSQ-GAN achieves a 16x
+down-sampling with a reconstruction FID (rFID) of 0.50.
+

+
+
+
+
+ + ☆ Time-Reversal Provides Unsupervised Feedback to LLMs + + +
+ Large Language Models (LLMs) are typically trained to predict in the forward +direction of time. However, recent works have shown that prompting these models +to look back and critique their own generations can produce useful feedback. +Motivated by this, we explore the question of whether LLMs can be empowered to +think (predict and score) backwards to provide unsupervised feedback that +complements forward LLMs. Towards this, we introduce Time Reversed Language +Models (TRLMs), which can score and generate queries when conditioned on +responses, effectively functioning in the reverse direction of time. Further, +to effectively infer in the response to query direction, we pre-train and +fine-tune a language model (TRLM-Ba) in the reverse token order from scratch. +We show empirically (and theoretically in a stylized setting) that +time-reversed models can indeed complement forward model predictions when used +to score the query given response for re-ranking multiple forward generations. +We obtain up to 5\% improvement on the widely used AlpacaEval Leaderboard over +the competent baseline of best-of-N re-ranking using self log-perplexity +scores. We further show that TRLM scoring outperforms conventional forward +scoring of response given query, resulting in significant gains in applications +such as citation generation and passage retrieval. We next leverage the +generative ability of TRLM to augment or provide unsupervised feedback to input +safety filters of LLMs, demonstrating a drastic reduction in false negative +rate with negligible impact on false positive rates against several attacks +published on the popular JailbreakBench leaderboard. + +
+
+
+
+
+ + ☆ Medical Multimodal Foundation Models in Clinical Diagnosis and + Treatment: Applications, Challenges, and Future Directions + + +
+ Recent advancements in deep learning have significantly revolutionized the +field of clinical diagnosis and treatment, offering novel approaches to improve +diagnostic precision and treatment efficacy across diverse clinical domains, +thus driving the pursuit of precision medicine. The growing availability of +multi-organ and multimodal datasets has accelerated the development of +large-scale Medical Multimodal Foundation Models (MMFMs). These models, known +for their strong generalization capabilities and rich representational power, +are increasingly being adapted to address a wide range of clinical tasks, from +early diagnosis to personalized treatment strategies. This review offers a +comprehensive analysis of recent developments in MMFMs, focusing on three key +aspects: datasets, model architectures, and clinical applications. We also +explore the challenges and opportunities in optimizing multimodal +representations and discuss how these advancements are shaping the future of +healthcare by enabling improved patient outcomes and more efficient clinical +workflows. + +
+
+
+
+
+ + ☆ Improving Dynamic Object Interactions in Text-to-Video Generation with + AI Feedback + + +
+ Large text-to-video models hold immense potential for a wide range of +downstream applications. However, these models struggle to accurately depict +dynamic object interactions, often resulting in unrealistic movements and +frequent violations of real-world physics. One solution inspired by large +language models is to align generated outputs with desired outcomes using +external feedback. This enables the model to refine its responses autonomously, +eliminating extensive manual data collection. In this work, we investigate the +use of feedback to enhance the object dynamics in text-to-video models. We aim +to answer a critical question: what types of feedback, paired with which +specific self-improvement algorithms, can most effectively improve text-video +alignment and realistic object interactions? We begin by deriving a unified +probabilistic objective for offline RL finetuning of text-to-video models. This +perspective highlights how design elements in existing algorithms like KL +regularization and policy projection emerge as specific choices within a +unified framework. We then use derived methods to optimize a set of text-video +alignment metrics (e.g., CLIP scores, optical flow), but notice that they often +fail to align with human perceptions of generation quality. To address this +limitation, we propose leveraging vision-language models to provide more +nuanced feedback specifically tailored to object dynamics in videos. Our +experiments demonstrate that our method can effectively optimize a wide variety +of rewards, with binary AI feedback driving the most significant improvements +in video quality for dynamic interactions, as confirmed by both AI and human +evaluations. Notably, we observe substantial gains when using reward signals +derived from AI feedback, particularly in scenarios involving complex +interactions between multiple objects and realistic depictions of objects +falling. + +
+
+ comment: Website: https://sites.google.com/view/aif-dynamic-t2v/ +
+
+
+
+
+ + ☆ Projection Abstractions in Planning Under the Lenses of Abstractions for + MDPs + + +
+ The concept of abstraction has been independently developed both in the +context of AI Planning and discounted Markov Decision Processes (MDPs). +However, the way abstractions are built and used in the context of Planning and +MDPs is different even though lots of commonalities can be highlighted. To this +day there is no work trying to relate and unify the two fields on the matter of +abstractions unraveling all the different assumptions and their effect on the +way they can be used. Therefore, in this paper we aim to do so by looking at +projection abstractions in Planning through the lenses of discounted MDPs. +Starting from a projection abstraction built according to Classical or +Probabilistic Planning techniques, we will show how the same abstraction can be +obtained under the abstraction frameworks available for discounted MDPs. Along +the way, we will focus on computational as well as representational advantages +and disadvantages of both worlds pointing out new research directions that are +of interest for both fields. + +
+
+
+
+
+ + ☆ AV-Odyssey Bench: Can Your Multimodal LLMs Really Understand + Audio-Visual Information? + + +
+ Recently, multimodal large language models (MLLMs), such as GPT-4o, Gemini +1.5 Pro, and Reka Core, have expanded their capabilities to include vision and +audio modalities. While these models demonstrate impressive performance across +a wide range of audio-visual applications, our proposed DeafTest reveals that +MLLMs often struggle with simple tasks humans find trivial: 1) determining +which of two sounds is louder, and 2) determining which of two sounds has a +higher pitch. Motivated by these observations, we introduce AV-Odyssey Bench, a +comprehensive audio-visual benchmark designed to assess whether those MLLMs can +truly understand the audio-visual information. This benchmark encompasses 4,555 +carefully crafted problems, each incorporating text, visual, and audio +components. To successfully infer answers, models must effectively leverage +clues from both visual and audio inputs. To ensure precise and objective +evaluation of MLLM responses, we have structured the questions as +multiple-choice, eliminating the need for human evaluation or LLM-assisted +assessment. We benchmark a series of closed-source and open-source models and +summarize the observations. By revealing the limitations of current models, we +aim to provide useful insight for future dataset collection and model +development. + +
+
+ comment: Project page: https://av-odyssey.github.io/ +
+
+
+
+
+ + ☆ AI-Driven Resource Allocation Framework for Microservices in Hybrid + Cloud Platforms + + +
+ The increasing demand for scalable, efficient resource management in hybrid +cloud environments has led to the exploration of AI-driven approaches for +dynamic resource allocation. This paper presents an AI-driven framework for +resource allocation among microservices in hybrid cloud platforms. The +framework employs reinforcement learning (RL)-based resource utilization +optimization to reduce costs and improve performance. The framework integrates +AI models with cloud management tools to respond to challenges of dynamic +scaling and cost-efficient low-latency service delivery. The reinforcement +learning model continuously adjusts provisioned resources as required by the +microservices and predicts the future consumption trends to minimize both +under- and over-provisioning of resources. Preliminary simulation results +indicate that using AI in the provision of resources related to costs can +reduce expenditure by up to 30-40% compared to manual provisioning and +threshold-based auto-scaling approaches. It is also estimated that the +efficiency in resource utilization is expected to improve by 20%-30% with a +corresponding latency cut of 15%-20% during the peak demand periods. This study +compares the AI-driven approach with existing static and rule-based resource +allocation methods, demonstrating the capability of this new model to +outperform them in terms of flexibility and real-time interests. The results +indicate that reinforcement learning can make optimization of hybrid cloud +platforms even better, offering a 25-35% improvement in cost efficiency and the +power of scaling for microservice-based applications. The proposed framework is +a strong and scalable solution to managing cloud resources in dynamic and +performance-critical environments. + +
+
+ comment: 25 pages, 14 figures +
+
+
+
+
+ + ☆ CEGI: Measuring the trade-off between efficiency and carbon emissions + for SLMs and VLMs + + +
+ This paper analyzes the performance of Small Language Models (SLMs) and
+Vision Language Models (VLMs) and evaluates the trade-off between model
+performance and carbon emissions across 4 essential tasks: Image Captioning,
+Visual Question Answering (VQA), Dialogue Summarization and Text-to-SQL
+conversion. Various SLMs and VLMs belonging to the Qwen and LLaMA architecture
+family are chosen and variants based on model size in terms of the number of
+parameters, quantization level and fine-tuning parameters are evaluated. The
+model variant's performance and carbon emissions are calculated. To quantify
+the trade-off between model performance and carbon emissions, we introduce a
+novel metric called CEGI (Carbon Efficient Gain Index). This metric represents
+the carbon emission per unit percentage gain per million trainable parameters.
+This metric provides a normalized measure to compare models' efficiency in
+terms of performance improvement relative to their environmental cost. The
+experiment's outcome demonstrates that fine-tuning SLMs and VLMs can achieve
+performance levels comparable to Large Language Models (LLMs) while producing
+significantly less carbon emissions. Our findings suggest that the marginal
+gains in accuracy from larger models do not justify the substantial increase in
+carbon emissions. Leveraging lower-bit quantization levels, the proposed metric
+further enhances energy efficiency without compromising performance. This study
+highlights the importance of balancing high performance and environmental
+sustainability. It offers a valuable metric for selecting models suitable for
+environmentally-friendly AI development.
+
</p>
+
+
+
+
+ + ☆ PrefixLLM: LLM-aided Prefix Circuit Design + + +
+ Prefix circuits are fundamental components in digital adders, widely used in +digital systems due to their efficiency in calculating carry signals. +Synthesizing prefix circuits with minimized area and delay is crucial for +enhancing the performance of modern computing systems. Recently, large language +models (LLMs) have demonstrated a surprising ability to perform text generation +tasks. We propose PrefixLLM, that leverages LLMs for prefix circuit synthesis. +PrefixLLM transforms the prefix circuit synthesis task into a structured text +generation problem, termed the Structured Prefix Circuit Representation (SPCR), +and introduces an iterative framework to automatically and accurately generate +valid SPCRs. We further present a design space exploration (DSE) framework that +uses LLMs to iteratively search for area and delay optimized prefix circuits. +Compared to state-of-the-art, PrefixLLM can reduce the area by 3.70% under the +same delay constraint. This work highlights the use of LLMs in the synthesis of +arithmetic circuits, which can be transformed into the structured text +generation. + +
+
+
+
+
+ + ☆ Explainable CTR Prediction via LLM Reasoning WSDM 2025 + + +
+ Recommendation Systems have become integral to modern user experiences, but +lack transparency in their decision-making processes. Existing explainable +recommendation methods are hindered by reliance on a post-hoc paradigm, wherein +explanation generators are trained independently of the underlying recommender +models. This paradigm necessitates substantial human effort in data +construction and raises concerns about explanation reliability. In this paper, +we present ExpCTR, a novel framework that integrates large language model based +explanation generation directly into the CTR prediction process. Inspired by +recent advances in reinforcement learning, we employ two carefully designed +reward mechanisms, LC alignment, which ensures explanations reflect user +intentions, and IC alignment, which maintains consistency with traditional +ID-based CTR models. Our approach incorporates an efficient training paradigm +with LoRA and a three-stage iterative process. ExpCTR circumvents the need for +extensive explanation datasets while fostering synergy between CTR prediction +and explanation generation. Experimental results demonstrate that ExpCTR +significantly enhances both recommendation accuracy and interpretability across +three real-world datasets. + +
+
+ comment: WSDM 2025 +
+
+
+
+
+ + ☆ Factored space models: Towards causality between levels of abstraction + + +
+ Causality plays an important role in understanding intelligent behavior, and +there is a wealth of literature on mathematical models for causality, most of +which is focused on causal graphs. Causal graphs are a powerful tool for a wide +range of applications, in particular when the relevant variables are known and +at the same level of abstraction. However, the given variables can also be +unstructured data, like pixels of an image. Meanwhile, the causal variables, +such as the positions of objects in the image, can be arbitrary deterministic +functions of the given variables. Moreover, the causal variables may form a +hierarchy of abstractions, in which the macro-level variables are deterministic +functions of the micro-level variables. Causal graphs are limited when it comes +to modeling this kind of situation. In the presence of deterministic +relationships there is generally no causal graph that satisfies both the Markov +condition and the faithfulness condition. We introduce factored space models as +an alternative to causal graphs which naturally represent both probabilistic +and deterministic relationships at all levels of abstraction. Moreover, we +introduce structural independence and establish that it is equivalent to +statistical independence in every distribution that factorizes over the +factored space. This theorem generalizes the classical soundness and +completeness theorem for d-separation. + +
+
+ comment: 29 pages +
+
+
+
+
+ + ☆ Generating Critical Scenarios for Testing Automated Driving Systems + + +
+ Autonomous vehicles (AVs) have demonstrated significant potential in +revolutionizing transportation, yet ensuring their safety and reliability +remains a critical challenge, especially when exposed to dynamic and +unpredictable environments. Real-world testing of an Autonomous Driving System +(ADS) is both expensive and risky, making simulation-based testing a preferred +approach. In this paper, we propose AVASTRA, a Reinforcement Learning +(RL)-based approach to generate realistic critical scenarios for testing ADSs +in simulation environments. To capture the complexity of driving scenarios, +AVASTRA comprehensively represents the environment by both the internal states +of an ADS under-test (e.g., the status of the ADS's core components, speed, or +acceleration) and the external states of the surrounding factors in the +simulation environment (e.g., weather, traffic flow, or road condition). +AVASTRA trains the RL agent to effectively configure the simulation environment +that places the AV in dangerous situations and potentially leads it to +collisions. We introduce a diverse set of actions that allows the RL agent to +systematically configure both environmental conditions and traffic +participants. Additionally, based on established safety requirements, we +enforce heuristic constraints to ensure the realism and relevance of the +generated test scenarios. AVASTRA is evaluated on two popular simulation maps +with four different road configurations. Our results show AVASTRA's ability to +outperform the state-of-the-art approach by generating 30% to 115% more +collision scenarios. Compared to the baseline based on Random Search, AVASTRA +achieves up to 275% better performance. These results highlight the +effectiveness of AVASTRA in enhancing the safety testing of AVs through +realistic comprehensive critical scenario generation. + +
+
+
+
+
+ + ☆ TAB-Fields: A Maximum Entropy Framework for Mission-Aware Adversarial + Planning + + +
+ Autonomous agents operating in adversarial scenarios face a fundamental +challenge: while they may know their adversaries' high-level objectives, such +as reaching specific destinations within time constraints, the exact policies +these adversaries will employ remain unknown. Traditional approaches address +this challenge by treating the adversary's state as a partially observable +element, leading to a formulation as a Partially Observable Markov Decision +Process (POMDP). However, the induced belief-space dynamics in a POMDP require +knowledge of the system's transition dynamics, which, in this case, depend on +the adversary's unknown policy. Our key observation is that while an +adversary's exact policy is unknown, their behavior is necessarily constrained +by their mission objectives and the physical environment, allowing us to +characterize the space of possible behaviors without assuming specific +policies. In this paper, we develop Task-Aware Behavior Fields (TAB-Fields), a +representation that captures adversary state distributions over time by +computing the most unbiased probability distribution consistent with known +constraints. We construct TAB-Fields by solving a constrained optimization +problem that minimizes additional assumptions about adversary behavior beyond +mission and environmental requirements. We integrate TAB-Fields with standard +planning algorithms by introducing TAB-conditioned POMCP, an adaptation of +Partially Observable Monte Carlo Planning. Through experiments in simulation +with underwater robots and hardware implementations with ground robots, we +demonstrate that our approach achieves superior performance compared to +baselines that either assume specific adversary policies or neglect mission +constraints altogether. Evaluation videos and code are available at +https://tab-fields.github.io. + +
+
+
+
+
+ + ☆ Segmentation of Coronary Artery Stenosis in X-ray Angiography using + Mamba Models + + +
+ Coronary artery disease stands as one of the primary contributors to global +mortality rates. The automated identification of coronary artery stenosis from +X-ray images plays a critical role in the diagnostic process for coronary heart +disease. This task is challenging due to the complex structure of coronary +arteries, intrinsic noise in X-ray images, and the fact that stenotic coronary +arteries appear narrow and blurred in X-ray angiographies. This study employs +five different variants of the Mamba-based model and one variant of the Swin +Transformer-based model, primarily based on the U-Net architecture, for the +localization of stenosis in Coronary artery disease. Our best results showed an +F1 score of 68.79% for the U-Mamba BOT model, representing an 11.8% improvement +over the semi-supervised approach. + +
+
+
+
+
+ + ☆ Semantic Tokens in Retrieval Augmented Generation + + +
+ Retrieval-Augmented Generation (RAG) architectures have recently garnered +significant attention for their ability to improve truth grounding and +coherence in natural language processing tasks. However, the reliability of RAG +systems in producing accurate answers diminishes as the volume of data they +access increases. Even with smaller datasets, these systems occasionally fail +to address simple queries. This issue arises from their dependence on +state-of-the-art large language models (LLMs), which can introduce uncertainty +into the system's outputs. In this work, I propose a novel Comparative RAG +system that introduces an evaluator module to bridge the gap between +probabilistic RAG systems and deterministically verifiable responses. The +evaluator compares external recommendations with the retrieved document chunks, +adding a decision-making layer that enhances the system's reliability. This +approach ensures that the chunks retrieved are both semantically relevant and +logically consistent with deterministic insights, thereby improving the +accuracy and overall efficiency of RAG systems. This framework paves the way +for more reliable and scalable question-answering applications in domains +requiring high precision and verifiability. + +
+
+
+
+
+ + ☆ Graph-Powered Defense: Controller Area Network Intrusion Detection for + Unmanned Aerial Vehicles + + +
+ The network of services, including delivery, farming, and environmental +monitoring, has experienced exponential expansion in the past decade with +Unmanned Aerial Vehicles (UAVs). Yet, UAVs are not robust enough against +cyberattacks, especially on the Controller Area Network (CAN) bus. The CAN bus +is a general-purpose vehicle-bus standard to enable microcontrollers and +in-vehicle computers to interact, primarily connecting different Electronic +Control Units (ECUs). In this study, we focus on solving some of the most +critical security weaknesses in UAVs by developing a novel graph-based +intrusion detection system (IDS) leveraging the Uncomplicated Application-level +Vehicular Communication and Networking (UAVCAN) protocol. First, we decode CAN +messages based on UAVCAN protocol specification; second, we present a +comprehensive method of transforming tabular UAVCAN messages into graph +structures. Lastly, we apply various graph-based machine learning models for +detecting cyber-attacks on the CAN bus, including graph convolutional neural +networks (GCNNs), graph attention networks (GATs), Graph Sample and Aggregate +Networks (GraphSAGE), and graph structure-based transformers. Our findings show +that inductive models such as GATs, GraphSAGE, and graph-based transformers can +achieve competitive and even better accuracy than transductive models like +GCNNs in detecting various types of intrusions, with minimum information on +protocol specification, thus providing a generic robust solution for CAN bus +security for the UAVs. We also compared our results with baseline single-layer +Long Short-Term Memory (LSTM) and found that all our graph-based models perform +better without using any decoded features based on the UAVCAN protocol, +highlighting higher detection performance with protocol-independent capability. + +
+
+
+
+
+ + ☆ WEM-GAN: Wavelet transform based facial expression manipulation + + +
+ Facial expression manipulation aims to change human facial expressions
+without affecting face recognition. In order to transform the facial
+expressions to target expressions, previous methods relied on expression labels
+to guide the manipulation process. However, these methods failed to preserve
+the details of facial features, which causes the weakening or the loss of
+identity information in the output image. In our work, we propose WEM-GAN,
+short for wavelet-based expression manipulation GAN, which puts more effort
+into preserving the details of the original image in the editing process.
+Firstly, we take advantage of the wavelet transform technique and combine it
+with our generator with a U-net autoencoder backbone, in order to improve the
+generator's ability to preserve more details of facial features. Secondly, we
+also implement the high-frequency component discriminator, and use
+high-frequency domain adversarial loss to further constrain the optimization of
+our model, providing the generated face image with more abundant details.
+Additionally, in order to narrow the gap between generated facial expressions
+and target expressions, we use residual connections between encoder and
+decoder, while also using relative action units (AUs) several times. Extensive
+qualitative and quantitative experiments have demonstrated that our model
+performs better in preserving identity features, editing capability, and image
+generation quality on the AffectNet dataset. It also shows superior performance
+in metrics such as Average Content Distance (ACD) and Expression Distance (ED).
+
</p>
+
+
+
+
+ + ☆ Bias Analysis of AI Models for Undergraduate Student Admissions + + +
+ Bias detection and mitigation is an active area of research in machine +learning. This work extends previous research done by the authors to provide a +rigorous and more complete analysis of the bias found in AI predictive models. +Admissions data spanning six years was used to create an AI model to determine +whether a given student would be directly admitted into the School of Science +under various scenarios at a large urban research university. During this time, +submission of standardized test scores as part of an application became +optional which led to interesting questions about the impact of standardized +test scores on admission decisions. We developed and analyzed AI models to +understand which variables are important in admissions decisions, and how the +decision to exclude test scores affects the demographics of the students who +are admitted. We then evaluated the predictive models to detect and analyze +biases these models may carry with respect to three variables chosen to +represent sensitive populations: gender, race, and whether a student was the +first in his or her family to attend college. We also extended our analysis to +show that the biases detected were persistent. Finally, we included several +fairness metrics in our analysis and discussed the uses and limitations of +these metrics. + +
+
+
+
+
+ + ☆ Cooperative Cruising: Reinforcement Learning based Time-Headway Control + for Increased Traffic Efficiency + + +
+ The proliferation of Connected Automated Vehicles represents an unprecedented +opportunity for improving driving efficiency and alleviating traffic +congestion. However, existing research fails to address realistic multi-lane +highway scenarios without assuming connectivity, perception, and control +capabilities that are typically unavailable in current vehicles. This paper +proposes a novel AI system that is the first to improve highway traffic +efficiency compared with human-like traffic in realistic, simulated multi-lane +scenarios, while relying on existing connectivity, perception, and control +capabilities. At the core of our approach is a reinforcement learning based +controller that dynamically communicates time-headways to automated vehicles +near bottlenecks based on real-time traffic conditions. These desired +time-headways are then used by Adaptive Cruise Control (ACC) systems to adjust +their following distance. By (i) integrating existing traffic estimation +technology and low-bandwidth vehicle-to-infrastructure connectivity, (ii) +leveraging safety-certified ACC systems, and (iii) targeting localized +bottleneck challenges that can be addressed independently in different +locations, we propose a practical, safe, and scalable system that can +positively impact numerous road users. + +
+
+
+
+
+ + ☆ FCL-ViT: Task-Aware Attention Tuning for Continual Learning + + +
+ Continual Learning (CL) involves adapting the prior Deep Neural Network (DNN) +knowledge to new tasks, without forgetting the old ones. However, modern CL +techniques focus on provisioning memory capabilities to existing DNN models +rather than designing new ones that are able to adapt according to the task at +hand. This paper presents the novel Feedback Continual Learning Vision +Transformer (FCL-ViT) that uses a feedback mechanism to generate real-time +dynamic attention features tailored to the current task. The FCL-ViT operates +in two Phases. In phase 1, the generic image features are produced and +determine where the Transformer should attend on the current image. In phase 2, +task-specific image features are generated that leverage dynamic attention. To +this end, Tunable self-Attention Blocks (TABs) and Task Specific Blocks (TSBs) +are introduced that operate in both phases and are responsible for tuning the +TABs attention, respectively. The FCL-ViT surpasses state-of-the-art +performance on Continual Learning compared to benchmark methods, while +retaining a small number of trainable DNN parameters. + +
+
+
+
+
+ + ☆ Towards Rich Emotions in 3D Avatars: A Text-to-3D Avatar Generation + Benchmark + + +
+ Producing emotionally dynamic 3D facial avatars with text derived from spoken +words (Emo3D) has been a pivotal research topic in 3D avatar generation. While +progress has been made in general-purpose 3D avatar generation, the exploration +of generating emotional 3D avatars remains scarce, primarily due to the +complexities of identifying and rendering rich emotions from spoken words. This +paper reexamines Emo3D generation and draws inspiration from human processes, +breaking down Emo3D into two cascading steps: Text-to-3D Expression Mapping +(T3DEM) and 3D Avatar Rendering (3DAR). T3DEM is the most crucial step in +determining the quality of Emo3D generation and encompasses three key +challenges: Expression Diversity, Emotion-Content Consistency, and Expression +Fluidity. To address these challenges, we introduce a novel benchmark to +advance research in Emo3D generation. First, we present EmoAva, a large-scale, +high-quality dataset for T3DEM, comprising 15,000 text-to-3D expression +mappings that characterize the aforementioned three challenges in Emo3D +generation. Furthermore, we develop various metrics to effectively evaluate +models against these identified challenges. Next, to effectively model the +consistency, diversity, and fluidity of human expressions in the T3DEM step, we +propose the Continuous Text-to-Expression Generator, which employs an +autoregressive Conditional Variational Autoencoder for expression code +generation, enhanced with Latent Temporal Attention and Expression-wise +Attention mechanisms. Finally, to further enhance the 3DAR step on rendering +higher-quality subtle expressions, we present the Globally-informed Gaussian +Avatar (GiGA) model. GiGA incorporates a global information mechanism into 3D +Gaussian representations, enabling the capture of subtle micro-expressions and +seamless transitions between emotional states. + +
+
+ comment: 18 pages, 14 figures. Project website: + https://github.com/WalkerMitty/EmoAva +
+
+
+
+
+ + ☆ OODFace: Benchmarking Robustness of Face Recognition under Common + Corruptions and Appearance Variations + + +
+ With the rise of deep learning, facial recognition technology has seen +extensive research and rapid development. Although facial recognition is +considered a mature technology, we find that existing open-source models and +commercial algorithms lack robustness in certain real-world Out-of-Distribution +(OOD) scenarios, raising concerns about the reliability of these systems. In +this paper, we introduce OODFace, which explores the OOD challenges faced by +facial recognition models from two perspectives: common corruptions and +appearance variations. We systematically design 30 OOD scenarios across 9 major +categories tailored for facial recognition. By simulating these challenges on +public datasets, we establish three robustness benchmarks: LFW-C/V, CFP-FP-C/V, +and YTF-C/V. We then conduct extensive experiments on 19 different facial +recognition models and 3 commercial APIs, along with extended experiments on +face masks, Vision-Language Models (VLMs), and defense strategies to assess +their robustness. Based on the results, we draw several key insights, +highlighting the vulnerability of facial recognition systems to OOD data and +suggesting possible solutions. Additionally, we offer a unified toolkit that +includes all corruption and variation types, easily extendable to other +datasets. We hope that our benchmarks and findings can provide guidance for +future improvements in facial recognition model robustness. + +
+
+
+
+
+ + ☆ F-SE-LSTM: A Time Series Anomaly Detection Method with Frequency Domain + Information + + +
+ With the development of society, time series anomaly detection plays an +important role in network and IoT services. However, most existing anomaly +detection methods directly analyze time series in the time domain and cannot +distinguish some relatively hidden anomaly sequences. We attempt to analyze the +impact of frequency on time series from a frequency domain perspective, thus +proposing a new time series anomaly detection method called F-SE-LSTM. This +method utilizes two sliding windows and fast Fourier transform (FFT) to +construct a frequency matrix. Simultaneously, Squeeze-and-Excitation Networks +(SENet) and Long Short-Term Memory (LSTM) are employed to extract +frequency-related features within and between periods. Through comparative +experiments on multiple datasets such as Yahoo Webscope S5 and Numenta Anomaly +Benchmark, the results demonstrate that the frequency matrix constructed by +F-SE-LSTM exhibits better discriminative ability than ordinary time domain and +frequency domain data. Furthermore, F-SE-LSTM outperforms existing +state-of-the-art deep learning anomaly detection methods in terms of anomaly +detection capability and execution efficiency. + +
+
+ comment: 14 pages, 7 figures +
+
+
+
+
+ + ☆ Gracefully Filtering Backdoor Samples for Generative Large Language + Models without Retraining COLING 2025 + + +
+ Backdoor attacks remain significant security threats to generative large +language models (LLMs). Since generative LLMs output sequences of +high-dimensional token logits instead of low-dimensional classification logits, +most existing backdoor defense methods designed for discriminative models like +BERT are ineffective for generative LLMs. Inspired by the observed differences +in learning behavior between backdoor and clean mapping in the frequency space, +we transform gradients of each training sample, directly influencing parameter +updates, into the frequency space. Our findings reveal a distinct separation +between the gradients of backdoor and clean samples in the frequency space. +Based on this phenomenon, we propose Gradient Clustering in the Frequency Space +for Backdoor Sample Filtering (GraCeFul), which leverages sample-wise gradients +in the frequency space to effectively identify backdoor samples without +requiring retraining LLMs. Experimental results show that GraCeFul outperforms +baselines significantly. Notably, GraCeFul exhibits remarkable computational +efficiency, achieving nearly 100% recall and F1 scores in identifying backdoor +samples, reducing the average success rate of various backdoor attacks to 0% +with negligible drops in clean accuracy across multiple free-style question +answering datasets. Additionally, GraCeFul generalizes to Llama-2 and Vicuna. +The codes are publicly available at https://github.com/ZrW00/GraceFul. + +
+
+ comment: Accepted at COLING 2025 +
+
+
+
+
+ + ☆ BYE: Build Your Encoder with One Sequence of Exploration Data for + Long-Term Dynamic Scene Understanding + + +
+ Dynamic scene understanding remains a persistent challenge in robotic +applications. Early dynamic mapping methods focused on mitigating the negative +influence of short-term dynamic objects on camera motion estimation by masking +or tracking specific categories, which often fall short in adapting to +long-term scene changes. Recent efforts address object association in long-term +dynamic environments using neural networks trained on synthetic datasets, but +they still rely on predefined object shapes and categories. Other methods +incorporate visual, geometric, or semantic heuristics for the association but +often lack robustness. In this work, we introduce BYE, a class-agnostic, +per-scene point cloud encoder that removes the need for predefined categories, +shape priors, or extensive association datasets. Trained on only a single +sequence of exploration data, BYE can efficiently perform object association in +dynamically changing scenes. We further propose an ensembling scheme combining +the semantic strengths of Vision Language Models (VLMs) with the scene-specific +expertise of BYE, achieving a 7% improvement and a 95% success rate in object +association tasks. Code and dataset are available at +https://byencoder.github.io. + +
+
+
+
+
+ + ☆ Artificial Expert Intelligence through PAC-reasoning + + +
+ Artificial Expert Intelligence (AEI) seeks to transcend the limitations of +both Artificial General Intelligence (AGI) and narrow AI by integrating +domain-specific expertise with critical, precise reasoning capabilities akin to +those of top human experts. Existing AI systems often excel at predefined tasks +but struggle with adaptability and precision in novel problem-solving. To +overcome this, AEI introduces a framework for ``Probably Approximately Correct +(PAC) Reasoning". This paradigm provides robust theoretical guarantees for +reliably decomposing complex problems, with a practical mechanism for +controlling reasoning precision. In reference to the division of human thought +into System 1 for intuitive thinking and System 2 for reflective +reasoning~\citep{tversky1974judgment}, we refer to this new type of reasoning +as System 3 for precise reasoning, inspired by the rigor of the scientific +method. AEI thus establishes a foundation for error-bounded, inference-time +learning. + +
+
+
+
+
+ + ☆ GerPS-Compare: Comparing NER methods for legal norm analysis + + +
+ We apply NER to a particular sub-genre of legal texts in German: the genre of
+legal norms regulating administrative processes in public service
+administration. The analysis of such texts involves identifying stretches of
+text that instantiate one of ten classes identified by public service
+administration professionals. We investigate and compare three methods for
+performing Named Entity Recognition (NER) to detect these classes: a Rule-based
+system, deep discriminative models, and a deep generative model. Our results
+show that Deep Discriminative models outperform both the Rule-based system as
+well as the Deep Generative model, the latter two roughly performing equally
+well, outperforming each other in different classes. The main cause for this
+somewhat surprising result is arguably the fact that the classes used in the
+analysis are semantically and syntactically heterogeneous, in contrast to the
+classes used in more standard NER tasks. Deep Discriminative models appear to
+be better equipped for dealing with this heterogeneity than both generic LLMs
+and human linguists designing rule-based NER systems.
+
</p>
+
+
+
+
+ + ☆ Knowledge-Enhanced Conversational Recommendation via Transformer-based + Sequential Modelling + + +
+ In conversational recommender systems (CRSs), conversations usually involve a +set of items and item-related entities or attributes, e.g., director is a +related entity of a movie. These items and item-related entities are often +mentioned along the development of a dialog, leading to potential sequential +dependencies among them. However, most of existing CRSs neglect these potential +sequential dependencies. In this article, we first propose a Transformer-based +sequential conversational recommendation method, named TSCR, to model the +sequential dependencies in the conversations to improve CRS. In TSCR, we +represent conversations by items and the item-related entities, and construct +user sequences to discover user preferences by considering both the mentioned +items and item-related entities. Based on the constructed sequences, we deploy +a Cloze task to predict the recommended items along a sequence. Meanwhile, in +certain domains, knowledge graphs formed by the items and their related +entities are readily available, which provide various different kinds of +associations among them. Given that TSCR does not benefit from such knowledge +graphs, we then propose a knowledge graph enhanced version of TSCR, called +TSCRKG. In specific, we leverage the knowledge graph to offline initialize our +model TSCRKG, and augment the user sequence of conversations (i.e., sequence of +the mentioned items and item-related entities in the conversation) with +multi-hop paths in the knowledge graph. Experimental results demonstrate that +our TSCR model significantly outperforms state-of-the-art baselines, and the +enhanced version TSCRKG further improves recommendation performance on top of +TSCR. + +
+
+ comment: Accepted by ACM TOIS +
+
+
+
+
+ + ☆ VISTA: A Panoramic View of Neural Representations + + +
+ We present VISTA (Visualization of Internal States and Their Associations), a +novel pipeline for visually exploring and interpreting neural network +representations. VISTA addresses the challenge of analyzing vast +multidimensional spaces in modern machine learning models by mapping +representations into a semantic 2D space. The resulting collages visually +reveal patterns and relationships within internal representations. We +demonstrate VISTA's utility by applying it to sparse autoencoder latents +uncovering new properties and interpretations. We review the VISTA methodology, +present findings from our case study ( https://got.drib.net/latents/ ), and +discuss implications for neural network interpretability across various domains +of machine learning. + +
+
+
+
+
+ + ☆ A Multi-Agent Framework for Extensible Structured Text Generation in + PLCs + + +
+ Programmable Logic Controllers (PLCs) are microcomputers essential for
+automating factory operations. Structured Text (ST), a high-level language
+adhering to the IEC 61131-3 standard, is pivotal for PLCs due to its ability to
+express logic succinctly and to seamlessly integrate with other languages
+within the same standard. However, vendors develop their own customized
+versions of ST, and the lack of comprehensive and standardized documentation
+for the full semantics of ST has contributed to inconsistencies in how the
+language is implemented. Consequently, the steep learning curve associated with
+ST, combined with ever-evolving industrial requirements, presents significant
+challenges for developers. In response to these issues, we present AutoPLC, an
+LLM-based approach designed to automate the generation of vendor-specific ST
+code. To facilitate effective code generation, we first built a comprehensive
+knowledge base, including Rq2ST Case Library (requirements and corresponding
+implementations) and Instruction libraries. Then we developed a retrieval
+module to incorporate the domain-specific knowledge by identifying pertinent
+cases and instructions, guiding the LLM to generate code that meets the
+requirements. In order to verify and improve the quality of the generated code,
+we designed an adaptable code checker. If errors are detected, we initiate an
+iterative self-improvement process to instruct the LLM to revise the generated
+code. We evaluate AutoPLC's performance against seven state-of-the-art
+baselines using three benchmarks, one for open-source basic ST and two for
+commercial Structured Control Language (SCL) from Siemens. The results show
+that our approach consistently achieves superior performance across all
+benchmarks. Ablation study emphasizes the significance of our modules. Further
+manual analysis confirms the practical utility of the ST code generated by
+AutoPLC.
+
+
+
+
+
+ + ☆ OMENN: One Matrix to Explain Neural Networks + + +
+ Deep Learning (DL) models are often black boxes, making their decision-making +processes difficult to interpret. This lack of transparency has driven +advancements in eXplainable Artificial Intelligence (XAI), a field dedicated to +clarifying the reasoning behind DL model predictions. Among these, +attribution-based methods such as LRP and GradCAM are widely used, though they +rely on approximations that can be imprecise. + To address these limitations, we introduce One Matrix to Explain Neural +Networks (OMENN), a novel post-hoc method that represents a neural network as a +single, interpretable matrix for each specific input. This matrix is +constructed through a series of linear transformations that represent the +processing of the input by each successive layer in the neural network. As a +result, OMENN provides locally precise, attribution-based explanations of the +input across various modern models, including ViTs and CNNs. We present a +theoretical analysis of OMENN based on dynamic linearity property and validate +its effectiveness with extensive tests on two XAI benchmarks, demonstrating +that OMENN is competitive with state-of-the-art methods. + +
+
+ comment: Under review, code will be released after acceptance +
+
+
+
+
+ + ☆ HERO: Hint-Based Efficient and Reliable Query Optimizer VLDB 2025 + + +
+ We propose a novel model for learned query optimization which provides query +hints leading to better execution plans. The model addresses the three key +challenges in learned hint-based query optimization: reliable hint +recommendation (ensuring non-degradation of query latency), efficient hint +exploration, and fast inference. We provide an in-depth analysis of existing +NN-based approaches to hint-based optimization and experimentally confirm the +named challenges for them. Our alternative solution consists of a new inference +schema based on an ensemble of context-aware models and a graph storage for +reliable hint suggestion and fast inference, and a budget-controlled training +procedure with a local search algorithm that solves the issue of exponential +search space exploration. In experiments on standard benchmarks, our model +demonstrates optimization capability close to the best achievable with +coarse-grained hints. Controlling the degree of parallelism (query dop) in +addition to operator-related hints enables our model to achieve 3x latency +improvement on JOB benchmark which sets a new standard for optimization. Our +model is interpretable and easy to debug, which is particularly important for +deployment in production. + +
+
+ comment: Submitted to VLDB 2025; 13 pages; 13 figures +
+
+
+
+
+ + ☆ ScImage: How Good Are Multimodal Large Language Models at Scientific + Text-to-Image Generation? + + +
+ Multimodal large language models (LLMs) have demonstrated impressive +capabilities in generating high-quality images from textual instructions. +However, their performance in generating scientific images--a critical +application for accelerating scientific progress--remains underexplored. In +this work, we address this gap by introducing ScImage, a benchmark designed to +evaluate the multimodal capabilities of LLMs in generating scientific images +from textual descriptions. ScImage assesses three key dimensions of +understanding: spatial, numeric, and attribute comprehension, as well as their +combinations, focusing on the relationships between scientific objects (e.g., +squares, circles). We evaluate five models, GPT-4o, Llama, AutomaTikZ, Dall-E, +and StableDiffusion, using two modes of output generation: code-based outputs +(Python, TikZ) and direct raster image generation. Additionally, we examine +four different input languages: English, German, Farsi, and Chinese. Our +evaluation, conducted with 11 scientists across three criteria (correctness, +relevance, and scientific accuracy), reveals that while GPT-4o produces outputs +of decent quality for simpler prompts involving individual dimensions such as +spatial, numeric, or attribute understanding in isolation, all models face +challenges in this task, especially for more complex prompts. + +
+
+
+
+
+ + ☆ Dynamic Prompt Middleware: Contextual Prompt Refinement Controls for + Comprehension Tasks + + +
+ Effective prompting of generative AI is challenging for many users, +particularly in expressing context for comprehension tasks such as explaining +spreadsheet formulas, Python code, and text passages. Prompt middleware aims to +address this barrier by assisting in prompt construction, but barriers remain +for users in expressing adequate control so that they can receive AI-responses +that match their preferences. + We conduct a formative survey (n=38) investigating user needs for control +over AI-generated explanations in comprehension tasks, which uncovers a +trade-off between standardized but predictable support for prompting, and +adaptive but unpredictable support tailored to the user and task. To explore +this trade-off, we implement two prompt middleware approaches: Dynamic Prompt +Refinement Control (Dynamic PRC) and Static Prompt Refinement Control (Static +PRC). The Dynamic PRC approach generates context-specific UI elements that +provide prompt refinements based on the user's prompt and user needs from the +AI, while the Static PRC approach offers a preset list of generally applicable +refinements. + We evaluate these two approaches with a controlled user study (n=16) to +assess the impact of these approaches on user control of AI responses for +crafting better explanations. Results show a preference for the Dynamic PRC +approach as it afforded more control, lowered barriers to providing context, +and encouraged exploration and reflection of the tasks, but that reasoning +about the effects of different generated controls on the final output remains +challenging. Drawing on participant feedback, we discuss design implications +for future Dynamic PRC systems that enhance user control of AI responses. Our +findings suggest that dynamic prompt middleware can improve the user experience +of generative AI workflows by affording greater control and guide users to a +better AI response. + +
+
+
+
+
+ + ☆ Reinforcement learning to learn quantum states for Heisenberg scaling + accuracy + + +
+ Learning quantum states is a crucial task for realizing the potential of +quantum information technology. Recently, neural approaches have emerged as +promising methods for learning quantum states. We propose a meta-learning model +that employs reinforcement learning (RL) to optimize the process of learning +quantum states. For learning quantum states, our scheme trains a Hardware +efficient ansatz with a blackbox optimization algorithm, called evolution +strategy (ES). To enhance the efficiency of ES, a RL agent dynamically adjusts +the hyperparameters of ES. To facilitate the RL training, we introduce an +action repetition strategy inspired by curriculum learning. The RL agent +significantly improves the sample efficiency of learning random quantum states, +and achieves infidelity scaling close to the Heisenberg limit. We showcase that +the RL agent trained using 3-qubit states can be generalized to learning up to +5-qubit states. These results highlight the utility of RL-driven meta-learning +to enhance the efficiency and generalizability of learning quantum states. Our +approach can be applicable to improve quantum control, quantum optimization, +and quantum machine learning. + +
+
+ comment: 14 pages, 6 figures +
+
+
+
+
+ + ☆ Sample Efficient Robot Learning in Supervised Effect Prediction Tasks + + +
+ In self-supervised robot learning, robots actively explore their environments
+and generate data by acting on entities in the environment. Therefore, an
+exploration policy is desired that ensures sample efficiency to minimize robot
+execution costs while still providing accurate learning. For this purpose, the
+robotic community has adopted Intrinsic Motivation (IM)-based approaches such
+as Learning Progress (LP). On the machine learning front, Active Learning (AL)
+has been used successfully, especially for classification tasks. In this work,
+we develop a novel AL framework geared towards robotics regression tasks, such
+as action-effect prediction and, more generally, for world model learning,
+which we call MUSEL - Model Uncertainty for Sample Efficient Learning. MUSEL
+aims to extract model uncertainty from the total uncertainty estimate given by
+a suitable learning engine by making use of learning progress and input
+diversity and use it to improve sample efficiency beyond the state-of-the-art
+action-effect prediction methods. We demonstrate the feasibility of our model
+by using a Stochastic Variational Gaussian Process (SVGP) as the learning
+engine and testing the system on a set of robotic experiments in simulation.
+The efficacy of MUSEL is demonstrated by comparing its performance to standard
+methods used in robot action-effect learning. In a robotic tabletop environment
+in which a robot manipulator is tasked with learning the effect of its actions,
+the experiments show that MUSEL facilitates higher accuracy in learning action
+effects while ensuring sample efficiency.
+
+
+ comment: 18 pages, 18 figures +
+
+
+
+
+ + ☆ Switchable deep beamformer for high-quality and real-time passive + acoustic mapping + + +
+ Passive acoustic mapping (PAM) is a promising tool for monitoring acoustic +cavitation activities in the applications of ultrasound therapy. Data-adaptive +beamformers for PAM have better image quality compared to the time exposure +acoustics (TEA) algorithms. However, the computational cost of data-adaptive +beamformers is considerably expensive. In this work, we develop a deep +beamformer based on a generative adversarial network, which can switch between +different transducer arrays and reconstruct high-quality PAM images directly +from radio frequency ultrasound signals with low computational cost. The deep +beamformer was trained on the dataset consisting of simulated and experimental +cavitation signals of single and multiple microbubble clouds measured by +different (linear and phased) arrays covering 1-15 MHz. We compared the +performance of the deep beamformer to TEA and three different data-adaptive +beamformers using the simulated and experimental test dataset. Compared with +TEA, the deep beamformer reduced the energy spread area by 18.9%-65.0% and +improved the image signal-to-noise ratio by 9.3-22.9 dB in average for the +different arrays in our data. Compared to the data-adaptive beamformers, the +deep beamformer reduced the computational cost by three orders of magnitude +achieving 10.5 ms image reconstruction speed in our data, while the image +quality was as good as that of the data-adaptive beamformers. These results +demonstrated the potential of the deep beamformer for high-resolution +monitoring of microbubble cavitation activities for ultrasound therapy. + +
+
+
+
+
+ + ☆ Enhanced Photovoltaic Power Forecasting: An iTransformer and LSTM-Based + Model Integrating Temporal and Covariate Interactions + + +
+ Accurate photovoltaic (PV) power forecasting is critical for integrating
+renewable energy sources into the grid, optimizing real-time energy management,
+and ensuring energy reliability amidst increasing demand. However, existing
+models often struggle with effectively capturing the complex relationships
+between target variables and covariates, as well as the interactions between
+temporal dynamics and multivariate data, leading to suboptimal forecasting
+accuracy. To address these challenges, we propose a novel model architecture
+that leverages the iTransformer for feature extraction from target variables
+and employs long short-term memory (LSTM) to extract features from covariates.
+A cross-attention mechanism is integrated to fuse the outputs of both models,
+followed by a Kolmogorov-Arnold network (KAN) mapping for enhanced
+representation. The effectiveness of the proposed model is validated using
+publicly available datasets from Australia, with experiments conducted across
+four seasons. Results demonstrate that the proposed model effectively captures
+seasonal variations in PV power generation and improves forecasting accuracy.
+
+
+
+
+
+ + ☆ Large Multimodal Agents for Accurate Phishing Detection with Enhanced + Token Optimization and Cost Reduction + + +
+ With the rise of sophisticated phishing attacks, there is a growing need for +effective and economical detection solutions. This paper explores the use of +large multimodal agents, specifically Gemini 1.5 Flash and GPT-4o mini, to +analyze both URLs and webpage screenshots via APIs, thus avoiding the +complexities of training and maintaining AI systems. Our findings indicate that +integrating these two data types substantially enhances detection performance +over using either type alone. However, API usage incurs costs per query that +depend on the number of input and output tokens. To address this, we propose a +two-tiered agentic approach: initially, one agent assesses the URL, and if +inconclusive, a second agent evaluates both the URL and the screenshot. This +method not only maintains robust detection performance but also significantly +reduces API costs by minimizing unnecessary multi-input queries. Cost analysis +shows that with the agentic approach, GPT-4o mini can process about 4.2 times +as many websites per $100 compared to the multimodal approach (107,440 vs. +25,626), and Gemini 1.5 Flash can process about 2.6 times more websites +(2,232,142 vs. 862,068). These findings underscore the significant economic +benefits of the agentic approach over the multimodal method, providing a viable +solution for organizations aiming to leverage advanced AI for phishing +detection while controlling expenses. + +
+
+ comment: Accepted in the 2nd International Conference on Foundation and Large + Language Models (FLLM2024) +
+
+
+
+
+ + ☆ CADMR: Cross-Attention and Disentangled Learning for Multimodal + Recommender Systems + + +
+ The increasing availability and diversity of multimodal data in recommender +systems offer new avenues for enhancing recommendation accuracy and user +satisfaction. However, these systems must contend with high-dimensional, sparse +user-item rating matrices, where reconstructing the matrix with only small +subsets of preferred items for each user poses a significant challenge. To +address this, we propose CADMR, a novel autoencoder-based multimodal +recommender system framework. CADMR leverages multi-head cross-attention +mechanisms and Disentangled Learning to effectively integrate and utilize +heterogeneous multimodal data in reconstructing the rating matrix. Our approach +first disentangles modality-specific features while preserving their +interdependence, thereby learning a joint latent representation. The multi-head +cross-attention mechanism is then applied to enhance user-item interaction +representations with respect to the learned multimodal item latent +representations. We evaluate CADMR on three benchmark datasets, demonstrating +significant performance improvements over state-of-the-art methods. + +
+
+
+
+
+ + ☆ Initial Study On Improving Segmentation By Combining Preoperative CT And + Intraoperative CBCT Using Synthetic Data + + +
+ Computer-Assisted Interventions enable clinicians to perform precise, +minimally invasive procedures, often relying on advanced imaging methods. +Cone-beam computed tomography (CBCT) can be used to facilitate +computer-assisted interventions, despite often suffering from artifacts that +pose challenges for accurate interpretation. While the degraded image quality +can affect image analysis, the availability of high quality, preoperative scans +offers potential for improvements. Here we consider a setting where +preoperative CT and intraoperative CBCT scans are available, however, the +alignment (registration) between the scans is imperfect to simulate a real +world scenario. We propose a multimodal learning method that fuses roughly +aligned CBCT and CT scans and investigate the effect on segmentation +performance. For this experiment we use synthetically generated data containing +real CT and synthetic CBCT volumes with corresponding voxel annotations. We +show that this fusion setup improves segmentation performance in $18$ out of +$20$ investigated setups. + +
+
+ comment: Accepted at BVM 2025. arXiv admin note: text overlap with + arXiv:2406.11650 +
+
+
+
+
+ + ☆ Deep Matrix Factorization with Adaptive Weights for Multi-View + Clustering + + +
+ Recently, deep matrix factorization has been established as a powerful model
+for unsupervised tasks, achieving promising results, especially for multi-view
+clustering. However, existing methods often lack effective feature selection
+mechanisms and rely on empirical hyperparameter selection. To address these
+issues, we introduce a novel Deep Matrix Factorization with Adaptive Weights
+for Multi-View Clustering (DMFAW). Our method simultaneously incorporates
+feature selection and generates local partitions, enhancing clustering results.
+Notably, the feature weights are controlled and adjusted by a parameter that
+is dynamically updated using a Control Theory-inspired mechanism, which not
+only improves the model's stability and adaptability to diverse datasets but
+also accelerates convergence. A late fusion approach is then proposed to align
+the weighted local partitions with the consensus partition. Finally, the
+optimization problem is solved via an alternating optimization algorithm with
+theoretically guaranteed convergence. Extensive experiments on benchmark
+datasets highlight that DMFAW outperforms state-of-the-art methods in terms of
+clustering performance.
+
+
+
+
+
+ + ☆ Conformal Symplectic Optimization for Stable Reinforcement Learning + + +
+ Training deep reinforcement learning (RL) agents necessitates overcoming the +highly unstable nonconvex stochastic optimization inherent in the +trial-and-error mechanism. To tackle this challenge, we propose a +physics-inspired optimization algorithm called relativistic adaptive gradient +descent (RAD), which enhances long-term training stability. By conceptualizing +neural network (NN) training as the evolution of a conformal Hamiltonian +system, we present a universal framework for transferring long-term stability +from conformal symplectic integrators to iterative NN updating rules, where the +choice of kinetic energy governs the dynamical properties of resulting +optimization algorithms. By utilizing relativistic kinetic energy, RAD +incorporates principles from special relativity and limits parameter updates +below a finite speed, effectively mitigating abnormal gradient influences. +Additionally, RAD models NN optimization as the evolution of a multi-particle +system where each trainable parameter acts as an independent particle with an +individual adaptive learning rate. We prove RAD's sublinear convergence under +general nonconvex settings, where smaller gradient variance and larger batch +sizes contribute to tighter convergence. Notably, RAD degrades to the +well-known adaptive moment estimation (ADAM) algorithm when its speed +coefficient is chosen as one and symplectic factor as a small positive value. +Experimental results show RAD outperforming nine baseline optimizers with five +RL algorithms across twelve environments, including standard benchmarks and +challenging scenarios. Notably, RAD achieves up to a 155.1% performance +improvement over ADAM in Atari games, showcasing its efficacy in stabilizing +and accelerating RL training. + +
+
+
+
+
+ + ☆ GQWformer: A Quantum-based Transformer for Graph Representation Learning + + +
+ Graph Transformers (GTs) have demonstrated significant advantages in graph
+representation learning through their global attention mechanisms. However, the
+self-attention mechanism in GTs tends to neglect the inductive biases inherent
+in graph structures, making it challenging to effectively capture essential
+structural information. To address this issue, we propose a novel approach that
+integrates graph inductive bias into self-attention mechanisms by leveraging
+quantum technology for structural encoding. In this paper, we introduce the
+Graph Quantum Walk Transformer (GQWformer), a groundbreaking GNN framework that
+utilizes quantum walks on attributed graphs to generate node quantum states.
+These quantum states encapsulate rich structural attributes and serve as
+inductive biases for the transformer, thereby enabling the generation of more
+meaningful attention scores. By subsequently incorporating a recurrent neural
+network, our design amplifies the model's ability to focus on both local and
+global information. We conducted comprehensive experiments across five publicly
+available datasets to evaluate the effectiveness of our model. These results
+clearly indicate that GQWformer outperforms existing state-of-the-art graph
+classification algorithms. These findings highlight the significant potential
+of integrating quantum computing methodologies with traditional GNNs to advance
+the field of graph representation learning, providing a promising direction for
+future research and applications.
+
+
+
+
+
+ + ☆ VR Based Emotion Recognition Using Deep Multimodal Fusion With + Biosignals Across Multiple Anatomical Domains + + +
+ Emotion recognition is significantly enhanced by integrating multimodal
+biosignals and IMU data from multiple domains. In this paper, we introduce a
+novel multi-scale attention-based LSTM architecture, combined with
+Squeeze-and-Excitation (SE) blocks, by leveraging multi-domain signals from the
+head (Meta Quest Pro VR headset), trunk (Equivital Vest), and peripheral
+(Empatica Embrace Plus) during affect elicitation via visual stimuli. Signals
+from 23 participants were recorded, alongside self-assessed valence and arousal
+ratings after each stimulus. LSTM layers extract features from each modality,
+while multi-scale attention captures fine-grained temporal dependencies, and SE
+blocks recalibrate feature importance prior to classification. We assess which
+domain's signals carry the most distinctive emotional information during VR
+experiences, identifying key biosignals contributing to emotion detection. The
+proposed architecture, validated in a user study, demonstrates superior
+performance in classifying valence and arousal level (high / low), showcasing
+the efficacy of multi-domain and multi-modal fusion with biosignals (e.g.,
+TEMP, EDA) with IMU data (e.g., accelerometer) for emotion recognition in
+real-world applications.
+
+
+ comment: 14 pages, 6 figures +
+
+
+
+
+ + ☆ AH-OCDA: Amplitude-based Curriculum Learning and Hopfield Segmentation + Model for Open Compound Domain Adaptation WACV 2025 + + +
+ Open compound domain adaptation (OCDA) is a practical domain adaptation +problem that consists of a source domain, target compound domain, and unseen +open domain. In this problem, the absence of domain labels and pixel-level +segmentation labels for both compound and open domains poses challenges to the +direct application of existing domain adaptation and generalization methods. To +address this issue, we propose Amplitude-based curriculum learning and a +Hopfield segmentation model for Open Compound Domain Adaptation (AH-OCDA). Our +method comprises two complementary components: 1) amplitude-based curriculum +learning and 2) Hopfield segmentation model. Without prior knowledge of target +domains within the compound domains, amplitude-based curriculum learning +gradually induces the semantic segmentation model to adapt from the near-source +compound domain to the far-source compound domain by ranking unlabeled compound +domain images through Fast Fourier Transform (FFT). Additionally, the Hopfield +segmentation model maps segmentation feature distributions from arbitrary +domains to the feature distributions of the source domain. AH-OCDA achieves +state-of-the-art performance on two OCDA benchmarks and extended open domains, +demonstrating its adaptability to continuously changing compound domains and +unseen open domains. + +
+
+ comment: WACV 2025 +
+
+
+
+
+ + ☆ A Comprehensive Evaluation of Large Language Models on Aspect-Based + Sentiment Analysis + + +
+ Recently, Large Language Models (LLMs) have garnered increasing attention in +the field of natural language processing, revolutionizing numerous downstream +tasks with powerful reasoning and generation abilities. For example, In-Context +Learning (ICL) introduces a fine-tuning-free paradigm, allowing out-of-the-box +LLMs to execute downstream tasks by analogy learning without any fine-tuning. +Besides, in a fine-tuning-dependent paradigm where substantial training data +exists, Parameter-Efficient Fine-Tuning (PEFT), as the cost-effective methods, +enable LLMs to achieve excellent performance comparable to full fine-tuning. + However, these fascinating techniques employed by LLMs have not been fully +exploited in the ABSA field. Previous works probe LLMs in ABSA by merely using +randomly selected input-output pairs as demonstrations in ICL, resulting in an +incomplete and superficial evaluation. In this paper, we shed light on a +comprehensive evaluation of LLMs in the ABSA field, involving 13 datasets, 8 +ABSA subtasks, and 6 LLMs. Specifically, we design a unified task formulation +to unify ``multiple LLMs for multiple ABSA subtasks in multiple paradigms.'' +For the fine-tuning-dependent paradigm, we efficiently fine-tune LLMs using +instruction-based multi-task learning. For the fine-tuning-free paradigm, we +propose 3 demonstration selection strategies to stimulate the few-shot +abilities of LLMs. Our extensive experiments demonstrate that LLMs achieve a +new state-of-the-art performance compared to fine-tuned Small Language Models +(SLMs) in the fine-tuning-dependent paradigm. More importantly, in the +fine-tuning-free paradigm where SLMs are ineffective, LLMs with ICL still +showcase impressive potential and even compete with fine-tuned SLMs on some +ABSA subtasks. + +
+
+
+
+
+ + ☆ Sustainable Self-evolution Adversarial Training + + +
+ With the wide application of deep neural network models in various computer +vision tasks, there has been a proliferation of adversarial example generation +strategies aimed at deeply exploring model security. However, existing +adversarial training defense models, which rely on single or limited types of +attacks under a one-time learning process, struggle to adapt to the dynamic and +evolving nature of attack methods. Therefore, to achieve defense performance +improvements for models in long-term applications, we propose a novel +Sustainable Self-Evolution Adversarial Training (SSEAT) framework. +Specifically, we introduce a continual adversarial defense pipeline to realize +learning from various kinds of adversarial examples across multiple stages. +Additionally, to address the issue of model catastrophic forgetting caused by +continual learning from ongoing novel attacks, we propose an adversarial data +replay module to better select more diverse and key relearning data. +Furthermore, we design a consistency regularization strategy to encourage +current defense models to learn more from previously trained ones, guiding them +to retain more past knowledge and maintain accuracy on clean samples. Extensive +experiments have been conducted to verify the efficacy of the proposed SSEAT +defense method, which demonstrates superior defense performance and +classification accuracy compared to competitors. + +
+
+ comment: Accepted to ACMMM 2024 +
+
+
+
+
+ + ☆ Connecting Large Language Models with Blockchain: Advancing the + Evolution of Smart Contracts from Automation to Intelligence + + +
+ Blockchain smart contracts have catalyzed the development of decentralized +applications across various domains, including decentralized finance. However, +due to constraints in computational resources and the prevalence of data silos, +current smart contracts face significant challenges in fully leveraging the +powerful capabilities of Large Language Models (LLMs) for tasks such as +intelligent analysis and reasoning. To address this gap, this paper proposes +and implements a universal framework for integrating LLMs with blockchain data, +{\sysname}, effectively overcoming the interoperability barriers between +blockchain and LLMs. By combining semantic relatedness with truth discovery +methods, we introduce an innovative data aggregation approach, {\funcname}, +which significantly enhances the accuracy and trustworthiness of data generated +by LLMs. To validate the framework's effectiveness, we construct a dataset +consisting of three types of questions, capturing Q\&A interactions between 10 +oracle nodes and 5 LLM models. Experimental results demonstrate that, even with +40\% malicious nodes, the proposed solution improves data accuracy by an +average of 17.74\% compared to the optimal baseline. This research not only +provides an innovative solution for the intelligent enhancement of smart +contracts but also highlights the potential for deep integration between LLMs +and blockchain technology, paving the way for more intelligent and complex +applications of smart contracts in the future. + +
+
+ comment: 10 pages, 8 figures +
+
+
+
+
+ + ☆ VideoGen-of-Thought: A Collaborative Framework for Multi-Shot Video + Generation + + +
+ Current video generation models excel at generating short clips but still +struggle with creating multi-shot, movie-like videos. Existing models trained +on large-scale data on the back of rich computational resources are +unsurprisingly inadequate for maintaining a logical storyline and visual +consistency across multiple shots of a cohesive script since they are often +trained with a single-shot objective. To this end, we propose +VideoGen-of-Thought (VGoT), a collaborative and training-free architecture +designed specifically for multi-shot video generation. VGoT is designed with +three goals in mind as follows. Multi-Shot Video Generation: We divide the +video generation process into a structured, modular sequence, including (1) +Script Generation, which translates a curt story into detailed prompts for each +shot; (2) Keyframe Generation, responsible for creating visually consistent +keyframes faithful to character portrayals; and (3) Shot-Level Video +Generation, which transforms information from scripts and keyframes into shots; +(4) Smoothing Mechanism that ensures a consistent multi-shot output. Reasonable +Narrative Design: Inspired by cinematic scriptwriting, our prompt generation +approach spans five key domains, ensuring logical consistency, character +development, and narrative flow across the entire video. Cross-Shot +Consistency: We ensure temporal and identity consistency by leveraging +identity-preserving (IP) embeddings across shots, which are automatically +created from the narrative. Additionally, we incorporate a cross-shot smoothing +mechanism, which integrates a reset boundary that effectively combines latent +features from adjacent shots, resulting in smooth transitions and maintaining +visual coherence throughout the video. Our experiments demonstrate that VGoT +surpasses existing video generation methods in producing high-quality, +coherent, multi-shot videos. + +
+
+ comment: Webpage: https://cheliosoops.github.io/VGoT +
+
+
+
+
+ + ☆ Selective Reviews of Bandit Problems in AI via a Statistical View + + +
+ Reinforcement Learning (RL) is a widely researched area in artificial +intelligence that focuses on teaching agents decision-making through +interactions with their environment. A key subset includes stochastic +multi-armed bandit (MAB) and continuum-armed bandit (SCAB) problems, which +model sequential decision-making under uncertainty. This review outlines the +foundational models and assumptions of bandit problems, explores non-asymptotic +theoretical tools like concentration inequalities and minimax regret bounds, +and compares frequentist and Bayesian algorithms for managing +exploration-exploitation trade-offs. We also extend the discussion to $K$-armed +contextual bandits and SCAB, examining their methodologies, regret analyses, +and discussing the relation between the SCAB problems and the functional data +analysis. Finally, we highlight recent advances and ongoing challenges in the +field. + +
+
+ comment: 46 pages, 5 figures
+</p>
+
+
+
+
+ + ☆ U-Net in Medical Image Segmentation: A Review of Its Applications Across + Modalities + + +
+ Medical imaging is essential in healthcare to provide key insights into +patient anatomy and pathology, aiding in diagnosis and treatment. Non-invasive +techniques such as X-ray, Magnetic Resonance Imaging (MRI), Computed Tomography +(CT), and Ultrasound (US), capture detailed images of organs, tissues, and +abnormalities. Effective analysis of these images requires precise segmentation +to delineate regions of interest (ROI), such as organs or lesions. Traditional +segmentation methods, relying on manual feature-extraction, are labor-intensive +and vary across experts. Recent advancements in Artificial Intelligence (AI) +and Deep Learning (DL), particularly convolutional models such as U-Net and its +variants (U-Net++ and U-Net 3+), have transformed medical image segmentation +(MIS) by automating the process and enhancing accuracy. These models enable +efficient, precise pixel-wise classification across various imaging modalities, +overcoming the limitations of manual segmentation. This review explores various +medical imaging techniques, examines the U-Net architectures and their +adaptations, and discusses their application across different modalities. It +also identifies common challenges in MIS and proposes potential solutions. + +
+
+
+
+
+ + ☆ Cross-Attention Head Position Patterns Can Align with Human Visual + Concepts in Text-to-Image Generative Models + + +
+ Recent text-to-image diffusion models leverage cross-attention layers, which +have been effectively utilized to enhance a range of visual generative tasks. +However, our understanding of cross-attention layers remains somewhat limited. +In this study, we present a method for constructing Head Relevance Vectors +(HRVs) that align with useful visual concepts. An HRV for a given visual +concept is a vector with a length equal to the total number of cross-attention +heads, where each element represents the importance of the corresponding head +for the given visual concept. We develop and employ an ordered weakening +analysis to demonstrate the effectiveness of HRVs as interpretable features. To +demonstrate the utility of HRVs, we propose concept strengthening and concept +adjusting methods and apply them to enhance three visual generative tasks. We +show that misinterpretations of polysemous words in image generation can be +corrected in most cases, five challenging attributes in image editing can be +successfully modified, and catastrophic neglect in multi-concept generation can +be mitigated. Overall, our work provides an advancement in understanding +cross-attention layers and introduces new approaches for fine-controlling these +layers at the head level. + +
+
+
+
+
+ + ☆ BANER: Boundary-Aware LLMs for Few-Shot Named Entity Recognition COLING 2025 + + +
+ Despite the recent success of two-stage prototypical networks in few-shot +named entity recognition (NER), challenges such as over/under-detected false +spans in the span detection stage and unaligned entity prototypes in the type +classification stage persist. Additionally, LLMs have not proven to be +effective few-shot information extractors in general. In this paper, we propose +an approach called Boundary-Aware LLMs for Few-Shot Named Entity Recognition to +address these issues. We introduce a boundary-aware contrastive learning +strategy to enhance the LLM's ability to perceive entity boundaries for +generalized entity spans. Additionally, we utilize LoRAHub to align information +from the target domain to the source domain, thereby enhancing adaptive +cross-domain classification capabilities. Extensive experiments across various +benchmarks demonstrate that our framework outperforms prior methods, validating +its effectiveness. In particular, the proposed strategies demonstrate +effectiveness across a range of LLM architectures. The code and data are +released on https://github.com/UESTC-GQJ/BANER. + +
+
+ comment: Appear on COLING 2025 +
+
+
+
+
+ + ☆ Deep learning approach for predicting the replicator equation in + evolutionary game theory + + +
+ This paper presents a physics-informed deep learning approach for predicting
+the replicator equation, allowing accurate forecasting of population dynamics.
+This methodological innovation allows us to derive governing differential or
+difference equations for systems that lack explicit mathematical models. We
+used the SINDy model first introduced by Fasel, Kaiser, Kutz, Brunton, and
+Brunton 2016a to get the replicator equation, which will significantly advance
+our understanding of evolutionary biology, economic systems, and social
+dynamics. By refining predictive models across multiple disciplines, including
+ecology, social structures, and moral behaviours, our work offers new insights
+into the complex interplay of variables shaping evolutionary outcomes in
+dynamic systems.
+
+</p>
+
+
+
+
+ + ☆ Unlocking Tuning-Free Few-Shot Adaptability in Visual Foundation Models + by Recycling Pre-Tuned LoRAs + + +
+ Large Language Models (LLMs) such as ChatGPT demonstrate strong few-shot +adaptability without requiring fine-tuning, positioning them ideal for +data-limited and real-time applications. However, this adaptability has not yet +been replicated in current Visual Foundation Models (VFMs), which require +explicit fine-tuning with sufficient tuning data. Besides, the +pretraining-finetuning paradigm has led to the surge of numerous task-specific +modular components, such as Low-Rank Adaptation (LoRA). For the first time, we +explore the potential of reusing diverse pre-tuned LoRAs without accessing +their original training data, to achieve tuning-free few-shot adaptation in +VFMs. Our framework, LoRA Recycle, distills a meta-LoRA from diverse pre-tuned +LoRAs with a meta-learning objective, using surrogate data generated inversely +from pre-tuned LoRAs themselves. The VFM, once equipped with the meta-LoRA, is +empowered to solve new few-shot tasks in a single forward pass, akin to the +in-context learning of LLMs. Additionally, we incorporate a double-efficient +mechanism tailored to our framework, significantly accelerating the +meta-training process while maintaining or even improving performance. +Extensive experiments across various few-shot classification benchmarks across +both in- and cross-domain scenarios demonstrate the superiority of our +framework. + +
+
+
+
+
+ + ☆ Recovering implicit physics model under real-world constraints ECAI 2024 + + +
+ Recovering a physics-driven model, i.e. a governing set of equations of the
+underlying dynamical systems, from the real-world data has been of recent
+interest. Most existing methods either operate on simulation data with
+unrealistically high sampling rates or require explicit measurements of all
+system variables, which is not amenable in real-world deployments. Moreover,
+they assume the timestamps of external perturbations to the physical system are
+known a priori, without uncertainty, implicitly discounting any sensor
+time-synchronization or human reporting errors. In this paper, we propose a
+novel liquid time constant neural network (LTC-NN) based architecture to
+recover underlying model of physical dynamics from real-world data. The
+automatic differentiation property of LTC-NN nodes overcomes problems
+associated with low sampling rates, the input dependent time constant in the
+forward pass of the hidden layer of LTC-NN nodes creates a massive search space
+of implicit physical dynamics, the physics model solver based data
+reconstruction loss guides the search for the correct set of implicit dynamics,
+and the use of the dropout regularization in the dense layer ensures extraction
+of the sparsest model. Further, to account for the perturbation timing error,
+we utilize dense layer nodes to search through input shifts that result in the
+lowest reconstruction loss. Experiments on four benchmark dynamical systems,
+three with simulation data and one with the real-world data show that the
+LTC-NN architecture is more accurate in recovering implicit physics model
+coefficients than the state-of-the-art sparse model recovery approaches. We
+also introduce four additional case studies (total eight) on real-life medical
+examples in simulation and with real-world clinical data to show effectiveness
+of our approach in recovering underlying model in practice.
+
+</p>
+
+ comment: This paper is published in ECAI 2024, + https://ebooks.iospress.nl/volumearticle/69651 +
+
+
+
+
+
+ ☆ DataLab: A Unified Platform for LLM-Powered Business Intelligence
+
+</p>
+ Business intelligence (BI) transforms large volumes of data within modern
+organizations into actionable insights for informed decision-making. Recently,
+large language model (LLM)-based agents have streamlined the BI workflow by
+automatically performing task planning, reasoning, and actions in executable
+environments based on natural language (NL) queries. However, existing
+approaches primarily focus on individual BI tasks such as NL2SQL and NL2VIS.
+The fragmentation of tasks across different data roles and tools leads to
+inefficiencies and potential errors due to the iterative and collaborative
+nature of BI. In this paper, we introduce DataLab, a unified BI platform that
+integrates a one-stop LLM-based agent framework with an augmented computational
+notebook interface. DataLab supports a wide range of BI tasks for different
+data roles by seamlessly combining LLM assistance with user customization
+within a single environment. To achieve this unification, we design a domain
+knowledge incorporation module tailored for enterprise-specific BI tasks, an
+inter-agent communication mechanism to facilitate information sharing across
+the BI workflow, and a cell-based context management strategy to enhance
+context utilization efficiency in BI notebooks. Extensive experiments
+demonstrate that DataLab achieves state-of-the-art performance on various BI
+tasks across popular research benchmarks. Moreover, DataLab maintains high
+effectiveness and efficiency on real-world datasets from Tencent, achieving up
+to a 58.58% increase in accuracy and a 61.65% reduction in token cost on
+enterprise-specific BI tasks.
+
+</p>
+
+
+
+
+ + ☆ LayoutVLM: Differentiable Optimization of 3D Layout via Vision-Language + Models + + +
+ Open-universe 3D layout generation arranges unlabeled 3D assets conditioned +on language instruction. Large language models (LLMs) struggle with generating +physically plausible 3D scenes and adherence to input instructions, +particularly in cluttered scenes. We introduce LayoutVLM, a framework and scene +layout representation that exploits the semantic knowledge of Vision-Language +Models (VLMs) and supports differentiable optimization to ensure physical +plausibility. LayoutVLM employs VLMs to generate two mutually reinforcing +representations from visually marked images, and a self-consistent decoding +process to improve VLMs spatial planning. Our experiments show that LayoutVLM +addresses the limitations of existing LLM and constraint-based approaches, +producing physically plausible 3D layouts better aligned with the semantic +intent of input language instructions. We also demonstrate that fine-tuning +VLMs with the proposed scene layout representation extracted from existing +scene datasets can improve performance. + +
+
+ comment: project website: https://ai.stanford.edu/~sunfanyun/layoutvlm/ +
+
+
+
+
+ + ☆ Comparative Performance of Machine Learning Algorithms for Early Genetic + Disorder and Subclass Classification + + +
+ A great deal of effort has been devoted to discovering a particular genetic +disorder, but its classification across a broad spectrum of disorder classes +and types remains elusive. Early diagnosis of genetic disorders enables timely +interventions and improves outcomes. This study implements machine learning +models using basic clinical indicators measurable at birth or infancy to enable +diagnosis in preliminary life stages. Supervised learning algorithms were +implemented on a dataset of 22083 instances with 42 features like family +history, newborn metrics, and basic lab tests. Extensive hyperparameter tuning, +feature engineering, and selection were undertaken. Two multi-class classifiers +were developed: one for predicting disorder classes (mitochondrial, +multifactorial, and single-gene) and one for subtypes (9 disorders). +Performance was evaluated using accuracy, precision, recall, and the F1-score. +The CatBoost classifier achieved the highest accuracy of 77% for predicting +genetic disorder classes. For subtypes, SVM attained a maximum accuracy of 80%. +The study demonstrates the feasibility of using basic clinical data in machine +learning models for early categorization and diagnosis across various genetic +disorders. Applying ML with basic clinical indicators can enable timely +interventions once validated on larger datasets. It is necessary to conduct +further studies to improve model performance on this dataset. + +
+
+ comment: 16 pages, 11 figures, 9 tables +
+
+
+
+
+ + ☆ VideoICL: Confidence-based Iterative In-context Learning for + Out-of-Distribution Video Understanding + + +
+ Recent advancements in video large multimodal models (LMMs) have
+significantly improved their video understanding and reasoning capabilities.
+However, their performance drops on out-of-distribution (OOD) tasks that are
+underrepresented in training data. Traditional methods like fine-tuning on OOD
+datasets are impractical due to high computational costs. While in-context
+learning (ICL) with demonstration examples has shown promising generalization
+performance in language tasks and image-language tasks without fine-tuning,
+applying ICL to video-language tasks faces challenges due to the limited
+context length in Video LMMs, as videos require longer token lengths. To
+address these issues, we propose VideoICL, a novel video in-context learning
+framework for OOD tasks that introduces a similarity-based relevant example
+selection strategy and a confidence-based iterative inference approach. This
+allows us to select the most relevant examples and rank them based on similarity,
+to be used for inference. If the generated response has low confidence, our
+framework selects new examples and performs inference again, iteratively
+refining the results until a high-confidence response is obtained. This
+approach improves OOD video understanding performance by extending effective
+context length without incurring high costs. The experimental results on
+multiple benchmarks demonstrate significant performance gains, especially in
+domain-specific scenarios, laying the groundwork for broader video
+comprehension applications. Code will be released at
+https://github.com/KangsanKim07/VideoICL
+
+</p>
+
+
+
+
+ + ☆ Generalizing Weisfeiler-Lehman Kernels to Subgraphs + + +
+ Subgraph representation learning has been effective in solving various +real-world problems. However, current graph neural networks (GNNs) produce +suboptimal results for subgraph-level tasks due to their inability to capture +complex interactions within and between subgraphs. To provide a more expressive +and efficient alternative, we propose WLKS, a Weisfeiler-Lehman (WL) kernel +generalized for subgraphs by applying the WL algorithm on induced $k$-hop +neighborhoods. We combine kernels across different $k$-hop levels to capture +richer structural information that is not fully encoded in existing models. Our +approach can balance expressiveness and efficiency by eliminating the need for +neighborhood sampling. In experiments on eight real-world and synthetic +benchmarks, WLKS significantly outperforms leading approaches on five datasets +while reducing training time, ranging from 0.01x to 0.25x compared to the +state-of-the-art. + +
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ Anatomically-Grounded Fact Checking of Automated Chest X-ray Reports + + +
+ With the emergence of large-scale vision-language models, realistic radiology
+reports may be generated using only medical images as input guided by simple
+prompts. However, their practical utility has been limited due to the factual
+errors in their description of findings. In this paper, we propose a novel
+model for explainable fact-checking that identifies errors in findings and
+their locations indicated through the reports. Specifically, we analyze the
+types of errors made by automated reporting methods and derive a new synthetic
+dataset of images paired with real and fake descriptions of findings and their
+locations from a ground truth dataset. A new multi-label cross-modal
+contrastive regression network is then trained on this dataset. We evaluate
+the resulting fact-checking model and its utility in correcting reports
+generated by several SOTA automated reporting tools on a variety of benchmark
+datasets with results pointing to over 40% improvement in report quality
+through such error detection and correction.
+
+</p>
+
+
+
+
+ + ☆ Self-Supervised Learning-Based Path Planning and Obstacle Avoidance + Using PPO and B-Splines in Unknown Environments + + +
+ This paper introduces SmartBSP, an advanced self-supervised learning
+framework for real-time path planning and obstacle avoidance in autonomous
+robotics navigating through complex environments. The proposed system
+integrates Proximal Policy Optimization (PPO) with Convolutional Neural
+Networks (CNN) and Actor-Critic architecture to process limited LIDAR inputs
+and compute spatial decision-making probabilities. The robot's perceptual field
+is discretized into a grid format, which the CNN analyzes to produce a spatial
+probability distribution. During the training process a nuanced cost function
+is minimized that accounts for path curvature, endpoint proximity, and obstacle
+avoidance. Simulation results in different scenarios validate the algorithm's
+resilience and adaptability across diverse operational scenarios. Subsequently,
+real-time experiments, employing the Robot Operating System (ROS), were carried
+out to assess the efficacy of the proposed algorithm.
+
+</p>
+
+
+
+
+ + ☆ Keeping Experts in the Loop: Expert-Guided Optimization for Clinical + Data Classification using Large Language Models + + +
+ Since the emergence of Large Language Models (LLMs), the challenge of +effectively leveraging their potential in healthcare has taken center stage. A +critical barrier to using LLMs for extracting insights from unstructured +clinical notes lies in the prompt engineering process. Despite its pivotal role +in determining task performance, a clear framework for prompt optimization +remains absent. Current methods to address this gap take either a manual prompt +refinement approach, where domain experts collaborate with prompt engineers to +create an optimal prompt, which is time-intensive and difficult to scale, or +through employing automatic prompt optimizing approaches, where the value of +the input of domain experts is not fully realized. To address this, we propose +StructEase, a novel framework that bridges the gap between automation and the +input of human expertise in prompt engineering. A core innovation of the +framework is SamplEase, an iterative sampling algorithm that identifies +high-value cases where expert feedback drives significant performance +improvements. This approach minimizes expert intervention, to effectively +enhance classification outcomes. This targeted approach reduces labeling +redundancy, mitigates human error, and enhances classification outcomes. We +evaluated the performance of StructEase using a dataset of de-identified +clinical narratives from the US National Electronic Injury Surveillance System +(NEISS), demonstrating significant gains in classification performance compared +to current methods. Our findings underscore the value of expert integration in +LLM workflows, achieving notable improvements in F1 score while maintaining +minimal expert effort. By combining transparency, flexibility, and scalability, +StructEase sets the foundation for a framework to integrate expert input into +LLM workflows in healthcare and beyond. + +
+
+
+
+
+ + ☆ VISCO: Benchmarking Fine-Grained Critique and Correction Towards + Self-Improvement in Visual Reasoning + + +
+ The ability of large vision-language models (LVLMs) to critique and correct +their reasoning is an essential building block towards their self-improvement. +However, a systematic analysis of such capabilities in LVLMs is still lacking. +We propose VISCO, the first benchmark to extensively analyze the fine-grained +critique and correction capabilities of LVLMs. Compared to existing work that +uses a single scalar value to critique the entire reasoning [4], VISCO features +dense and fine-grained critique, requiring LVLMs to evaluate the correctness of +each step in the chain-of-thought and provide natural language explanations to +support their judgments. Extensive evaluation of 24 LVLMs demonstrates that +human-written critiques significantly enhance the performance after correction, +showcasing the potential of the self-improvement strategy. However, the +model-generated critiques are less helpful and sometimes detrimental to the +performance, suggesting that critique is the crucial bottleneck. We identified +three common patterns in critique failures: failure to critique visual +perception, reluctance to "say no", and exaggerated assumption of error +propagation. To address these issues, we propose an effective LookBack strategy +that revisits the image to verify each piece of information in the initial +reasoning. LookBack significantly improves critique and correction performance +by up to 13.5%. + +
+
+ comment: Project: https://visco-benchmark.github.io/ +
+
+
+
+
+ + ☆ Analyzing the Impact of AI Tools on Student Study Habits and Academic + Performance + + +
+ This study explores the effectiveness of AI tools in enhancing student +learning, specifically in improving study habits, time management, and feedback +mechanisms. The research focuses on how AI tools can support personalized +learning, adaptive test adjustments, and provide real-time classroom analysis. +Student feedback revealed strong support for these features, and the study +found a significant reduction in study hours alongside an increase in GPA, +suggesting positive academic outcomes. Despite these benefits, challenges such +as over-reliance on AI and difficulties in integrating AI with traditional +teaching methods were also identified, emphasizing the need for AI tools to +complement conventional educational strategies rather than replace them. Data +were collected through a survey with a Likert scale and follow-up interviews, +providing both quantitative and qualitative insights. The analysis involved +descriptive statistics to summarize demographic data, AI usage patterns, and +perceived effectiveness, as well as inferential statistics (T-tests, ANOVA) to +examine the impact of demographic factors on AI adoption. Regression analysis +identified predictors of AI adoption, and qualitative responses were +thematically analyzed to understand students' perspectives on the future of AI +in education. This mixed-methods approach provided a comprehensive view of AI's +role in education and highlighted the importance of privacy, transparency, and +continuous refinement of AI features to maximize their educational benefits. + +
+
+
+
+
+ + ☆ Jailbreak Defense in a Narrow Domain: Limitations of Existing Methods + and a New Transcript-Classifier Approach NeurIPS 2024 + + +
+ Defending large language models against jailbreaks so that they never engage +in a broadly-defined set of forbidden behaviors is an open problem. In this +paper, we investigate the difficulty of jailbreak-defense when we only want to +forbid a narrowly-defined set of behaviors. As a case study, we focus on +preventing an LLM from helping a user make a bomb. We find that popular +defenses such as safety training, adversarial training, and input/output +classifiers are unable to fully solve this problem. In pursuit of a better +solution, we develop a transcript-classifier defense which outperforms the +baseline defenses we test. However, our classifier defense still fails in some +circumstances, which highlights the difficulty of jailbreak-defense even in a +narrow domain. + +
+
+ comment: Accepted to the AdvML-Frontiers and SoLaR workshops at NeurIPS 2024 +
+
+
+
+
+ + ☆ CausalMob: Causal Human Mobility Prediction with LLMs-derived Human + Intentions toward Public Events KDD 2025 + + +
+ Large-scale human mobility exhibits spatial and temporal patterns that can
+assist policymakers in decision making. Although traditional prediction models
+attempt to capture these patterns, they are often disrupted by non-periodic public
+events, such as disasters and occasional celebrations. Since regular human
+mobility patterns are heavily affected by these events, estimating their causal
+effects is critical to accurate mobility predictions. Although news articles
+provide unique perspectives on these events in an unstructured format,
+processing is a challenge. In this study, we propose a causality-augmented
+prediction model, called CausalMob, to analyze the causal effects of
+public events. We first utilize large language models (LLMs) to extract human
+intentions from news articles and transform them into features that act as
+causal treatments. Next, the model learns representations of spatio-temporal
+regional covariates from multiple data sources to serve as confounders for
+causal inference. Finally, we present a causal effect estimation framework to
+ensure event features remain independent of confounders during prediction.
+Based on large-scale real-world data, the experimental results show that the
+proposed model excels in human mobility prediction, outperforming
+state-of-the-art models.
+
+</p>
+
+ comment: Accepted by KDD 2025 +
+
+
+
+
+ + ☆ Failure Probability Estimation for Black-Box Autonomous Systems using + State-Dependent Importance Sampling Proposals + + +
+ Estimating the probability of failure is a critical step in developing +safety-critical autonomous systems. Direct estimation methods such as Monte +Carlo sampling are often impractical due to the rarity of failures in these +systems. Existing importance sampling approaches do not scale to sequential +decision-making systems with large state spaces and long horizons. We propose +an adaptive importance sampling algorithm to address these limitations. Our +method minimizes the forward Kullback-Leibler divergence between a +state-dependent proposal distribution and a relaxed form of the optimal +importance sampling distribution. Our method uses Markov score ascent methods +to estimate this objective. We evaluate our approach on four sequential systems +and show that it provides more accurate failure probability estimates than +baseline Monte Carlo and importance sampling techniques. This work is open +sourced. + +
+
+ comment: Submitted to L4DC 2025 +
+
+
+
+
+ + ☆ Revisiting the Initial Steps in Adaptive Gradient Descent Optimization NeurIPS 2024 + + +
+ Adaptive gradient optimization methods, such as Adam, are prevalent in +training deep neural networks across diverse machine learning tasks due to +their ability to achieve faster convergence. However, these methods often +suffer from suboptimal generalization compared to stochastic gradient descent +(SGD) and exhibit instability, particularly when training Transformer models. +In this work, we show the standard initialization of the second-order moment +estimation ($v_0 =0$) as a significant factor contributing to these +limitations. We introduce simple yet effective solutions: initializing the +second-order moment estimation with non-zero values, using either data-driven +or random initialization strategies. Empirical evaluations demonstrate that our +approach not only stabilizes convergence but also enhances the final +performance of adaptive gradient optimizers. Furthermore, by adopting the +proposed initialization strategies, Adam achieves performance comparable to +many recently proposed variants of adaptive gradient optimization methods, +highlighting the practical impact of this straightforward modification. + +
+
+ comment: OPT workshop at NeurIPS 2024 +
+
+
+
+
+ + ☆ Mining Tweets to Predict Future Bitcoin Price + + +
+ Bitcoin has increased investment interests in people during the last decade. +We have seen an increase in the number of posts on social media platforms about +cryptocurrency, especially Bitcoin. This project focuses on analyzing user +tweet data in combination with Bitcoin price data to see the relevance between +price fluctuations and the conversation between millions of people on Twitter. +This study also exploits this relationship between user tweets and bitcoin +prices to predict the future bitcoin price. We are utilizing novel techniques +and methods to analyze the data and make price predictions. + +
+
+
+
+
+ + ☆ Personalized Multimodal Large Language Models: A Survey + + +
+ Multimodal Large Language Models (MLLMs) have become increasingly important +due to their state-of-the-art performance and ability to integrate multiple +data modalities, such as text, images, and audio, to perform complex tasks with +high accuracy. This paper presents a comprehensive survey on personalized +multimodal large language models, focusing on their architecture, training +methods, and applications. We propose an intuitive taxonomy for categorizing +the techniques used to personalize MLLMs to individual users, and discuss the +techniques accordingly. Furthermore, we discuss how such techniques can be +combined or adapted when appropriate, highlighting their advantages and +underlying rationale. We also provide a succinct summary of personalization +tasks investigated in existing research, along with the evaluation metrics +commonly used. Additionally, we summarize the datasets that are useful for +benchmarking personalized MLLMs. Finally, we outline critical open challenges. +This survey aims to serve as a valuable resource for researchers and +practitioners seeking to understand and advance the development of personalized +multimodal large language models. + +
+
+
+
+
+ + ☆ Graph Learning for Planning: The Story Thus Far and Open Challenges + + +
+ Graph learning is naturally well suited for use in planning due to its +ability to exploit relational structures exhibited in planning domains and to +take as input planning instances with arbitrary number of objects. In this +paper, we study the usage of graph learning for planning thus far by studying +the theoretical and empirical effects on learning and planning performance of +(1) graph representations of planning tasks, (2) graph learning architectures, +and (3) optimisation formulations for learning. Our studies culminate in the +GOOSE framework which learns domain knowledge from small planning tasks in +order to scale up to much larger planning tasks. In this paper, we also +highlight and propose the 5 open challenges in the general Learning for +Planning field that we believe need to be addressed for advancing the +state-of-the-art. + +
+
+
+
+
+ + ☆ A privacy-preserving distributed credible evidence fusion algorithm for + collective decision-making + + +
+ The theory of evidence reasoning has been applied to collective +decision-making in recent years. However, existing distributed evidence fusion +methods lead to participants' preference leakage and fusion failures as they +directly exchange raw evidence and do not assess evidence credibility like +centralized credible evidence fusion (CCEF) does. To do so, a +privacy-preserving distributed credible evidence fusion method with three-level +consensus (PCEF) is proposed in this paper. In evidence difference measure +(EDM) neighbor consensus, an evidence-free equivalent expression of EDM among +neighbored agents is derived with the shared dot product protocol for pignistic +probability and the identical judgment of two events with maximal subjective +probabilities, so that evidence privacy is guaranteed due to such irreversible +evidence transformation. In EDM network consensus, the non-neighbored EDMs are +inferred and neighbored EDMs reach uniformity via interaction between linear +average consensus (LAC) and low-rank matrix completion with rank adaptation to +guarantee EDM consensus convergence and no solution of inferring raw evidence +in numerical iteration style. In fusion network consensus, a privacy-preserving +LAC with a self-cancelling differential privacy term is proposed, where each +agent adds its randomness to the sharing content and step-by-step cancels such +randomness in consensus iterations. Besides, the sufficient condition of the +convergence to the CCEF is explored, and it is proven that raw evidence is +impossibly inferred in such an iterative consensus. The simulations show that +PCEF is close to CCEF both in credibility and fusion results and obtains higher +decision accuracy while being less time-consuming than existing methods. + +
+
+
+
+
+ + ☆ Benchmarking symbolic regression constant optimization schemes + + +
+ Symbolic regression is a machine learning technique, and it has seen many +advancements in recent years, especially in genetic programming approaches +(GPSR). Furthermore, it has been known for many years that constant +optimization of parameters, during the evolutionary search, greatly increases +GPSR performance. However, different authors approach such tasks differently and +no consensus exists regarding which methods perform best. In this work, we +evaluate eight different parameter optimization methods, applied during +evolutionary search, over ten known benchmark problems, in two different +scenarios. We also propose using an under-explored metric called Tree Edit +Distance (TED), aiming to identify symbolic accuracy. In conjunction with +classical error measures, we develop a combined analysis of model performance +in symbolic regression. We then show that different constant optimization +methods perform better in certain scenarios and that there is no overall best +choice for every problem. Finally, we discuss how common metric decisions may +be biased and appear to generate better models in comparison. + +
+
+ comment: 9 pages, 10 figures, 2 tables +
+
+
+
+
+ + ☆ Optimizing Latent Goal by Learning from Trajectory Preference + + +
+ A growing body of work has emerged focusing on instruction-following policies +for open-world agents, aiming to better align the agent's behavior with human +intentions. However, the performance of these policies is highly susceptible to +the initial prompt, which leads to extra efforts in selecting the best +instructions. We propose a framework named Preference Goal Tuning (PGT). PGT +allows an instruction following policy to interact with the environment to +collect several trajectories, which will be categorized into positive and +negative samples based on preference. Then we use preference learning to +fine-tune the initial goal latent representation with the categorized +trajectories while keeping the policy backbone frozen. The experiment result +shows that with minimal data and training, PGT achieves an average relative +improvement of 72.0% and 81.6% over 17 tasks in 2 different foundation policies +respectively, and outperforms the best human-selected instructions. Moreover, +PGT surpasses full fine-tuning in the out-of-distribution (OOD) task-execution +environments by 13.4%, indicating that our approach retains strong +generalization capabilities. Since our approach stores a single latent +representation for each task independently, it can be viewed as an efficient +method for continual learning, without the risk of catastrophic forgetting or +task interference. In short, PGT enhances the performance of agents across +nearly all tasks in the Minecraft Skillforge benchmark and demonstrates +robustness to the execution environment. + +
+
+
+
+
+ + ☆ OmniCreator: Self-Supervised Unified Generation with Universal Editing + + +
+ We introduce OmniCreator, a novel framework that can conduct text-prompted +unified (image+video) generation as well as editing all in one place. +OmniCreator acquires generative and universal editing capabilities in a +self-supervised manner, taking original text-video pairs as conditions while +utilizing the same video as a denoising target to learn the semantic +correspondence between video and text. During inference, when presented with a +text prompt and a video, OmniCreator is capable of generating a target that is +faithful to both, achieving a universal editing effect that is unconstrained as +opposed to existing editing work that primarily focuses on certain editing +types or relies on additional controls (e.g., structural conditions, attention +features, or DDIM inversion). On the other hand, when presented with a text +prompt only, OmniCreator becomes generative, producing high-quality video as a +result of the semantic correspondence learned. Importantly, we found that the +same capabilities extend to images as is, making OmniCreator a truly unified +framework. Further, due to the lack of existing generative video editing +benchmarks, we introduce the OmniBench-99 dataset, designed to evaluate the +performance of generative video editing models comprehensively. Extensive +experiments demonstrate that OmniCreator exhibits substantial superiority over +all other models. + +
+
+ comment: Project: https://haroldchen19.github.io/OmniCreator-Page/ +
+
+
+
+
+ + ☆ ShapeWords: Guiding Text-to-Image Synthesis with 3D Shape-Aware Prompts + + +
+ We introduce ShapeWords, an approach for synthesizing images based on 3D +shape guidance and text prompts. ShapeWords incorporates target 3D shape +information within specialized tokens embedded together with the input text, +effectively blending 3D shape awareness with textual context to guide the image +synthesis process. Unlike conventional shape guidance methods that rely on +depth maps restricted to fixed viewpoints and often overlook full 3D structure +or textual context, ShapeWords generates diverse yet consistent images that +reflect both the target shape's geometry and the textual description. +Experimental results show that ShapeWords produces images that are more +text-compliant, aesthetically plausible, while also maintaining 3D shape +awareness. + +
+
+ comment: Project webpage: https://lodurality.github.io/shapewords/ +
+
+
+
+
+ + ☆ Does Few-Shot Learning Help LLM Performance in Code Synthesis? + + +
+ Large language models (LLMs) have made significant strides at code generation +through improved model design, training, and chain-of-thought. However, +prompt-level optimizations remain an important yet under-explored aspect of +LLMs for coding. This work focuses on the few-shot examples present in most +code generation prompts, offering a systematic study on whether few-shot +examples improve LLM's coding capabilities, which few-shot examples have the +largest impact, and how to select impactful examples. Our work offers 2 +approaches for selecting few-shot examples, a model-free method, +CODEEXEMPLAR-FREE, and a model-based method, CODEEXEMPLAR-BASED. The 2 methods +offer a trade-off between improved performance and reliance on training data +and interpretability. Both methods significantly improve CodeLlama's coding +ability across the popular HumanEval+ coding benchmark. In summary, our work +provides valuable insights into how to pick few-shot examples in code +generation prompts to improve LLM code generation capabilities. + +
+
+
+
+
+ + ☆ Enhancing Trust in Large Language Models with Uncertainty-Aware + Fine-Tuning + + +
+ Large language models (LLMs) have revolutionized the field of natural +language processing with their impressive reasoning and question-answering +capabilities. However, these models are sometimes prone to generating +credible-sounding but incorrect information, a phenomenon known as LLM +hallucinations. Reliable uncertainty estimation in LLMs is essential for +fostering trust in their generated responses and serves as a critical tool for +the detection and prevention of erroneous or hallucinated outputs. To achieve +reliable and well-calibrated uncertainty quantification in open-ended and +free-form natural language generation, we propose an uncertainty-aware +fine-tuning approach for LLMs. This approach enhances the model's ability to +provide reliable uncertainty estimates without compromising accuracy, thereby +guiding them to produce more trustworthy responses. We introduce a novel +uncertainty-aware causal language modeling loss function, grounded in the +principles of decision theory. Through rigorous evaluation on multiple +free-form question-answering datasets and models, we demonstrate that our +uncertainty-aware fine-tuning approach yields better calibrated uncertainty +estimates in natural language generation tasks than fine-tuning with the +standard causal language modeling loss. Furthermore, the experimental results +show that the proposed method significantly improves the model's ability to +detect hallucinations and identify out-of-domain prompts. + +
+
+
+
+
+ + ☆ MLD-EA: Check and Complete Narrative Coherence by Introducing Emotions + and Actions + + +
+ Narrative understanding and story generation are critical challenges in +natural language processing (NLP), with much of the existing research focused +on summarization and question-answering tasks. While previous studies have +explored predicting plot endings and generating extended narratives, they often +neglect the logical coherence within stories, leaving a significant gap in the +field. To address this, we introduce the Missing Logic Detector by Emotion and +Action (MLD-EA) model, which leverages large language models (LLMs) to identify +narrative gaps and generate coherent sentences that integrate seamlessly with +the story's emotional and logical flow. The experimental results demonstrate +that the MLD-EA model enhances narrative understanding and story generation, +highlighting LLMs' potential as effective logic checkers in story writing with +logical coherence and emotional consistency. This work fills a gap in NLP +research and advances broader goals of creating more sophisticated and reliable +story-generation systems. + +
+
+
+
+
+ + ☆ Removing Spurious Correlation from Neural Network Interpretations + + +
+ The existing algorithms for identification of neurons responsible for +undesired and harmful behaviors do not consider the effects of confounders such +as topic of the conversation. In this work, we show that confounders can create +spurious correlations and propose a new causal mediation approach that controls +the impact of the topic. In experiments with two large language models, we +study the localization hypothesis and show that adjusting for the effect of +conversation topic, toxicity becomes less localized. + +
+
+
+
+
+ + ☆ Deep-Learning Based Docking Methods: Fair Comparisons to Conventional + Docking Workflows + + +
+ The diffusion learning method, DiffDock, for docking small-molecule ligands +into protein binding sites was recently introduced. Results included +comparisons to more conventional docking approaches, with DiffDock showing +superior performance. Here, we employ a fully automatic workflow using the +Surflex-Dock methods to generate a fair baseline for conventional docking +approaches. Results were generated for the common and expected situation where +a binding site location is known and also for the condition of an unknown +binding site. For the known binding site condition, Surflex-Dock success rates +at 2.0 Angstroms RMSD far exceeded those for DiffDock (Top-1/Top-5 success +rates, respectively, were 68/81% compared with 45/51%). Glide performed with +similar success rates (67/73%) to Surflex-Dock for the known binding site +condition, and results for AutoDock Vina and Gnina followed this pattern. For +the unknown binding site condition, using an automated method to identify +multiple binding pockets, Surflex-Dock success rates again exceeded those of +DiffDock, but by a somewhat lesser margin. DiffDock made use of roughly 17,000 +co-crystal structures for learning (98% of PDBBind version 2020, pre-2019 +structures) for a training set in order to predict on 363 test cases (2% of +PDBBind 2020) from 2019 forward. DiffDock's performance was inextricably linked +with the presence of near-neighbor cases of close to identical protein-ligand +complexes in the training set for over half of the test set cases. DiffDock +exhibited a 40 percentage point difference on near-neighbor cases (two-thirds +of all test cases) compared with cases with no near-neighbor training case. +DiffDock has apparently encoded a type of table-lookup during its learning +process, rendering meaningful applications beyond its reach. Further, it does +not perform even close to competitively with a competently run modern docking +workflow. + +
+
+ comment: 19 pages including references and appendices, 7 figures +
+
+
+
+
+ + ☆ Modeling and Discovering Direct Causes for Predictive Models + + +
+ We introduce a causal modeling framework that captures the input-output +behavior of predictive models (e.g., machine learning models) by representing +it using causal graphs. The framework enables us to define and identify +features that directly cause the predictions, which has broad implications for +data collection and model evaluation. We show two assumptions under which the +direct causes can be discovered from data, one of which further simplifies the +discovery process. In addition to providing sound and complete algorithms, we +propose an optimization technique based on an independence rule that can be +integrated with the algorithms to speed up the discovery process both +theoretically and empirically. + +
+
+
+
+
+ + ☆ Out-of-Distribution Detection for Neurosymbolic Autonomous Cyber Agents + + +
+ Autonomous agents for cyber applications take advantage of modern defense +techniques by adopting intelligent agents with conventional and +learning-enabled components. These intelligent agents are trained via +reinforcement learning (RL) algorithms, and can learn, adapt to, reason about +and deploy security rules to defend networked computer systems while +maintaining critical operational workflows. However, the knowledge available +during training about the state of the operational network and its environment +may be limited. The agents should be trustworthy so that they can reliably +detect situations they cannot handle, and hand them over to cyber experts. In +this work, we develop an out-of-distribution (OOD) Monitoring algorithm that +uses a Probabilistic Neural Network (PNN) to detect anomalous or OOD situations +of RL-based agents with discrete states and discrete actions. To demonstrate +the effectiveness of the proposed approach, we integrate the OOD monitoring +algorithm with a neurosymbolic autonomous cyber agent that uses behavior trees +with learning-enabled components. We evaluate the proposed approach in a +simulated cyber environment under different adversarial strategies. +Experimental results over a large number of episodes illustrate the overall +efficiency of our proposed approach. + +
+
+ comment: 9 pages, 10 figures, IEEE International Conference on AI in + Cybersecurity (ICAIC), 2025 +
+
+
+
+
+ + ☆ Constrained Identifiability of Causal Effects + + +
+ We study the identification of causal effects in the presence of different +types of constraints (e.g., logical constraints) in addition to the causal +graph. These constraints impose restrictions on the models (parameterizations) +induced by the causal graph, reducing the set of models considered by the +identifiability problem. We formalize the notion of constrained +identifiability, which takes a set of constraints as another input to the +classical definition of identifiability. We then introduce a framework for +testing constrained identifiability by employing tractable Arithmetic Circuits +(ACs), which enables us to accommodate constraints systematically. We show that +this AC-based approach is at least as complete as existing algorithms (e.g., +do-calculus) for testing classical identifiability, which only assumes the +constraint of strict positivity. We use examples to demonstrate the +effectiveness of this AC-based approach by showing that unidentifiable causal +effects may become identifiable under different types of constraints. + +
+
+
+
+
+ + ♻ ☆ Towards Neuro-Symbolic Video Understanding ECCV + + +
+ The unprecedented surge in video data production in recent years necessitates +efficient tools to extract meaningful frames from videos for downstream tasks. +Long-term temporal reasoning is a key desideratum for frame retrieval systems. +While state-of-the-art foundation models, like VideoLLaMA and ViCLIP, are +proficient in short-term semantic understanding, they surprisingly fail at +long-term reasoning across frames. A key reason for this failure is that they +intertwine per-frame perception and temporal reasoning into a single deep +network. Hence, decoupling but co-designing semantic understanding and temporal +reasoning is essential for efficient scene identification. We propose a system +that leverages vision-language models for semantic understanding of individual +frames but effectively reasons about the long-term evolution of events using +state machines and temporal logic (TL) formulae that inherently capture memory. +Our TL-based reasoning improves the F1 score of complex event identification by +9-15% compared to benchmarks that use GPT4 for reasoning on state-of-the-art +self-driving datasets such as Waymo and NuScenes. + +
+
+ comment: Accepted by The European Conference on Computer Vision (ECCV) 2024 +
+
+
+
+
+ + ♻ ☆ Neuro-Symbolic Evaluation of Text-to-Video Models using Formal + Verification + + +
+ Recent advancements in text-to-video models such as Sora, Gen-3, MovieGen, +and CogVideoX are pushing the boundaries of synthetic video generation, with +adoption seen in fields like robotics, autonomous driving, and entertainment. +As these models become prevalent, various metrics and benchmarks have emerged +to evaluate the quality of the generated videos. However, these metrics +emphasize visual quality and smoothness, neglecting temporal fidelity and +text-to-video alignment, which are crucial for safety-critical applications. To +address this gap, we introduce NeuS-V, a novel synthetic video evaluation +metric that rigorously assesses text-to-video alignment using neuro-symbolic +formal verification techniques. Our approach first converts the prompt into a +formally defined Temporal Logic (TL) specification and translates the generated +video into an automaton representation. Then, it evaluates the text-to-video +alignment by formally checking the video automaton against the TL +specification. Furthermore, we present a dataset of temporally extended prompts +to evaluate state-of-the-art video generation models against our benchmark. We +find that NeuS-V demonstrates a higher correlation by over 5x with human +evaluations when compared to existing metrics. Our evaluation further reveals +that current video generation models perform poorly on these temporally complex +prompts, highlighting the need for future work in improving text-to-video +generation capabilities. + +
+
+
+
+
+ + ♻ ☆ From Isolated Conversations to Hierarchical Schemas: Dynamic Tree Memory + Representation for LLMs + + +
+ Recent advancements in large language models have significantly improved +their context windows, yet challenges in effective long-term memory management +remain. We introduce MemTree, an algorithm that leverages a dynamic, +tree-structured memory representation to optimize the organization, retrieval, +and integration of information, akin to human cognitive schemas. MemTree +organizes memory hierarchically, with each node encapsulating aggregated +textual content, corresponding semantic embeddings, and varying abstraction +levels across the tree's depths. Our algorithm dynamically adapts this memory +structure by computing and comparing semantic embeddings of new and existing +information to enrich the model's context-awareness. This approach allows +MemTree to handle complex reasoning and extended interactions more effectively +than traditional memory augmentation methods, which often rely on flat lookup +tables. Evaluations on benchmarks for multi-turn dialogue understanding and +document question answering show that MemTree significantly enhances +performance in scenarios that demand structured memory management. + +
+
+
+
+
+ + ♻ ☆ Accelerating Proximal Policy Optimization Learning Using Task Prediction + for Solving Environments with Delayed Rewards + + +
+ In this paper, we tackle the challenging problem of delayed rewards in +reinforcement learning (RL). While Proximal Policy Optimization (PPO) has +emerged as a leading Policy Gradient method, its performance can degrade under +delayed rewards. We introduce two key enhancements to PPO: a hybrid policy +architecture that combines an offline policy (trained on expert demonstrations) +with an online PPO policy, and a reward shaping mechanism using Time Window +Temporal Logic (TWTL). The hybrid architecture leverages offline data +throughout training while maintaining PPO's theoretical guarantees. Building on +the monotonic improvement framework of Trust Region Policy Optimization (TRPO), +we prove that our approach ensures improvement over both the offline policy and +previous iterations, with a bounded performance gap of +$(2\varsigma\gamma\alpha^2)/(1-\gamma)^2$, where $\alpha$ is the mixing +parameter, $\gamma$ is the discount factor, and $\varsigma$ bounds the expected +advantage. Additionally, we prove that our TWTL-based reward shaping preserves +the optimal policy of the original problem. TWTL enables formal translation of +temporal objectives into immediate feedback signals that guide learning. We +demonstrate the effectiveness of our approach through extensive experiments on +an inverted pendulum and a lunar lander environments, showing improvements in +both learning speed and final performance compared to standard PPO and +offline-only approaches. + +
+
+
+
+
+ + ♻ ☆ Leveraging LLM for Automated Ontology Extraction and Knowledge Graph + Generation + + +
+ Extracting relevant and structured knowledge from large, complex technical +documents within the Reliability and Maintainability (RAM) domain is +labor-intensive and prone to errors. Our work addresses this challenge by +presenting OntoKGen, a genuine pipeline for ontology extraction and Knowledge +Graph (KG) generation. OntoKGen leverages Large Language Models (LLMs) through +an interactive user interface guided by our adaptive iterative Chain of Thought +(CoT) algorithm to ensure that the ontology extraction process and, thus, KG +generation align with user-specific requirements. Although KG generation +follows a clear, structured path based on the confirmed ontology, there is no +universally correct ontology as it is inherently based on the user's +preferences. OntoKGen recommends an ontology grounded in best practices, +minimizing user effort and providing valuable insights that may have been +overlooked, all while giving the user complete control over the final ontology. +Having generated the KG based on the confirmed ontology, OntoKGen enables +seamless integration into schemeless, non-relational databases like Neo4j. This +integration allows for flexible storage and retrieval of knowledge from +diverse, unstructured sources, facilitating advanced querying, analysis, and +decision-making. Moreover, the generated KG serves as a robust foundation for +future integration into Retrieval Augmented Generation (RAG) systems, offering +enhanced capabilities for developing domain-specific intelligent applications. + +
+
+
+
+
+ + ♻ ☆ BPP-Search: Enhancing Tree of Thought Reasoning for Mathematical + Modeling Problem Solving + + +
+ LLMs exhibit advanced reasoning capabilities, offering the potential to +transform natural language questions into mathematical models. However, +existing open-source datasets in operations research domain lack detailed +annotations of the modeling process, such as variable definitions, focusing +solely on objective values, which hinders reinforcement learning applications. +To address this, we release the StructuredOR dataset, annotated with +comprehensive labels that capture the complete mathematical modeling process. +We further propose BPP-Search, an algorithm that integrates reinforcement +learning into a tree-of-thought structure using Beam search, a Process reward +model, and a pairwise Preference algorithm. This approach enables efficient +exploration of tree structures, avoiding exhaustive search while improving +accuracy. Extensive experiments on StructuredOR, NL4OPT, and MAMO-ComplexLP +datasets show that BPP-Search significantly outperforms state-of-the-art +methods. In tree-based reasoning, BPP-Search excels in accuracy and efficiency, +enabling faster retrieval of correct solutions. + +
+
+
+
+
+ + ♻ ☆ Filtered Direct Preference Optimization EMNLP 2024 + + +
+ Reinforcement learning from human feedback (RLHF) plays a crucial role in +aligning language models with human preferences. While the significance of +dataset quality is generally recognized, explicit investigations into its +impact within the RLHF framework, to our knowledge, have been limited. This +paper addresses the issue of text quality within the preference dataset by +focusing on direct preference optimization (DPO), an increasingly adopted +reward-model-free RLHF method. We confirm that text quality significantly +influences the performance of models optimized with DPO more than those +optimized with reward-model-based RLHF. Building on this new insight, we +propose an extension of DPO, termed filtered direct preference optimization +(fDPO). fDPO uses a trained reward model to monitor the quality of texts within +the preference dataset during DPO training. Samples of lower quality are +discarded based on comparisons with texts generated by the model being +optimized, resulting in a more accurate dataset. Experimental results +demonstrate that fDPO enhances the final model performance. Our code is +available at https://github.com/CyberAgentAILab/filtered-dpo. + +
+
+ comment: EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ Closed-Form Interpretation of Neural Network Latent Spaces with Symbolic + Gradients + + +
+ It has been demonstrated in many scientific fields that artificial neural +networks like autoencoders or Siamese networks encode meaningful concepts in +their latent spaces. However, there does not exist a comprehensive framework +for retrieving this information in a human-readable form without prior +knowledge. In order to extract these concepts, we introduce a framework for +finding closed-form interpretations of neurons in latent spaces of artificial +neural networks. The interpretation framework is based on embedding trained +neural networks into an equivalence class of functions that encode the same +concept. We interpret these neural networks by finding an intersection between +the equivalence class and human-readable equations defined by a symbolic search +space. The approach is demonstrated by retrieving invariants of matrices and +conserved quantities of dynamical systems from latent spaces of Siamese neural +networks. + +
+
+
+
+
+ + ♻ ☆ FullStack Bench: Evaluating LLMs as Full Stack Coders + + +
+ As the capabilities of code large language models (LLMs) continue to expand, +their applications across diverse code intelligence domains are rapidly +increasing. However, most existing datasets only evaluate limited application +domains. To address this gap, we have developed a comprehensive code evaluation +dataset FullStack Bench focusing on full-stack programming, which encompasses a +wide range of application domains (e.g., basic programming, data analysis, +software engineering, mathematics, and machine learning). Besides, to assess +multilingual programming capabilities, in FullStack Bench, we design real-world +instructions and corresponding unit test cases from 16 widely-used programming +languages to reflect real-world usage scenarios rather than simple +translations. Moreover, we also release an effective code sandbox execution +tool (i.e., SandboxFusion) supporting various programming languages and +packages to evaluate the performance of our FullStack Bench efficiently. +Comprehensive experimental results on our FullStack Bench demonstrate the +necessity and effectiveness of our FullStack Bench and SandboxFusion. + +
+
+ comment: 26 pages +
+
+
+
+
+ + ♻ ☆ Burning RED: Unlocking Subtask-Driven Reinforcement Learning and + Risk-Awareness in Average-Reward Markov Decision Processes + + +
+ Average-reward Markov decision processes (MDPs) provide a foundational +framework for sequential decision-making under uncertainty. However, +average-reward MDPs have remained largely unexplored in reinforcement learning +(RL) settings, with the majority of RL-based efforts having been allocated to +episodic and discounted MDPs. In this work, we study a unique structural +property of average-reward MDPs and utilize it to introduce Reward-Extended +Differential (or RED) reinforcement learning: a novel RL framework that can be +used to effectively and efficiently solve various subtasks simultaneously in +the average-reward setting. We introduce a family of RED learning algorithms +for prediction and control, including proven-convergent algorithms for the +tabular case. We then showcase the power of these algorithms by demonstrating +how they can be used to learn a policy that optimizes, for the first time, the +well-known conditional value-at-risk (CVaR) risk measure in a fully-online +manner, without the use of an explicit bi-level optimization scheme or an +augmented state-space. + +
+
+
+
+
+ + ♻ ☆ Introduction to Reinforcement Learning + + +
+ Reinforcement Learning (RL), a subfield of Artificial Intelligence (AI),
+focuses on training agents to make decisions by interacting with their
+environment to maximize cumulative rewards. This paper provides an overview of
+RL, covering its core concepts, methodologies, and resources for further
+learning. It offers a thorough explanation of fundamental components such as
+states, actions, policies, and reward signals, ensuring readers develop a solid
+foundational understanding. Additionally, the paper presents a variety of RL
+algorithms, categorized based on key factors such as model-free,
+model-based, value-based, and policy-based approaches. Resources for
+learning and implementing RL, such as books, courses, and online communities
+are also provided. By offering a clear, structured introduction, this paper
+aims to simplify the complexities of RL for beginners, providing a
+straightforward pathway to understanding.
+
+
+ comment: 19 pages +
+
+
+
+
+ + ♻ ☆ Dynamic-LLaVA: Efficient Multimodal Large Language Models via Dynamic + Vision-language Context Sparsification + + +
+ Multimodal Large Language Models (MLLMs) have achieved remarkable success in +vision understanding, reasoning, and interaction. However, the inference +computation and memory increase progressively with the generation of output +tokens during decoding, directly affecting the efficacy of MLLMs. Existing +methods attempt to reduce the vision context redundancy to achieve efficient +MLLMs. Unfortunately, the efficiency benefits of the vision context reduction +in the prefill stage gradually diminish during the decoding stage. To address +this problem, we proposed a dynamic vision-language context sparsification +framework Dynamic-LLaVA, which dynamically reduces the redundancy of vision +context in the prefill stage and decreases the memory and computation overhead +of the generated language context during decoding. Dynamic-LLaVA designs a +tailored sparsification inference scheme for different inference modes, i.e., +prefill, decoding with and without KV cache, to achieve efficient inference of +MLLMs. In practice, Dynamic-LLaVA can reduce computation consumption by +$\sim$75\% in the prefill stage. Meanwhile, throughout the entire generation +process of MLLMs, Dynamic-LLaVA reduces the $\sim$50\% computation consumption +under decoding without KV cache, while saving $\sim$50\% GPU memory overhead +when decoding with KV cache, due to the vision-language context sparsification. +Extensive experiments also demonstrate that Dynamic-LLaVA achieves efficient +inference for MLLMs with negligible understanding and generation ability +degradation or even performance gains compared to the full-context inference +baselines. Code is available at https://github.com/Osilly/dynamic_llava . + +
+
+ comment: Code is available at https://github.com/Osilly/dynamic_llava +
+
+
+
+
+ + ♻ ☆ Understanding complex crowd dynamics with generative neural simulators + + +
+ Understanding the dynamics of pedestrian crowds is an outstanding challenge +crucial for designing efficient urban infrastructure and ensuring safe crowd +management. To this end, both small-scale laboratory and large-scale real-world +measurements have been used. However, these approaches respectively lack +statistical resolution and parametric controllability, both essential to +discovering physical relationships underlying the complex stochastic dynamics +of crowds. Here, we establish an investigation paradigm that offers +laboratory-like controllability, while ensuring the statistical resolution of +large-scale real-world datasets. Using our data-driven Neural Crowd Simulator +(NeCS), which we train on large-scale data and validate against key statistical +features of crowd dynamics, we show that we can perform effective surrogate +crowd dynamics experiments without training on specific scenarios. We not only +reproduce known experimental results on pairwise avoidance, but also uncover +the vision-guided and topological nature of N-body interactions. These findings +show how virtual experiments based on neural simulation enable data-driven +scientific discovery. + +
+
+ comment: 26 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Paired Autoencoders for Likelihood-free Estimation in Inverse Problems + + +
+ We consider the solution of nonlinear inverse problems where the forward +problem is a discretization of a partial differential equation. Such problems +are notoriously difficult to solve in practice and require minimizing a +combination of a data-fit term and a regularization term. The main +computational bottleneck of typical algorithms is the direct estimation of the +data misfit. Therefore, likelihood-free approaches have become appealing +alternatives. Nonetheless, difficulties in generalization and limitations in +accuracy have hindered their broader utility and applicability. In this work, +we use a paired autoencoder framework as a likelihood-free estimator for +inverse problems. We show that the use of such an architecture allows us to +construct a solution efficiently and to overcome some known open problems when +using likelihood-free estimators. In particular, our framework can assess the +quality of the solution and improve on it if needed. We demonstrate the +viability of our approach using examples from full waveform inversion and +inverse electromagnetic imaging. + +
+
+ comment: 18 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ EVOR: Evolving Retrieval for Code Generation + + +
+ Recently the retrieval-augmented generation (RAG) has been successfully +applied in code generation. However, existing pipelines for retrieval-augmented +code generation (RACG) employ static knowledge bases with a single source, +limiting the adaptation capabilities of Large Language Models (LLMs) to domains +they have insufficient knowledge of. In this work, we develop a novel pipeline, +EVOR, that employs the synchronous evolution of both queries and diverse +knowledge bases. On two realistic settings where the external knowledge is +required to solve code generation tasks, we compile four new datasets +associated with frequently updated libraries and long-tail programming +languages, named EVOR-BENCH. Extensive experiments demonstrate that EVOR +achieves two to four times of execution accuracy compared to other methods such +as Reflexion (Shinn et al., 2024), DocPrompting (Zhou et al., 2023), etc. We +demonstrate that EVOR is flexible and can be easily combined with them to +achieve further improvement. Further analysis reveals that EVOR benefits from +the synchronous evolution of queries and documents and the diverse information +sources in the knowledge base. We hope that our studies will inspire more +insights into the design of advanced RACG pipelines in future research. Our +model, code, and data are available at https://arks-codegen.github.io. + +
+
+ comment: Retrieval-augmented code generation +
+
+
+
+
+ + ♻ ☆ Predictive Models in Sequential Recommendations: Bridging Performance + Laws with Data Quality Insights + + +
+ Sequential Recommendation (SR) plays a critical role in predicting users' +sequential preferences. Despite its growing prominence in various industries, +the increasing scale of SR models incurs substantial computational costs and +unpredictability, challenging developers to manage resources efficiently. Under +this predicament, Scaling Laws have achieved significant success by examining +the loss as models scale up. However, there remains a disparity between loss +and model performance, which is of greater concern in practical applications. +Moreover, as data continues to expand, it incorporates repetitive and +inefficient data. In response, we introduce the Performance Law for SR models, +which aims to theoretically investigate and model the relationship between +model performance and data quality. Specifically, we first fit the HR and NDCG +metrics to transformer-based SR models. Subsequently, we propose Approximate +Entropy (ApEn) to assess data quality, presenting a more nuanced approach +compared to traditional data quantity metrics. Our method enables accurate +predictions across various dataset scales and model sizes, demonstrating a +strong correlation in large SR models and offering insights into achieving +optimal performance for any given model configuration. + +
+
+ comment: 12 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Exploring the hierarchical structure of human plans via program + generation + + +
+ Human behavior is often assumed to be hierarchically structured, made up of +abstract actions that can be decomposed into concrete actions. However, +behavior is typically measured as a sequence of actions, which makes it +difficult to infer its hierarchical structure. In this paper, we explore how +people form hierarchically structured plans, using an experimental paradigm +with observable hierarchical representations: participants create programs that +produce sequences of actions in a language with explicit hierarchical +structure. This task lets us test two well-established principles of human +behavior: utility maximization (i.e. using fewer actions) and minimum +description length (MDL; i.e. having a shorter program). We find that humans +are sensitive to both metrics, but that both accounts fail to predict a +qualitative feature of human-created programs, namely that people prefer +programs with reuse over and above the predictions of MDL. We formalize this +preference for reuse by extending the MDL account into a generative model over +programs, modeling hierarchy choice as the induction of a grammar over actions. +Our account can explain the preference for reuse and provides better +predictions of human behavior, going beyond simple accounts of compressibility +to highlight a principle that guides hierarchical planning. + +
+
+
+
+
+ + ♻ ☆ A Probabilistic Perspective on Unlearning and Alignment for Large + Language Models + + +
+ Comprehensive evaluation of Large Language Models (LLMs) is an open research +problem. Existing evaluations rely on deterministic point estimates generated +via greedy decoding. However, we find that deterministic evaluations fail to +capture the whole output distribution of a model, yielding inaccurate +estimations of model capabilities. This is particularly problematic in critical +contexts such as unlearning and alignment, where precise model evaluations are +crucial. To remedy this, we introduce the first formal probabilistic evaluation +framework in LLMs. Namely, we derive novel metrics with high-probability +guarantees concerning the output distribution of a model. Our metrics are +application-independent and allow practitioners to make more reliable estimates +about model capabilities before deployment. Through a case study focused on +unlearning, we reveal that deterministic evaluations falsely indicate +successful unlearning, whereas our probabilistic evaluations demonstrate that +most if not all of the supposedly unlearned information remains accessible in +these models. Additionally, we propose a novel unlearning loss based on entropy +optimization and adaptive temperature scaling, which significantly improves +unlearning in probabilistic settings on recent benchmarks. Our proposed shift +from point estimates to probabilistic evaluations of output distributions +represents an important step toward comprehensive evaluations of LLMs. Code +available at https://github.com/yascho/probabilistic-unlearning. + +
+
+
+
+
+ + ♻ ☆ BayLing 2: A Multilingual Large Language Model with Efficient Language + Alignment + + +
+ Large language models (LLMs), with their powerful generative capabilities and
+vast knowledge, empower various tasks in everyday life. However, these
+abilities are primarily concentrated in high-resource languages, leaving
+low-resource languages with weaker generative capabilities and relatively
+limited knowledge. Enhancing the multilingual capabilities of LLMs is therefore
+crucial for serving over 100 linguistic communities worldwide. An intuitive
+approach to enhance the multilingual capabilities would be to construct
+instruction data for various languages, but constructing instruction data for
+over 100 languages is prohibitively costly. In this paper, we introduce BayLing
+2, which efficiently transfers generative capabilities and knowledge from
+high-resource languages to low-resource languages through language alignment.
+To achieve this, we constructed a dataset of 3.2 million instructions,
+comprising high-resource language instructions (Chinese and English) and
+cross-lingual instructions for 100+ languages and performed instruction tuning
+based on the dataset to facilitate the capability transfer between languages.
+Using Llama as the foundation model, we developed BayLing-2-7B, BayLing-2-13B,
+and BayLing-2-8B, and conducted a comprehensive evaluation of BayLing. For
+multilingual translation across 100+ languages, BayLing shows superior
+performance compared to open-source models of similar scale. For multilingual
+knowledge and understanding benchmarks, BayLing achieves significant
+improvements across over 20 low-resource languages, demonstrating its
+capability of effective knowledge transfer from high-resource to low-resource
+languages. Furthermore, results on English benchmarks indicate that BayLing
+maintains high performance in high-resource languages while enhancing the
+performance in low-resource languages. Demo, homepage, code and models of
+BayLing are available.
+
+
+ comment: BayLing 2's online demo: http://nlp.ict.ac.cn/bayling/demo. BayLing + 2's code and models: https://github.com/ictnlp/BayLing +
+
+
+
+
+ + ♻ ☆ A Novel Approach to Comprehending Users' Preferences for Accurate + Personalized News Recommendation + + +
+ Personalized news recommendation aims to assist users in finding news +articles that align with their interests, which plays a pivotal role in +mitigating users' information overload problem. Although many recent works have +been studied for better personalized news recommendation, the following +challenges should be explored more: (C1) Comprehending manifold intents coupled +within a news article, (C2) Differentiating varying post-read preferences of +news articles, and (C3) Addressing the cold-start user problem. To tackle the +aforementioned challenges together, in this paper, we propose a novel +personalized news recommendation framework (CROWN) that employs (1) +category-guided intent disentanglement for (C1), (2) consistency-based news +representation for (C2), and (3) GNN-enhanced hybrid user representation for +(C3). Furthermore, we incorporate a category prediction into the training +process of CROWN as an auxiliary task, which provides supplementary supervisory +signals to enhance intent disentanglement. Extensive experiments on two +real-world datasets reveal that (1) CROWN provides consistent performance +improvements over ten state-of-the-art news recommendation methods and (2) the +proposed strategies significantly improve the accuracy of CROWN. + +
+
+ comment: 10 pages, 6 figures, 8 tables +
+
+
+
+
+ + ♻ ☆ PolarBEVDet: Exploring Polar Representation for Multi-View 3D Object + Detection in Bird's-Eye-View + + +
+ Recently, LSS-based multi-view 3D object detection provides an economical and
+deployment-friendly solution for autonomous driving. However, all the existing
+LSS-based methods transform multi-view image features into a Cartesian
+Bird's-Eye-View(BEV) representation, which does not take into account the
+non-uniform image information distribution and hardly exploits the view
+symmetry. In this paper, in order to adapt the image information distribution
+and preserve the view symmetry by regular convolution, we propose to employ the
+polar BEV representation to substitute the Cartesian BEV representation. To
+achieve this, we elaborately tailor three modules: a polar view transformer to
+generate the polar BEV representation, a polar temporal fusion module for
+fusing historical polar BEV features and a polar detection head to predict the
+polar-parameterized representation of the object. In addition, we design a 2D
+auxiliary detection head and a spatial attention enhancement module to improve
+the quality of feature extraction in perspective view and BEV, respectively.
+Finally, we integrate the above improvements into a novel multi-view 3D object
+detector, PolarBEVDet. Experiments on nuScenes show that PolarBEVDet achieves
+superior performance. The code is available at
+https://github.com/Yzichen/PolarBEVDet.git.
+
+
+ comment: 11 pages, 6 figures. This work has been submitted to the IEEE for + possible publication +
+
+
+
+
+ + ♻ ☆ PITN: Physics-Informed Temporal Networks for Cuffless Blood Pressure + Estimation + + +
+ Monitoring blood pressure with non-invasive sensors has gained popularity for
+providing comfortable user experiences, one of which is a significant function
+of smart wearables. Although providing a comfortable user experience, such
+methods are suffering from the demand for a significant amount of realistic
+data to train an individual model for each subject, especially considering the
+invasive or obtrusive BP ground-truth measurements. To tackle this challenge,
+we introduce a novel physics-informed temporal network~(PITN) with adversarial
+contrastive learning to enable precise BP estimation with very limited data.
+Specifically, we first enhance the physics-informed neural network~(PINN) with
+the temporal block for investigating BP dynamics' multi-periodicity for
+personal cardiovascular cycle modeling and temporal variation. We then employ
+adversarial training to generate extra physiological time series data,
+improving PITN's robustness in the face of sparse subject-specific training
+data. Furthermore, we utilize contrastive learning to capture the
+discriminative variations of cardiovascular physiologic phenomena. This
+approach aggregates physiological signals with similar blood pressure values in
+latent space while separating clusters of samples with dissimilar blood
+pressure values. Experiments on three widely-adopted datasets with different
+modalities (\emph{i.e.,} bioimpedance, PPG, millimeter-wave) demonstrate the
+superiority and effectiveness of the proposed methods over previous
+state-of-the-art approaches. The code is available
+at~\url{https://github.com/Zest86/ACL-PITN}.
+
+
+ comment: 12 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ From Robustness to Explainability and Back Again + + +
+ Formal explainability guarantees the rigor of computed explanations, and so +it is paramount in domains where rigor is critical, including those deemed +high-risk. Unfortunately, since its inception formal explainability has been +hampered by poor scalability. At present, this limitation still holds true for +some families of classifiers, the most significant being deep neural networks. +This paper addresses the poor scalability of formal explainability and proposes +novel efficient algorithms for computing formal explanations. The novel +algorithm computes explanations by answering instead a number of robustness +queries, and such that the number of such queries is at most linear on the +number of features. Consequently, the proposed algorithm establishes a direct +relationship between the practical complexity of formal explainability and that +of robustness. To achieve the proposed goals, the paper generalizes the +definition of formal explanations, thereby allowing the use of robustness tools +that are based on different distance norms, and also by reasoning in terms of +some target degree of robustness. Preliminary experiments validate the +practical efficiency of the proposed approach. + +
+
+
+
+
+ + ♻ ☆ EnrichEvent: Enriching Social Data with Contextual Information for + Emerging Event Extraction + + +
+ Social platforms have emerged as crucial platforms for disseminating
+information and discussing real-life social events, offering researchers an
+excellent opportunity to design and implement novel event detection frameworks.
+However, most existing approaches only exploit keyword burstiness or network
+structures to detect unspecified events. Thus, they often struggle to identify
+unknown events given the challenging nature of events and social data.
+Social data, e.g., tweets, is characterized by misspellings, incompleteness,
+word sense ambiguity, irregular language, and variation in aspects of
+opinions. Moreover, extracting discriminative features and patterns for
+evolving events by exploiting the limited structural knowledge is almost
+infeasible. To address these challenges, in this paper, we propose a novel
+framework, namely EnrichEvent, that leverages the linguistic and contextual
+representations of streaming social data. In particular, we leverage contextual
+and linguistic knowledge to detect semantically related tweets and enhance the
+effectiveness of the event detection approaches. Eventually, our proposed
+framework produces cluster chains for each event to show the evolving variation
+of the event through time. We conducted extensive experiments to evaluate our
+framework, validating its high performance and effectiveness in detecting and
+distinguishing unspecified social events.
+
+
+
+
+
+ + ♻ ☆ Morescient GAI for Software Engineering (Extended Version) + + +
+ The ability of Generative AI (GAI) technology to automatically check, +synthesize and modify software engineering artifacts promises to revolutionize +all aspects of software engineering. Using GAI for software engineering tasks +is consequently one of the most rapidly expanding fields of software +engineering research, with over a hundred LLM-based code models having been +published since 2021. However, the overwhelming majority of existing code +models share a major weakness - they are exclusively trained on the syntactic +facet of software, significantly lowering their trustworthiness in tasks +dependent on software semantics. To address this problem, a new class of +"Morescient" GAI is needed that is "aware" of (i.e., trained on) both the +semantic and static facets of software. This, in turn, will require a new +generation of software observation platforms capable of generating large +quantities of execution observations in a structured and readily analyzable +way. In this paper, we present a vision and roadmap for how such "Morescient" +GAI models can be engineered, evolved and disseminated according to the +principles of open science. + +
+
+ comment: To appear in ACM Transactions on Software Engineering and + Methodology, Special Issue "2030 Roadmap Software Engineering" +
+
+
+
+
+ + ♻ ☆ Latent Diffusion Model-Enabled Low-Latency Semantic Communication in the + Presence of Semantic Ambiguities and Wireless Channel Noises + + +
+ Deep learning (DL)-based Semantic Communications (SemCom) is becoming
+critical to maximize overall efficiency of communication networks.
+Nevertheless, SemCom is sensitive to wireless channel uncertainties, source
+outliers, and suffers from poor generalization bottlenecks. To address the
+mentioned challenges, this paper develops a latent diffusion model-enabled
+SemCom system with three key contributions, i.e., i) to handle potential
+outliers in the source data, semantic errors obtained by projected gradient
+descent based on the vulnerabilities of DL models, are utilized to update the
+parameters and obtain an outlier-robust encoder, ii) a lightweight single-layer
+latent space transformation adapter completes one-shot learning at the
+transmitter and is placed before the decoder at the receiver, enabling
+adaptation for out-of-distribution data and enhancing human-perceptual quality,
+and iii) an end-to-end consistency distillation (EECD) strategy is used to
+distill the diffusion models trained in latent space, enabling deterministic
+single or few-step low-latency denoising in various noisy channels while
+maintaining high semantic quality. Extensive numerical experiments across
+different datasets demonstrate the superiority of the proposed SemCom system,
+consistently proving its robustness to outliers, the capability to transmit
+data with unknown distributions, and the ability to perform real-time channel
+denoising tasks while preserving high human perceptual quality, outperforming
+the existing denoising approaches in semantic metrics like learned perceptual
+image path similarity (LPIPS).
+
+
+
+
+
+ + ♻ ☆ The Logic of Counterfactuals and the Epistemology of Causal Inference + + +
+ The 2021 Nobel Prize in Economics recognizes a type of causal model known as +the Rubin causal model, or potential outcome framework, which deserves far more +attention from philosophers than it currently receives. To spark philosophers' +interest, I develop a dialectic connecting the Rubin causal model to the +Lewis-Stalnaker debate on a logical principle of counterfactuals: Conditional +Excluded Middle (CEM). I begin by playing good cop for CEM, developing a new +argument in its favor -- a Quine-Putnam-style indispensability argument. This +argument is based on the observation that CEM seems to be indispensable to the +Rubin causal model, which underpins our best scientific theory of causal +inference in health and social sciences -- a Nobel Prize-winning theory. +Indeed, CEM has long remained a core assumption of the Rubin causal model, +despite challenges from within the statistics and economics communities over +twenty years ago. I then switch sides to play bad cop for CEM, undermining the +indispensability argument by developing a new theory of causal inference that +dispenses with CEM while preserving the successes of the original theory +(thanks to a new theorem proved here). The key, somewhat surprisingly, is to +integrate two approaches to causal modeling: the Rubin causal model, more +familiar in health and social sciences, and the causal Bayes net, more familiar +in philosophy. The good cop/bad cop dialectic is concluded with a connection to +broader philosophical issues, including intertheory relations, the revisability +of logic, and the role of background assumptions in justifying scientific +inference. + +
+
+
+
+
+ + ♻ ☆ Governance of Generative Artificial Intelligence for Companies + + +
+ Generative Artificial Intelligence (GenAI), specifically large language +models like ChatGPT, has swiftly entered organizations without adequate +governance, posing both opportunities and risks. Despite extensive debates on +GenAI's transformative nature and regulatory measures, limited research +addresses organizational governance, encompassing technical and business +perspectives. Although numerous frameworks for governance of AI exist, it is +not clear to what extent they apply to GenAI. Our review paper fills this gap +by surveying recent works with the purpose of better understanding fundamental +characteristics of GenAI and adjusting prior frameworks specifically towards +GenAI governance within companies. To do so, it extends Nickerson's framework +development processes to include prior conceptualizations. Our framework +outlines the scope, objectives, and governance mechanisms tailored to harness +business opportunities as well as mitigate risks associated with GenAI +integration. Our research contributes a focused approach to GenAI governance, +offering practical insights for companies navigating the challenges of GenAI +adoption and highlighting research gaps. + +
+
+
+
+
+ + ♻ ☆ LLM-ABBA: Understanding time series via symbolic approximation + + +
+ The success of large language models (LLMs) for time series has been
+demonstrated in previous work. Utilizing a symbolic time series representation,
+one can efficiently bridge the gap between LLMs and time series. However, the
+remaining challenge is to exploit the semantic information hidden in time
+series by using symbols or existing tokens of LLMs, while aligning the
+embedding space of LLMs according to the hidden information of time series. The
+symbolic time series approximation (STSA) method called adaptive Brownian
+bridge-based symbolic aggregation (ABBA) shows outstanding efficacy in
+preserving salient time series features by modeling time series patterns in
+terms of amplitude and period while using existing tokens of LLMs.
+ In this paper, we introduce a method, called LLM-ABBA, that integrates ABBA
+into large language models for various downstream time series tasks. By
+symbolizing time series, LLM-ABBA compares favorably to the recent
+state-of-the-art (SOTA) in UCR and three medical time series classification
+tasks. Meanwhile, a fixed-polygonal chain trick in ABBA is introduced to
+avoid obvious drifting during prediction tasks by significantly mitigating
+the effects of cumulative error arising from misused symbols during the
+transition from symbols to numerical values. In time series regression tasks,
+LLM-ABBA achieves the new SOTA on Time Series Extrinsic Regression (TSER)
+benchmarks. LLM-ABBA also shows competitive prediction capability compared to
+recent SOTA time series prediction results. We believe this framework can also
+seamlessly extend to other time series tasks.
+
+
+
+
+
+ + ♻ ☆ OceanCastNet: A Deep Learning Ocean Wave Model with Energy Conservation + + +
+ Traditional wave forecasting models, although based on energy conservation +equations, are computationally expensive. On the other hand, existing deep +learning geophysical fluid models, while computationally efficient, often +suffer from issues such as energy dissipation in long-term forecasts. This +paper proposes a novel energy-balanced deep learning wave forecasting model +called OceanCastNet (OCN). By incorporating wind fields at the current, +previous, and future time steps, as well as wave fields at the current and +previous time steps as input variables, OCN maintains energy balance within the +model. Furthermore, the model employs adaptive Fourier operators as its core +components and designs a masked loss function to better handle the impact of +land-sea boundaries. A series of experiments on the ERA5 dataset demonstrate +that OCN can achieve short-term forecast accuracy comparable to traditional +models while exhibiting an understanding of the wave generation process. In +comparative experiments under both normal and extreme conditions, OCN +consistently outperforms the widely used WaveWatch III model in the industry. +Even after long-term forecasting, OCN maintains a stable and energy-rich state. +By further constructing a simple meteorological model, OCN-wind, which +considers energy balance, this paper confirms the importance of energy +constraints for improving the long-term forecast performance of deep learning +meteorological models. This finding provides new ideas for future research on +deep learning geophysical fluid models. + +
+
+
+
+
+ + ♻ ☆ ASTM :Autonomous Smart Traffic Management System Using Artificial + Intelligence CNN and LSTM + + +
+ In the modern world, the development of Artificial Intelligence (AI) has
+contributed to improvements in various areas, including automation, computer
+vision, fraud detection, and more. AI can be leveraged to enhance the
+efficiency of Autonomous Smart Traffic Management (ASTM) systems and reduce
+traffic congestion rates. This paper presents an Autonomous Smart Traffic
+Management (STM) system that uses AI to improve traffic flow rates. The system
+employs the YOLO V5 Convolutional Neural Network to detect vehicles in traffic
+management images. Additionally, it predicts the number of vehicles for the
+next 12 hours using a Recurrent Neural Network with Long Short-Term Memory
+(RNN-LSTM). The Smart Traffic Management Cycle Length Analysis manages the
+traffic cycle length based on these vehicle predictions, aided by AI. From the
+results of the RNN-LSTM model for predicting vehicle numbers over the next 12
+hours, we observe that the model predicts traffic with a Mean Squared Error
+(MSE) of 4.521 vehicles and a Root Mean Squared Error (RMSE) of 2.232 vehicles.
+After simulating the STM system in the CARLA simulation environment, we found
+that the Traffic Management Congestion Flow Rate with ASTM (21 vehicles per
+minute) is 50% higher than the rate without STM (around 15 vehicles per
+minute). Additionally, the Traffic Management Vehicle Pass Delay with STM (5
+seconds per vehicle) is 70% lower than without STM (around 12 seconds per
+vehicle). These results demonstrate that the STM system using AI can increase
+traffic flow by 50% and reduce vehicle pass delays by 70%.
+
+
+ comment: In process to IEEE Intelligent Vehicle Symposium 2025 +
+
+
+
+
+ + ♻ ☆ Normalizing self-supervised learning for provably reliable Change Point + Detection + + +
+ Change point detection (CPD) methods aim to identify abrupt shifts in the +distribution of input data streams. Accurate estimators for this task are +crucial across various real-world scenarios. Yet, traditional unsupervised CPD +techniques face significant limitations, often relying on strong assumptions or +suffering from low expressive power due to inherent model simplicity. In +contrast, representation learning methods overcome these drawbacks by offering +flexibility and the ability to capture the full complexity of the data without +imposing restrictive assumptions. However, these approaches are still emerging +in the CPD field and lack robust theoretical foundations to ensure their +reliability. Our work addresses this gap by integrating the expressive power of +representation learning with the groundedness of traditional CPD techniques. We +adopt spectral normalization (SN) for deep representation learning in CPD tasks +and prove that the embeddings after SN are highly informative for CPD. Our +method significantly outperforms current state-of-the-art methods during the +comprehensive evaluation via three standard CPD datasets. + +
+
+
+
+
+ + ♻ ☆ ST-WebAgentBench: A Benchmark for Evaluating Safety and Trustworthiness + in Web Agents + + +
+ Recent advancements in Web agents have introduced novel architectures and +benchmarks showcasing progress in autonomous web navigation and interaction. +However, most existing benchmarks prioritize effectiveness and accuracy, +overlooking factors like safety and trustworthiness which are essential for +deploying web agents in enterprise settings. We present STWebAgentBench, a +benchmark designed to evaluate web agents safety and trustworthiness across six +critical dimensions, essential for reliability in enterprise applications. This +benchmark is grounded in a detailed framework that defines safe and trustworthy +(ST) agent behavior. Our work extends WebArena with safety templates and +evaluation functions to assess safety policy compliance rigorously. We +introduce the Completion Under Policy to measure task success while adhering to +policies, alongside the Risk Ratio, which quantifies policy violations across +dimensions, providing actionable insights to address safety gaps. Our +evaluation reveals that current SOTA agents struggle with policy adherence and +cannot yet be relied upon for critical business applications. We open-source +this benchmark and invite the community to contribute, with the goal of +fostering a new generation of safer, more trustworthy AI agents. All code, +data, environment reproduction resources, and video demonstrations are +available at https://sites.google.com/view/st-webagentbench/home. + +
+
+
+
+
+ + ♻ ☆ Towards Cross-Lingual Audio Abuse Detection in Low-Resource Settings + with Few-Shot Learning COLING 2025 + + +
+ Online abusive content detection, particularly in low-resource settings and +within the audio modality, remains underexplored. We investigate the potential +of pre-trained audio representations for detecting abusive language in +low-resource languages, in this case, in Indian languages using Few Shot +Learning (FSL). Leveraging powerful representations from models such as Wav2Vec +and Whisper, we explore cross-lingual abuse detection using the ADIMA dataset +with FSL. Our approach integrates these representations within the +Model-Agnostic Meta-Learning (MAML) framework to classify abusive language in +10 languages. We experiment with various shot sizes (50-200) evaluating the +impact of limited data on performance. Additionally, a feature visualization +study was conducted to better understand model behaviour. This study highlights +the generalization ability of pre-trained models in low-resource scenarios and +offers valuable insights into detecting abusive language in multilingual +contexts. + +
+
+ comment: Accepted as part of the proceedings of COLING 2025 +
+
+
+
+
+ + ♻ ☆ A Domain-Independent Agent Architecture for Adaptive Operation in + Evolving Open Worlds + + +
+ Model-based reasoning agents are ill-equipped to act in novel situations in +which their model of the environment no longer sufficiently represents the +world. We propose HYDRA - a framework for designing model-based agents +operating in mixed discrete-continuous worlds, that can autonomously detect +when the environment has evolved from its canonical setup, understand how it +has evolved, and adapt the agents' models to perform effectively. HYDRA is +based upon PDDL+, a rich modeling language for planning in mixed, +discrete-continuous environments. It augments the planning module with visual +reasoning, task selection, and action execution modules for closed-loop +interaction with complex environments. HYDRA implements a novel meta-reasoning +process that enables the agent to monitor its own behavior from a variety of +aspects. The process employs a diverse set of computational methods to maintain +expectations about the agent's own behavior in an environment. Divergences from +those expectations are useful in detecting when the environment has evolved and +identifying opportunities to adapt the underlying models. HYDRA builds upon +ideas from diagnosis and repair and uses a heuristics-guided search over model +changes such that they become competent in novel conditions. The HYDRA +framework has been used to implement novelty-aware agents for three diverse +domains - CartPole++ (a higher dimension variant of a classic control problem), +Science Birds (an IJCAI competition problem), and PogoStick (a specific problem +domain in Minecraft). We report empirical observations from these domains to +demonstrate the efficacy of various components in the novelty meta-reasoning +process. + +
+
+
+
+
+ + ♻ ☆ Center-Sensitive Kernel Optimization for Efficient On-Device Incremental + Learning + + +
+ To facilitate the evolution of edge intelligence in ever-changing +environments, we study on-device incremental learning constrained in limited +computation resource in this paper. Current on-device training methods just +focus on efficient training without considering the catastrophic forgetting, +preventing the model getting stronger when continually exploring the world. To +solve this problem, a direct solution is to involve the existing incremental +learning mechanisms into the on-device training framework. Unfortunately, such +a manner cannot work well as those mechanisms usually introduce large +additional computational cost to the network optimization process, which would +inevitably exceed the memory capacity of the edge devices. To address this +issue, this paper makes an early effort to propose a simple but effective +edge-friendly incremental learning framework. Based on an empirical study on +the knowledge intensity of the kernel elements of the neural network, we find +that the center kernel is the key for maximizing the knowledge intensity for +learning new data, while freezing the other kernel elements would get a good +balance on the model's capacity for overcoming catastrophic forgetting. Upon +this finding, we further design a center-sensitive kernel optimization +framework to largely alleviate the cost of the gradient computation and +back-propagation. Besides, a dynamic channel element selection strategy is also +proposed to facilitate a sparse orthogonal gradient projection for further +reducing the optimization complexity, upon the knowledge explored from the new +task data. Extensive experiments validate our method is efficient and +effective, e.g., our method achieves average accuracy boost of 38.08% with even +less memory and approximate computation compared to existing on-device training +methods, indicating its significant potential for on-device incremental +learning. + +
+
+
+
+
+ + ♻ ☆ VISION-XL: High Definition Video Inverse Problem Solver using Latent + Image Diffusion Models + + +
+ In this paper, we propose a novel framework for solving high-definition video +inverse problems using latent image diffusion models. Building on recent +advancements in spatio-temporal optimization for video inverse problems using +image diffusion models, our approach leverages latent-space diffusion models to +achieve enhanced video quality and resolution. To address the high +computational demands of processing high-resolution frames, we introduce a +pseudo-batch consistent sampling strategy, allowing efficient operation on a +single GPU. Additionally, to improve temporal consistency, we present +batch-consistent inversion, an initialization technique that incorporates +informative latents from the measurement frame. By integrating with SDXL, our +framework achieves state-of-the-art video reconstruction across a wide range of +spatio-temporal inverse problems, including complex combinations of frame +averaging and various spatial degradations, such as deblurring, +super-resolution, and inpainting. Unlike previous methods, our approach +supports multiple aspect ratios (landscape, vertical, and square) and delivers +HD-resolution reconstructions (exceeding 1280x720) in under 2.5 minutes on a +single NVIDIA 4090 GPU. + +
+
+ comment: Project page: https://vision-xl.github.io/ +
+
+
+
+
+ + ♻ ☆ MBA-RAG: a Bandit Approach for Adaptive Retrieval-Augmented Generation + through Question Complexity COLING 2025 + + +
+ Retrieval Augmented Generation (RAG) has proven to be highly effective in
+boosting the generative performance of language model in knowledge-intensive
+tasks. However, existing RAG framework either indiscriminately perform
+retrieval or rely on rigid single-class classifiers to select retrieval
+methods, leading to inefficiencies and suboptimal performance across queries of
+varying complexity. To address these challenges, we propose a reinforcement
+learning-based framework that dynamically selects the most suitable retrieval
+strategy based on query complexity. Our approach leverages a
+multi-armed bandit algorithm, which treats each retrieval method as a distinct
+"arm" and adapts the selection process by balancing exploration and
+exploitation. Additionally, we introduce a dynamic reward function that
+balances accuracy and efficiency, penalizing methods that require more
+retrieval steps, even if they lead to a correct result. Our method achieves new
+state of the art results on multiple single-hop and multi-hop datasets while
+reducing retrieval costs. Our code is available at
+https://github.com/FUTUREEEEEE/MBA .
+
+
+ comment: COLING 2025 +
+
+
+
+
+ + ♻ ☆ MarineFormer: A Spatio-Temporal Attention Model for USV Navigation in + Dynamic Marine Environments + + +
+ Navigating autonomously in marine environments including dynamic and static
+obstacles, and strong flow disturbances, such as in high-flow rivers, poses
+significant challenges for USVs. To address these challenges, we propose a
+novel methodology that leverages two types of attention: spatial attention,
+which learns to integrate diverse environmental factors and sensory information
+into navigation decisions, and temporal attention within a transformer
+framework to account for the dynamic, continuously changing nature of the
+environment. We devise MarineFormer, a Transformer-based navigation
+policy for dynamic Marine environments, trained end-to-end through
+reinforcement learning (RL). At its core, MarineFormer uses graph attention to
+capture spatial information and a transformer architecture to process temporal
+sequences in an environment that simulates a 2D turbulent marine condition
+involving multiple static and dynamic obstacles. We extensively evaluate the
+performance of the proposed method versus the state-of-the-art methods, as well
+as other classical planners. Our approach outperforms the state-of-the-art by
+nearly 20% in episode completion success rate and additionally enhances the
+USV's path length efficiency.
+
+
+
+
+
+ + ♻ ☆ Bidirectional Decoding: Improving Action Chunking via Closed-Loop + Resampling + + +
+ Predicting and executing a sequence of actions without intermediate +replanning, known as action chunking, is increasingly used in robot learning +from human demonstrations. Yet, its reported effects on the learned policy are +inconsistent: some studies find it crucial for achieving strong results, while +others observe decreased performance. In this paper, we first dissect how +action chunking impacts the divergence between a learner and a demonstrator. We +find that action chunking allows the learner to better capture the temporal +dependencies in demonstrations but at the cost of reduced reactivity in +stochastic environments. To address this tradeoff, we propose Bidirectional +Decoding (BID), a test-time inference algorithm that bridges action chunking +with closed-loop operations. BID samples multiple predictions at each time step +and searches for the optimal one based on two criteria: (i) backward coherence, +which favors samples that align with previous decisions; (ii) forward contrast, +which seeks samples of high likelihood for future plans. By coupling decisions +within and across action chunks, BID promotes consistency over time while +maintaining reactivity to unexpected changes. Experimental results show that +BID boosts the performance of two state-of-the-art generative policies across +seven simulation benchmarks and two real-world tasks. Code and videos are +available at https://bid-robot.github.io. + +
+
+ comment: Project website: https://bid-robot.github.io/ +
+
+
+
+
+ + ♻ ☆ CultureLLM: Incorporating Cultural Differences into Large Language + Models NeurIPS 2024 + + +
+ Large language models (LLMs) are reported to be partial to certain cultures +owing to the training data dominance from the English corpora. Since +multilingual cultural data are often expensive to collect, existing efforts +handle this by prompt engineering or culture-specific pre-training. However, +they might overlook the knowledge deficiency of low-resource culture and +require extensive computing resources. In this paper, we propose CultureLLM, a +cost-effective solution to incorporate cultural differences into LLMs. +CultureLLM adopts World Value Survey (WVS) as seed data and generates +semantically equivalent training data via the proposed semantic data +augmentation. Using only 50 seed samples from WVS with augmented data, we +fine-tune culture-specific LLMs and one unified model (CultureLLM-One) for 9 +cultures covering rich and low-resource languages. Extensive experiments on 60 +culture-related datasets demonstrate that CultureLLM significantly outperforms +various counterparts such as GPT-3.5 (by 8.1%) and Gemini Pro (by 9.5%) with +comparable performance to GPT-4 or even better. Our human study shows that the +generated samples are semantically equivalent to the original samples, +providing an effective solution for LLMs augmentation. Code is released at +https://github.com/Scarelette/CultureLLM. + +
+
+ comment: NeurIPS 2024; Code is at https://github.com/Scarelette/CultureLLM +
+
+
+
+
+ + ♻ ☆ Harmful Fine-tuning Attacks and Defenses for Large Language Models: A + Survey + + +
+ Recent research demonstrates that the nascent fine-tuning-as-a-service
+business model exposes serious safety concerns -- fine-tuning over a few
+harmful data uploaded by the users can compromise the safety alignment of the
+model. The attack, known as harmful fine-tuning attack, has raised a broad
+research interest among the community. However, as the attack is still new,
+we observe that there are general misunderstandings within the research
+community. To clear up concern, this paper provides a comprehensive overview of
+three aspects of harmful fine-tuning: attacks setting, defense design and
+evaluation methodology. Specifically, we first present the threat model of the
+problem, and introduce the harmful fine-tuning attack and its variants. Then we
+systematically survey the existing literature on attacks/defenses/mechanical
+analysis of the problem. Finally, we introduce the evaluation methodology and
+outline future research directions that might contribute to the development of
+the field. Additionally, we present a list of questions of interest, which
+might be useful to refer to when reviewers in the peer review process question
+the realism of the experiment/attack/defense setting. A curated list of
+relevant papers is maintained and made accessible at:
+https://github.com/git-disl/awesome_LLM-harmful-fine-tuning-papers.
+
+
+
+
+
+ + ♻ ☆ Intelligent Go-Explore: Standing on the Shoulders of Giant Foundation + Models + + +
+ Go-Explore is a powerful family of algorithms designed to solve +hard-exploration problems built on the principle of archiving discovered +states, and iteratively returning to and exploring from the most promising +states. This approach has led to superhuman performance across a wide variety +of challenging problems including Atari games and robotic control, but requires +manually designing heuristics to guide exploration (i.e., determine which +states to save and explore from, and what actions to consider next), which is +time-consuming and infeasible in general. To resolve this, we propose +Intelligent Go-Explore (IGE) which greatly extends the scope of the original +Go-Explore by replacing these handcrafted heuristics with the intelligence and +internalized human notions of interestingness captured by giant pretrained +foundation models (FMs). This provides IGE with a human-like ability to +instinctively identify how interesting or promising any new state is (e.g., +discovering new objects, locations, or behaviors), even in complex environments +where heuristics are hard to define. Moreover, IGE offers the exciting +opportunity to recognize and capitalize on serendipitous discoveries-states +encountered during exploration that are valuable in terms of exploration, yet +where what makes them interesting was not anticipated by the human user. We +evaluate our algorithm on a diverse range of language and vision-based tasks +that require search and exploration. Across these tasks, IGE strongly exceeds +classic reinforcement learning and graph search baselines, and also succeeds +where prior state-of-the-art FM agents like Reflexion completely fail. Overall, +Intelligent Go-Explore combines the tremendous strengths of FMs and the +powerful Go-Explore algorithm, opening up a new frontier of research into +creating more generally capable agents with impressive exploration +capabilities. + +
+
+
+
+
+ + ♻ ☆ Embedded Prompt Tuning: Towards Enhanced Calibration of Pretrained + Models for Medical Images + + +
+ Foundation models pre-trained on large-scale data have been widely witnessed +to achieve success in various natural imaging downstream tasks. +Parameter-efficient fine-tuning (PEFT) methods aim to adapt foundation models +to new domains by updating only a small portion of parameters in order to +reduce computational overhead. However, the effectiveness of these PEFT +methods, especially in cross-domain few-shot scenarios, e.g., medical image +analysis, has not been fully explored. In this work, we facilitate the study of +the performance of PEFT when adapting foundation models to medical image +classification tasks. Furthermore, to alleviate the limitations of prompt +introducing ways and approximation capabilities on Transformer architectures of +mainstream prompt tuning methods, we propose the Embedded Prompt Tuning (EPT) +method by embedding prompt tokens into the expanded channels. We also find that +there are anomalies in the feature space distribution of foundation models +during pre-training process, and prompt tuning can help mitigate this negative +impact. To explain this phenomenon, we also introduce a novel perspective to +understand prompt tuning: Prompt tuning is a distribution calibrator. And we +support it by analyzing patch-wise scaling and feature separation operations +contained in EPT. Our experiments show that EPT outperforms several +state-of-the-art fine-tuning methods by a significant margin on few-shot +medical image classification tasks, and completes the fine-tuning process +within highly competitive time, indicating EPT is an effective PEFT method. The +source code is available at github.com/zuwenqiang/EPT. + +
+
+
+
+
+ + ♻ ☆ FlickerFusion: Intra-trajectory Domain Generalizing Multi-Agent RL NeurIPS '24 + + +
+ Multi-agent reinforcement learning has demonstrated significant potential in
+addressing complex cooperative tasks across various real-world applications.
+However, existing MARL approaches often rely on the restrictive assumption that
+the number of entities (e.g., agents, obstacles) remains constant between
+training and inference. This overlooks scenarios where entities are dynamically
+removed or added during the inference trajectory -- a common occurrence in
+real-world environments like search and rescue missions and dynamic combat
+situations. In this paper, we tackle the challenge of intra-trajectory dynamic
+entity composition under zero-shot out-of-domain (OOD) generalization, where
+such dynamic changes cannot be anticipated beforehand. Our empirical studies
+reveal that existing MARL methods suffer significant performance degradation
+and increased uncertainty in these scenarios. In response, we propose
+FlickerFusion, a novel OOD generalization method that acts as a universally
+applicable augmentation technique for MARL backbone methods. FlickerFusion
+stochastically drops out parts of the observation space, emulating being
+in-domain when inferenced OOD. The results show that FlickerFusion not only
+achieves superior inference rewards but also uniquely reduces uncertainty
+vis-à-vis the backbone, compared to existing methods. Benchmarks,
+implementations, and model weights are organized and open-sourced at
+flickerfusion305.github.io, accompanied by ample demo video renderings.
+
+
+ comment: NeurIPS '24 Open-World Agents Workshop +
+
+
+
+
+ + ♻ ☆ Privacy-Preserving Federated Learning via Homomorphic Adversarial + Networks + + +
+ Privacy-preserving federated learning (PPFL) aims to train a global model for +multiple clients while maintaining their data privacy. However, current PPFL +protocols exhibit one or more of the following insufficiencies: considerable +degradation in accuracy, the requirement for sharing keys, and cooperation +during the key generation or decryption processes. As a mitigation, we develop +the first protocol that utilizes neural networks to implement PPFL, as well as +incorporating an Aggregatable Hybrid Encryption scheme tailored to the needs of +PPFL. We name these networks as Homomorphic Adversarial Networks (HANs) which +demonstrate that neural networks are capable of performing tasks similar to +multi-key homomorphic encryption (MK-HE) while solving the problems of key +distribution and collaborative decryption. Our experiments show that HANs are +robust against privacy attacks. Compared with non-private federated learning, +experiments conducted on multiple datasets demonstrate that HANs exhibit a +negligible accuracy loss (at most 1.35%). Compared to traditional MK-HE +schemes, HANs increase encryption aggregation speed by 6,075 times while +incurring a 29.2 times increase in communication overhead. + +
+
+
+
+
+ + ♻ ☆ Developing Story: Case Studies of Generative AI's Use in Journalism + + +
+ Journalists are among the many users of large language models (LLMs). To +better understand the journalist-AI interactions, we conduct a study of LLM +usage by two news agencies through browsing the WildChat dataset, identifying +candidate interactions, and verifying them by matching to online published +articles. Our analysis uncovers instances where journalists provide sensitive +material such as confidential correspondence with sources or articles from +other agencies to the LLM as stimuli and prompt it to generate articles, and +publish these machine-generated articles with limited intervention (median +output-publication ROUGE-L of 0.62). Based on our findings, we call for further +research into what constitutes responsible use of AI, and the establishment of +clear guidelines and best practices on using LLMs in a journalistic context. + +
+
+
+
+
+ + ♻ ☆ Yi-Lightning Technical Report + + +
+ This technical report presents Yi-Lightning, our latest flagship large +language model (LLM). It achieves exceptional performance, ranking 6th overall +on Chatbot Arena, with particularly strong results (2nd to 4th place) in +specialized categories including Chinese, Math, Coding, and Hard Prompts. +Yi-Lightning leverages an enhanced Mixture-of-Experts (MoE) architecture, +featuring advanced expert segmentation and routing mechanisms coupled with +optimized KV-caching techniques. Our development process encompasses +comprehensive pre-training, supervised fine-tuning (SFT), and reinforcement +learning from human feedback (RLHF), where we devise deliberate strategies for +multi-stage training, synthetic data construction, and reward modeling. +Furthermore, we implement RAISE (Responsible AI Safety Engine), a +four-component framework to address safety issues across pre-training, +post-training, and serving phases. Empowered by our scalable super-computing +infrastructure, all these innovations substantially reduce training, deployment +and inference costs while maintaining high-performance standards. With further +evaluations on public academic benchmarks, Yi-Lightning demonstrates +competitive performance against top-tier LLMs, while we observe a notable +disparity between traditional, static benchmark results and real-world, dynamic +human preferences. This observation prompts a critical reassessment of +conventional benchmarks' utility in guiding the development of more intelligent +and powerful AI systems for practical applications. Yi-Lightning is now +available through our developer platform at https://platform.lingyiwanwu.com. + +
+
+
+
+
+ + ♻ ☆ CPRM: A LLM-based Continual Pre-training Framework for Relevance + Modeling in Commercial Search + + +
+ Relevance modeling between queries and items stands as a pivotal component in +commercial search engines, directly affecting the user experience. Given the +remarkable achievements of large language models (LLMs) in various natural +language processing (NLP) tasks, LLM-based relevance modeling is gradually +being adopted within industrial search systems. Nevertheless, foundational LLMs +lack domain-specific knowledge and do not fully exploit the potential of +in-context learning. Furthermore, structured item text remains underutilized, +and there is a shortage in the supply of corresponding queries and background +knowledge. We thereby propose CPRM (Continual Pre-training for Relevance +Modeling), a framework designed for the continual pre-training of LLMs to +address these issues. Our CPRM framework includes three modules: 1) employing +both queries and multi-field item to jointly pre-train for enhancing domain +knowledge, 2) applying in-context pre-training, a novel approach where LLMs are +pre-trained on a sequence of related queries or items, and 3) conducting +reading comprehension on items to produce associated domain knowledge and +background information (e.g., generating summaries and corresponding queries) +to further strengthen LLMs. Results on offline experiments and online A/B +testing demonstrate that our model achieves convincing performance compared to +strong baselines. + +
+
+
+
+
+ + ♻ ☆ Proactive Agent: Shifting LLM Agents from Reactive Responses to Active + Assistance + + +
+ Agents powered by large language models have shown remarkable abilities in +solving complex tasks. However, most agent systems remain reactive, limiting +their effectiveness in scenarios requiring foresight and autonomous +decision-making. In this paper, we tackle the challenge of developing proactive +agents capable of anticipating and initiating tasks without explicit human +instructions. We propose a novel data-driven approach for this problem. +Firstly, we collect real-world human activities to generate proactive task +predictions. These predictions are then labeled by human annotators as either +accepted or rejected. The labeled data is used to train a reward model that +simulates human judgment and serves as an automatic evaluator of the +proactiveness of LLM agents. Building on this, we develop a comprehensive data +generation pipeline to create a diverse dataset, ProactiveBench, containing +6,790 events. Finally, we demonstrate that fine-tuning models with the proposed +ProactiveBench can significantly elicit the proactiveness of LLM agents. +Experimental results show that our fine-tuned model achieves an F1-Score of +66.47% in proactively offering assistance, outperforming all open-source and +close-source models. These results highlight the potential of our method in +creating more proactive and effective agent systems, paving the way for future +advancements in human-agent collaboration. + +
+
+ comment: 9 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Intelligent Spark Agents: A Modular LangGraph Framework for Scalable, + Visualized, and Enhanced Big Data Machine Learning Workflows + + +
+ Apache Spark is better suited for load data mining and machine learning that
+require a lot of iteration by using memory-distributed data sets. Due to the
+complexity of Spark, the high learning threshold of Scala, and the low
+reusability of its code, this paper designs and implements a Spark-based visual
+process AI+machine learning method under a big data environment. On the one
+hand, it designs component models to describe the basic steps of machine
+learning, including data preprocessing, feature processing, and model training.
+Practice and validate evaluation. On the other hand, a visual process modeling
+tool is provided to support analysts to design machine learning processes,
+which can be translated automatically into Spark platform code for efficient
+execution. This tool can greatly improve the AI machine learning efficiency of
+the Spark platform. This paper introduces the method theory, key technologies,
+and effectiveness of the tool.
+ This paper explores the application of Spark in the field of large model
+agents. Langchain, as an open-source framework, is committed to simplifying the
+development of end-to-end applications based on language models. It provides
+interfaces for interacting with a variety of large language models, optimizing
+prompt engineering, and endowing large models with the ability to invoke
+external tools. LangGraph demonstrates its powerful state management and graph
+construction capabilities by defining node functions and graphs to build
+complex agent applications. The development of Spark agent applications based
+on LangGraph has further promoted the development of AI applications in the big
+data analysis environment.
+
+
+
+
+
+ + ♻ ☆ An Empirical Study of Mamba-based Pedestrian Attribute Recognition + + +
+ Current strong pedestrian attribute recognition models are developed based on +Transformer networks, which are computationally heavy. Recently proposed models +with linear complexity (e.g., Mamba) have garnered significant attention and +have achieved a good balance between accuracy and computational cost across a +variety of visual tasks. Relevant review articles also suggest that while these +models can perform well on some pedestrian attribute recognition datasets, they +are generally weaker than the corresponding Transformer models. To further tap +into the potential of the novel Mamba architecture for PAR tasks, this paper +designs and adapts Mamba into two typical PAR frameworks, i.e., the text-image +fusion approach and pure vision Mamba multi-label recognition framework. It is +found that interacting with attribute tags as additional input does not always +lead to an improvement, specifically, Vim can be enhanced, but VMamba cannot. +This paper further designs various hybrid Mamba-Transformer variants and +conducts thorough experimental validations. These experimental results indicate +that simply enhancing Mamba with a Transformer does not always lead to +performance improvements but yields better results under certain settings. We +hope this empirical study can further inspire research in Mamba for PAR, and +even extend into the domain of multi-label recognition, through the design of +these network structures and comprehensive experimentation. The source code of +this work will be released at \url{https://github.com/Event-AHU/OpenPAR} + +
+
+ comment: In Peer Review +
+
+
+
+
+ + ♻ ☆ Probabilistic Analysis of Copyright Disputes and Generative AI Safety + + +
+ This paper presents a probabilistic approach to analyzing copyright +infringement disputes by formalizing relevant judicial principles within a +coherent framework based on the random-worlds method. It provides a structured +analysis of key evidentiary principles, with a particular focus on the +``inverse ratio rule"--a controversial doctrine adopted by some courts. +Although this rule has faced significant criticism, a formal proof demonstrates +its validity, provided it is properly defined. Additionally, the paper examines +the heightened copyright risks posed by generative AI, highlighting how +extensive access to copyrighted material by generative models increases the +risk of infringement. Utilizing the probabilistic approach, the Near +Access-Free (NAF) condition, previously proposed as a potential mitigation +strategy, is evaluated. The analysis reveals that while the NAF condition +mitigates some infringement risks, its justifiability and efficacy are +questionable in certain contexts. These findings demonstrate how a rigorous +probabilistic approach can advance our understanding of copyright jurisprudence +and its interaction with emerging technologies. + +
+
+ comment: 19 pages +
+
+
+
+
+ + ♻ ☆ Interventional Causal Discovery in a Mixture of DAGs NeurIPS 2024 + + +
+ Causal interactions among a group of variables are often modeled by a single +causal graph. In some domains, however, these interactions are best described +by multiple co-existing causal graphs, e.g., in dynamical systems or genomics. +This paper addresses the hitherto unknown role of interventions in learning +causal interactions among variables governed by a mixture of causal systems, +each modeled by one directed acyclic graph (DAG). Causal discovery from +mixtures is fundamentally more challenging than single-DAG causal discovery. +Two major difficulties stem from (i)~an inherent uncertainty about the +skeletons of the component DAGs that constitute the mixture and (ii)~possibly +cyclic relationships across these component DAGs. This paper addresses these +challenges and aims to identify edges that exist in at least one component DAG +of the mixture, referred to as the true edges. First, it establishes matching +necessary and sufficient conditions on the size of interventions required to +identify the true edges. Next, guided by the necessity results, an adaptive +algorithm is designed that learns all true edges using $O(n^2)$ interventions, +where $n$ is the number of nodes. Remarkably, the size of the interventions is +optimal if the underlying mixture model does not contain cycles across its +components. More generally, the gap between the intervention size used by the +algorithm and the optimal size is quantified. It is shown to be bounded by the +cyclic complexity number of the mixture model, defined as the size of the +minimal intervention that can break the cycles in the mixture, which is upper +bounded by the number of cycles among the ancestors of a node. + +
+
+ comment: NeurIPS 2024 camera-ready version +
+
+
+
+
+ + ♻ ☆ Analyzing Nobel Prize Literature with Large Language Models + + +
+ This study examines the capabilities of advanced Large Language Models +(LLMs), particularly the o1 model, in the context of literary analysis. The +outputs of these models are compared directly to those produced by +graduate-level human participants. By focusing on two Nobel Prize-winning short +stories, 'Nine Chapters' by Han Kang, the 2024 laureate, and 'Friendship' by +Jon Fosse, the 2023 laureate, the research explores the extent to which AI can +engage with complex literary elements such as thematic analysis, +intertextuality, cultural and historical contexts, linguistic and structural +innovations, and character development. Given the Nobel Prize's prestige and +its emphasis on cultural, historical, and linguistic richness, applying LLMs to +these works provides a deeper understanding of both human and AI approaches to +interpretation. The study uses qualitative and quantitative evaluations of +coherence, creativity, and fidelity to the text, revealing the strengths and +limitations of AI in tasks typically reserved for human expertise. While LLMs +demonstrate strong analytical capabilities, particularly in structured tasks, +they often fall short in emotional nuance and coherence, areas where human +interpretation excels. This research underscores the potential for human-AI +collaboration in the humanities, opening new opportunities in literary studies +and beyond. + +
+
+
+
+
+ + ♻ ☆ Towards Universal Mesh Movement Networks NeurIPS 2024 + + +
+ Solving complex Partial Differential Equations (PDEs) accurately and +efficiently is an essential and challenging problem in all scientific and +engineering disciplines. Mesh movement methods provide the capability to +improve the accuracy of the numerical solution without increasing the overall +mesh degree of freedom count. Conventional sophisticated mesh movement methods +are extremely expensive and struggle to handle scenarios with complex boundary +geometries. However, existing learning-based methods require re-training from +scratch given a different PDE type or boundary geometry, which limits their +applicability, and also often suffer from robustness issues in the form of +inverted elements. In this paper, we introduce the Universal Mesh Movement +Network (UM2N), which -- once trained -- can be applied in a non-intrusive, +zero-shot manner to move meshes with different size distributions and +structures, for solvers applicable to different PDE types and boundary +geometries. UM2N consists of a Graph Transformer (GT) encoder for extracting +features and a Graph Attention Network (GAT) based decoder for moving the mesh. +We evaluate our method on advection and Navier-Stokes based examples, as well +as a real-world tsunami simulation case. Our method outperforms existing +learning-based mesh movement methods in terms of the benchmarks described +above. In comparison to the conventional sophisticated Monge-Amp\`ere +PDE-solver based method, our approach not only significantly accelerates mesh +movement, but also proves effective in scenarios where the conventional method +fails. Our project page is at https://erizmr.github.io/UM2N/. + +
+
+ comment: Accepted at NeurIPS 2024 as a spotlight paper +
+
+
+
+
+ + ♻ ☆ From Seconds to Hours: Reviewing MultiModal Large Language Models on + Comprehensive Long Video Understanding + + +
+ The integration of Large Language Models (LLMs) with visual encoders has +recently shown promising performance in visual understanding tasks, leveraging +their inherent capability to comprehend and generate human-like text for visual +reasoning. Given the diverse nature of visual data, MultiModal Large Language +Models (MM-LLMs) exhibit variations in model designing and training for +understanding images, short videos, and long videos. Our paper focuses on the +substantial differences and unique challenges posed by long video understanding +compared to static image and short video understanding. Unlike static images, +short videos encompass sequential frames with both spatial and within-event +temporal information, while long videos consist of multiple events with +between-event and long-term temporal information. In this survey, we aim to +trace and summarize the advancements of MM-LLMs from image understanding to +long video understanding. We review the differences among various visual +understanding tasks and highlight the challenges in long video understanding, +including more fine-grained spatiotemporal details, dynamic events, and +long-term dependencies. We then provide a detailed summary of the advancements +in MM-LLMs in terms of model design and training methodologies for +understanding long videos. Finally, we compare the performance of existing +MM-LLMs on video understanding benchmarks of various lengths and discuss +potential future directions for MM-LLMs in long video understanding. + +
+
+ comment: 11 pages +
+
+
+
+
+ + ♻ ☆ QuadrupedGPT: Towards a Versatile Quadruped Agent in Open-ended Worlds + + +
+ As robotic agents increasingly assist humans in reality, quadruped robots +offer unique opportunities for interaction in complex scenarios due to their +agile movement. However, building agents that can autonomously navigate, adapt, +and respond to versatile goals remains a significant challenge. In this work, +we introduce QuadrupedGPT designed to follow diverse commands with agility +comparable to that of a pet. The primary challenges addressed include: i) +effectively utilizing multimodal observations for informed decision-making; ii) +achieving agile control by integrating locomotion and navigation; iii) +developing advanced cognition to execute long-term objectives. Our QuadrupedGPT +interprets human commands and environmental contexts using a large multimodal +model. Leveraging its extensive knowledge base, the agent autonomously assigns +parameters for adaptive locomotion policies and devises safe yet efficient +paths toward its goals. Additionally, it employs high-level reasoning to +decompose long-term goals into a sequence of executable subgoals. Through +comprehensive experiments, our agent shows proficiency in handling diverse +tasks and intricate instructions, representing a significant step toward the +development of versatile quadruped agents for open-ended environments. + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ TransFair: Transferring Fairness from Ocular Disease Classification to + Progression Prediction + + +
+ The use of artificial intelligence (AI) in automated disease classification +significantly reduces healthcare costs and improves the accessibility of +services. However, this transformation has given rise to concerns about the +fairness of AI, which disproportionately affects certain groups, particularly +patients from underprivileged populations. Recently, a number of methods and +large-scale datasets have been proposed to address group performance +disparities. Although these methods have shown effectiveness in disease +classification tasks, they may fall short in ensuring fair prediction of +disease progression, mainly because of limited longitudinal data with diverse +demographics available for training a robust and equitable prediction model. In +this paper, we introduce TransFair to enhance demographic fairness in +progression prediction for ocular diseases. TransFair aims to transfer a +fairness-enhanced disease classification model to the task of progression +prediction with fairness preserved. Specifically, we train a fair EfficientNet, +termed FairEN, equipped with a fairness-aware attention mechanism using +extensive data for ocular disease classification. Subsequently, this fair +classification model is adapted to a fair progression prediction model through +knowledge distillation, which aims to minimize the latent feature distances +between the classification and progression prediction models. We evaluate +FairEN and TransFair for fairness-enhanced ocular disease classification and +progression prediction using both two-dimensional (2D) and 3D retinal images. +Extensive experiments and comparisons with models with and without considering +fairness learning show that TransFair effectively enhances demographic equity +in predicting ocular disease progression. + +
+
+ comment: 12 pages +
+
+
+
+
+ + ♻ ☆ A Single-Loop Gradient Descent and Perturbed Ascent Algorithm for + Nonconvex Functional Constrained Optimization ICML 2022 + + +
+ Nonconvex constrained optimization problems can be used to model a number of +machine learning problems, such as multi-class Neyman-Pearson classification +and constrained Markov decision processes. However, such kinds of problems are +challenging because both the objective and constraints are possibly nonconvex, +so it is difficult to balance the reduction of the loss value and reduction of +constraint violation. Although there are a few methods that solve this class of +problems, all of them are double-loop or triple-loop algorithms, and they +require oracles to solve some subproblems up to certain accuracy by tuning +multiple hyperparameters at each iteration. In this paper, we propose a novel +gradient descent and perturbed ascent (GDPA) algorithm to solve a class of +smooth nonconvex inequality constrained problems. The GDPA is a primal-dual +algorithm, which only exploits the first-order information of both the +objective and constraint functions to update the primal and dual variables in +an alternating way. The key feature of the proposed algorithm is that it is a +single-loop algorithm, where only two step-sizes need to be tuned. We show that +under a mild regularity condition GDPA is able to find Karush-Kuhn-Tucker (KKT) +points of nonconvex functional constrained problems with convergence rate +guarantees. To the best of our knowledge, it is the first single-loop algorithm +that can solve the general nonconvex smooth problems with nonconvex inequality +constraints. Numerical results also showcase the superiority of GDPA compared +with the best-known algorithms (in terms of both stationarity measure and +feasibility of the obtained solutions). + +
+
+ comment: This work was published in the Proceedings of the Thirty-Ninth + International Conference on Machine Learning (ICML 2022) +
+
+
+
+
+ + ♻ ☆ Tackling GenAI Copyright Issues: Originality Estimation and + Genericization + + +
+ The rapid progress of generative AI technology has sparked significant +copyright concerns, leading to numerous lawsuits filed against AI developers. +Notably, generative AI's capacity for generating copyrighted characters has +been well documented in the literature, and while various techniques for +mitigating copyright issues have been studied, significant risks remain. Here, +we propose a genericization method that modifies the outputs of a generative +model to make them more generic and less likely to imitate distinctive features +of copyrighted materials. To achieve this, we introduce a metric for +quantifying the level of originality of data, estimated by drawing samples from +a generative model, and applied in the genericization process. As a practical +implementation, we introduce PREGen (Prompt Rewriting-Enhanced Genericization), +which combines our genericization method with an existing mitigation technique. +Compared to the existing method, PREGen reduces the likelihood of generating +copyrighted characters by more than half when the names of copyrighted +characters are used as the prompt. Additionally, while generative models can +produce copyrighted characters even when their names are not directly mentioned +in the prompt, PREGen almost entirely prevents the generation of such +characters in these cases. + +
+
+ comment: 22 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Learning General Representation of 12-Lead Electrocardiogram with a + Joint-Embedding Predictive Architecture + + +
+ Electrocardiogram (ECG) captures the heart's electrical signals, offering +valuable information for diagnosing cardiac conditions. However, the scarcity +of labeled data makes it challenging to fully leverage supervised learning in +medical domain. Self-supervised learning (SSL) offers a promising solution, +enabling models to learn from unlabeled data and uncover meaningful patterns. +In this paper, we show that masked modeling in the latent space can be a +powerful alternative to existing self-supervised methods in the ECG domain. We +introduce ECG-JEPA, a SSL model for 12-lead ECG analysis that learns semantic +representations of ECG data by predicting in the hidden latent space, bypassing +the need to reconstruct raw signals. This approach offers several advantages in +the ECG domain: (1) it avoids producing unnecessary details, such as noise, +which is common in ECG; and (2) it addresses the limitations of na\"ive L2 loss +between raw signals. Another key contribution is the introduction of +Cross-Pattern Attention (CroPA), a specialized masked attention mechanism +tailored for 12-lead ECG data. ECG-JEPA is trained on the union of several open +ECG datasets, totaling approximately 180,000 samples, and achieves +state-of-the-art performance in various downstream tasks including ECG +classification and feature prediction. Our code is openly available at +https://github.com/sehunfromdaegu/ECG_JEPA. + +
+
+
+
+
+ + ♻ ☆ InstantSwap: Fast Customized Concept Swapping across Sharp Shape + Differences + + +
+ Recent advances in Customized Concept Swapping (CCS) enable a text-to-image +model to swap a concept in the source image with a customized target concept. +However, the existing methods still face the challenges of inconsistency and +inefficiency. They struggle to maintain consistency in both the foreground and +background during concept swapping, especially when the shape difference is +large between objects. Additionally, they either require time-consuming +training processes or involve redundant calculations during inference. To +tackle these issues, we introduce InstantSwap, a new CCS method that aims to +handle sharp shape disparity at speed. Specifically, we first extract the bbox +of the object in the source image automatically based on attention map analysis +and leverage the bbox to achieve both foreground and background consistency. +For background consistency, we remove the gradient outside the bbox during the +swapping process so that the background is free from being modified. For +foreground consistency, we employ a cross-attention mechanism to inject +semantic information into both source and target concepts inside the box. This +helps learn semantic-enhanced representations that encourage the swapping +process to focus on the foreground objects. To improve swapping speed, we avoid +computing gradients at each timestep but instead calculate them periodically to +reduce the number of forward passes, which improves efficiency a lot with a +little sacrifice on performance. Finally, we establish a benchmark dataset to +facilitate comprehensive evaluation. Extensive evaluations demonstrate the +superiority and versatility of InstantSwap. Project Page: +https://instantswap.github.io/ + +
+
+ comment: Project Page: https://instantswap.github.io/. Github Page: + https://github.com/chenyangzhu1/InstantSwap +
+
+
+
+
+ + ♻ ☆ Large Language Model-Brained GUI Agents: A Survey + + +
+ GUIs have long been central to human-computer interaction, providing an +intuitive and visually-driven way to access and interact with digital systems. +The advent of LLMs, particularly multimodal models, has ushered in a new era of +GUI automation. They have demonstrated exceptional capabilities in natural +language understanding, code generation, and visual processing. This has paved +the way for a new generation of LLM-brained GUI agents capable of interpreting +complex GUI elements and autonomously executing actions based on natural +language instructions. These agents represent a paradigm shift, enabling users +to perform intricate, multi-step tasks through simple conversational commands. +Their applications span across web navigation, mobile app interactions, and +desktop automation, offering a transformative user experience that +revolutionizes how individuals interact with software. This emerging field is +rapidly advancing, with significant progress in both research and industry. + To provide a structured understanding of this trend, this paper presents a +comprehensive survey of LLM-brained GUI agents, exploring their historical +evolution, core components, and advanced techniques. We address research +questions such as existing GUI agent frameworks, the collection and utilization +of data for training specialized GUI agents, the development of large action +models tailored for GUI tasks, and the evaluation metrics and benchmarks +necessary to assess their effectiveness. Additionally, we examine emerging +applications powered by these agents. Through a detailed analysis, this survey +identifies key research gaps and outlines a roadmap for future advancements in +the field. By consolidating foundational knowledge and state-of-the-art +developments, this work aims to guide both researchers and practitioners in +overcoming challenges and unlocking the full potential of LLM-brained GUI +agents. + +
+
+ comment: The collection of papers reviewed in this survey will be hosted and + regularly updated on the GitHub repository: + https://github.com/vyokky/LLM-Brained-GUI-Agents-Survey Additionally, a + searchable webpage is available at https://aka.ms/gui-agent for easier access + and exploration +
+
+
+
+
+ + ♻ ☆ Scaling laws for learning with real and surrogate data + + +
+ Collecting large quantities of high-quality data can be prohibitively +expensive or impractical, and a bottleneck in machine learning. One may instead +augment a small set of $n$ data points from the target distribution with data +from more accessible sources, e.g. data collected under different circumstances +or synthesized by generative models. We refer to such data as `surrogate data'. +We study a weighted empirical risk minimization (ERM) approach for integrating +surrogate data into training. We analyze mathematically this method under +several classical statistical models, and validate our findings empirically on +datasets from different domains. Our main findings are: $(i)$ Integrating +surrogate data can significantly reduce the test error on the original +distribution. Surprisingly, this can happen even when the surrogate data is +unrelated to the original ones. We trace back this behavior to the classical +Stein's paradox. $(ii)$ In order to reap the benefit of surrogate data, it is +crucial to use optimally weighted ERM. $(iii)$ The test error of models trained +on mixtures of real and surrogate data is approximately described by a scaling +law. This scaling law can be used to predict the optimal weighting scheme, and +to choose the amount of surrogate data to add. + +
+
+ comment: Added new experiment and minor changes +
+
+
+
+
+ + ♻ ☆ Understanding Representation of Deep Equilibrium Models from Neural + Collapse Perspective + + +
+ Deep Equilibrium Model (DEQ), which serves as a typical implicit neural +network, emphasizes their memory efficiency and competitive performance +compared to explicit neural networks. However, there has been relatively +limited theoretical analysis on the representation of DEQ. In this paper, we +utilize the Neural Collapse ($\mathcal{NC}$) as a tool to systematically +analyze the representation of DEQ under both balanced and imbalanced +conditions. $\mathcal{NC}$ is an interesting phenomenon in the neural network +training process that characterizes the geometry of class features and +classifier weights. While extensively studied in traditional explicit neural +networks, the $\mathcal{NC}$ phenomenon has not received substantial attention +in the context of implicit neural networks. We theoretically show that +$\mathcal{NC}$ exists in DEQ under balanced conditions. Moreover, in imbalanced +settings, despite the presence of minority collapse, DEQ demonstrated +advantages over explicit neural networks. These advantages include the +convergence of extracted features to the vertices of a simplex equiangular +tight frame and self-duality properties under mild conditions, highlighting +DEQ's superiority in handling imbalanced datasets. Finally, we validate our +theoretical analyses through experiments in both balanced and imbalanced +scenarios. + +
+
+
+
+
+ + ♻ ☆ Mediating Modes of Thought: LLM's for design scripting + + +
+ Architects adopt visual scripting and parametric design tools to explore more +expansive design spaces (Coates, 2010), refine their thinking about the +geometric logic of their design (Woodbury, 2010), and overcome conventional +software limitations (Burry, 2011). Despite two decades of effort to make +design scripting more accessible, a disconnect between a designer's free ways +of thinking and the rigidity of algorithms remains (Burry, 2011). Recent +developments in Large Language Models (LLMs) suggest this might soon change, as +LLMs encode a general understanding of human context and exhibit the capacity +to produce geometric logic. This project speculates that if LLMs can +effectively mediate between user intent and algorithms, they become a powerful +tool to make scripting in design more widespread and fun. We explore if such +systems can interpret natural language prompts to assemble geometric operations +relevant to computational design scripting. In the system, multiple layers of +LLM agents are configured with specific context to infer the user intent and +construct a sequential logic. Given a user's high-level text prompt, a +geometric description is created, distilled into a sequence of logic +operations, and mapped to software-specific commands. The completed script is +constructed in the user's visual programming interface. The system succeeds in +generating complete visual scripts up to a certain complexity but fails beyond +this complexity threshold. It shows how LLMs can make design scripting much +more aligned with human creativity and thought. Future research should explore +conversational interactions, expand to multimodal inputs and outputs, and +assess the performance of these tools. + +
+
+ comment: Published at ACADIA 2024 +
+
+
+
+
+ + ♻ ☆ Towards Fair RAG: On the Impact of Fair Ranking in Retrieval-Augmented + Generation NeurIPS 2024 + + +
+ Many language models now enhance their responses with retrieval capabilities, +leading to the widespread adoption of retrieval-augmented generation (RAG) +systems. However, despite retrieval being a core component of RAG, much of the +research in this area overlooks the extensive body of work on fair ranking, +neglecting the importance of considering all stakeholders involved. This paper +presents the first systematic evaluation of RAG systems integrated with fair +rankings. We focus specifically on measuring the fair exposure of each relevant +item across the rankings utilized by RAG systems (i.e., item-side fairness), +aiming to promote equitable growth for relevant item providers. To gain a deep +understanding of the relationship between item-fairness, ranking quality, and +generation quality in the context of RAG, we analyze nine different RAG systems +that incorporate fair rankings across seven distinct datasets. Our findings +indicate that RAG systems with fair rankings can maintain a high level of +generation quality and, in many cases, even outperform traditional RAG systems, +despite the general trend of a tradeoff between ensuring fairness and +maintaining system-effectiveness. We believe our insights lay the groundwork +for responsible and equitable RAG systems and open new avenues for future +research. We publicly release our codebase and dataset at +https://github.com/kimdanny/Fair-RAG. + +
+
+ comment: Top 5 Spotlight at AFME Workshop at NeurIPS 2024 +
+
+
+
+
+
+
+
+ + Genomics 3 + +
+
+
+ + ☆ Single-Cell Omics Arena: A Benchmark Study for Large Language Models on + Cell Type Annotation Using Single-Cell Data + + +
+ Over the past decade, the revolution in single-cell sequencing has enabled +the simultaneous molecular profiling of various modalities across thousands of +individual cells, allowing scientists to investigate the diverse functions of +complex tissues and uncover underlying disease mechanisms. Among all the +analytical steps, assigning individual cells to specific types is fundamental +for understanding cellular heterogeneity. However, this process is usually +labor-intensive and requires extensive expert knowledge. Recent advances in +large language models (LLMs) have demonstrated their ability to efficiently +process and synthesize vast corpora of text to automatically extract essential +biological knowledge, such as marker genes, potentially promoting more +efficient and automated cell type annotations. To thoroughly evaluate the +capability of modern instruction-tuned LLMs in automating the cell type +identification process, we introduce SOAR, a comprehensive benchmarking study +of LLMs for cell type annotation tasks in single-cell genomics. Specifically, +we assess the performance of 8 instruction-tuned LLMs across 11 datasets, +spanning multiple cell types and species. Our study explores the potential of +LLMs to accurately classify and annotate cell types in single-cell RNA +sequencing (scRNA-seq) data, while extending their application to multiomics +data through cross-modality translation. Additionally, we evaluate the +effectiveness of chain-of-thought (CoT) prompting techniques in generating +detailed biological insights during the annotation process. The results +demonstrate that LLMs can provide robust interpretations of single-cell data +without requiring additional fine-tuning, advancing the automation of cell type +annotation in genomics research. + +
+
+
+
+
+ + ☆ iSEEtree: interactive explorer for hierarchical data + + +
+ $\textbf{Motivation:}$ Hierarchical data structures are prevalent across +several fields of research, as they represent an organised and efficient +approach to study complex interconnected systems. Their significance is +particularly evident in microbiome analysis, where microbial communities are +classified at various taxonomic levels along the phylogenetic tree. In light of +this trend, the R/Bioconductor community has established a reproducible +analytical framework for hierarchical data, which relies on the highly generic +and optimised TreeSummarizedExperiment data container. However, using this +framework requires basic proficiency in programming. + $\textbf{Results:}$ To reduce the entry requirements, we developed iSEEtree, +an R shiny app which provides a visual interface for the analysis and +exploration of TreeSummarizedExperiment objects, thereby expanding the +interactive graphics capabilities of related work to hierarchical structures. +This way, users can interactively explore several aspects of their data without +the need for extensive knowledge of R programming. We describe how iSEEtree +enables the exploration of hierarchical multi-table data and demonstrate its +functionality with applications to microbiome analysis. + $\textbf{Availability and Implementation:}$ iSEEtree was implemented in the R +programming language and is available on Bioconductor at +$\href{https://bioconductor.org/packages/iSEEtree}{https\text{:}//bioconductor\text{.}org/packages/iSEEtree}$ +under an Artistic 2.0 license. + $\textbf{Contact:}$ $\href{email}{giulio\text{.}benedetti@utu\text{.}fi}$ or +$\href{email}{leo\text{.}lahti@utu\text{.}fi}$. + +
+
+ comment: 4 pages, 1 figure +
+
+
+
+
+ + ♻ ☆ The influence of chromosomal inversions on genetic variation and clinal + patterns in genomic data of Drosophila melanogaster + + +
+                    Chromosomal inversions are structural mutations resulting in the reversal of
+the gene order along the corresponding genomic region. Due to their influence
+on recombination patterns, they can have a major influence on genetic variation
+and the evolutionary process. Accordingly, inversions can act as supergenes
+that keep together co-adapted gene complexes that form the genetic basis of
+many complex phenotypes in diverse organisms. In this book chapter, I will
+present an analysis pipeline to investigate the influence of two common
+cosmopolitan inversions, In(2L)t and In(3R)Payne, on genome-wide genetic
+variation and differentiation in world-wide populations of the vinegar fly
+Drosophila melanogaster. We will use single-individual and pooled resequencing
+data in combination with population genomics analysis tools to explore the
+impact of these two inversions on genetic variation, population structure, and
+clinal variation in natural populations.
+                    
+
+ comment: book chapter; 47 pages, 9 Figures +
+
+
+
+
+
+
+
+ + Machine Learning 152 + +
+
+
+ + ☆ Scaling BERT Models for Turkish Automatic Punctuation and Capitalization + Correction + + +
+ This paper investigates the effectiveness of BERT based models for automated +punctuation and capitalization corrections in Turkish texts across five +distinct model sizes. The models are designated as Tiny, Mini, Small, Medium, +and Base. The design and capabilities of each model are tailored to address the +specific challenges of the Turkish language, with a focus on optimizing +performance while minimizing computational overhead. The study presents a +systematic comparison of the performance metrics precision, recall, and F1 +score of each model, offering insights into their applicability in diverse +operational contexts. The results demonstrate a significant improvement in text +readability and accuracy as model size increases, with the Base model achieving +the highest correction precision. This research provides a comprehensive guide +for selecting the appropriate model size based on specific user needs and +computational resources, establishing a framework for deploying these models in +real-world applications to enhance the quality of written Turkish. + +
+
+ comment: 2024 Innovations in Intelligent Systems and Applications Conference + (ASYU) +
+
+
+
+
+ + ☆ An ADHD Diagnostic Interface Based on EEG Spectrograms and Deep Learning + Techniques + + +
+ This paper introduces an innovative approach to +Attention-deficit/hyperactivity disorder (ADHD) diagnosis by employing deep +learning (DL) techniques on electroencephalography (EEG) signals. This method +addresses the limitations of current behavior-based diagnostic methods, which +often lead to misdiagnosis and gender bias. By utilizing a publicly available +EEG dataset and converting the signals into spectrograms, a Resnet-18 +convolutional neural network (CNN) architecture was used to extract features +for ADHD classification. The model achieved a high precision, recall, and an +overall F1 score of 0.9. Feature extraction highlighted significant brain +regions (frontopolar, parietal, and occipital lobes) associated with ADHD. +These insights guided the creation of a three-part digital diagnostic system, +facilitating cost-effective and accessible ADHD screening, especially in school +environments. This system enables earlier and more accurate identification of +students at risk for ADHD, providing timely support to enhance their +developmental outcomes. This study showcases the potential of integrating EEG +analysis with DL to enhance ADHD diagnostics, presenting a viable alternative +to traditional methods. + +
+
+ comment: Presented at SIPAIM 2024 +
+
+
+
+
+ + ☆ T-REG: Preference Optimization with Token-Level Reward Regularization + + +
+ Reinforcement learning from human feedback (RLHF) has been crucial in +aligning large language models (LLMs) with human values. Traditionally, RLHF +involves generating responses to a query and using a reward model to assign a +reward to the entire response. However, this approach faces challenges due to +its reliance on a single, sparse reward, which makes it challenging for the +model to identify which parts of the sequence contribute most significantly to +the final reward. Recent methods have attempted to address this limitation by +introducing token-level rewards. However, these methods often rely on either a +trained credit assignment model or AI annotators, raising concerns about the +quality and reliability of the rewards. In this paper, we propose token-level +reward regularization (T-REG), a novel approach that leverages both +sequence-level and token-level rewards for preference optimization. Harnessing +the self-refinement capabilities of LLMs, our method uses contrastive prompting +to enable LLMs to self-generate token-level rewards. These self-generated +rewards then act as reward regularization, guiding the model to more +effectively distribute sequence-level rewards across tokens. This facilitates +better token-level credit assignment and enhances alignment performance. +Experiments on the instruction following benchmarks, including Alpaca Eval 2 +and Arena-Hard, show that our method consistently outperforms baseline methods +by up to 3.8% and 4.4%, respectively. We will release the code and models at +https://github.com/wzhouad/T-REG. + +
+
+
+
+
+ + ☆ The Asymptotic Behavior of Attention in Transformers + + +
+ A key component of transformers is the attention mechanism orchestrating how +each token influences the propagation of every other token through a +transformer. In this paper we provide a rigorous, mathematical analysis of the +asymptotic properties of attention in transformers. Although we present several +results based on different assumptions, all of them point to the same +conclusion, all tokens asymptotically converge to each other, a phenomenon that +has been empirically reported in the literature. Our findings are carefully +compared with existing theoretical results and illustrated by simulations and +experimental studies using the GPT-2 model. + +
+
+
+
+
+ + ☆ Planning-Guided Diffusion Policy Learning for Generalizable Contact-Rich + Bimanual Manipulation + + +
+ Contact-rich bimanual manipulation involves precise coordination of two arms +to change object states through strategically selected contacts and motions. +Due to the inherent complexity of these tasks, acquiring sufficient +demonstration data and training policies that generalize to unseen scenarios +remain a largely unresolved challenge. Building on recent advances in planning +through contacts, we introduce Generalizable Planning-Guided Diffusion Policy +Learning (GLIDE), an approach that effectively learns to solve contact-rich +bimanual manipulation tasks by leveraging model-based motion planners to +generate demonstration data in high-fidelity physics simulation. Through +efficient planning in randomized environments, our approach generates +large-scale and high-quality synthetic motion trajectories for tasks involving +diverse objects and transformations. We then train a task-conditioned diffusion +policy via behavior cloning using these demonstrations. To tackle the +sim-to-real gap, we propose a set of essential design options in feature +extraction, task representation, action prediction, and data augmentation that +enable learning robust prediction of smooth action sequences and generalization +to unseen scenarios. Through experiments in both simulation and the real world, +we demonstrate that our approach can enable a bimanual robotic system to +effectively manipulate objects of diverse geometries, dimensions, and physical +properties. Website: https://glide-manip.github.io/ + +
+
+
+
+
+ + ☆ Mind the Gap: Examining the Self-Improvement Capabilities of Large + Language Models + + +
+ Self-improvement is a mechanism in Large Language Model (LLM) pre-training, +post-training and test-time inference. We explore a framework where the model +verifies its own outputs, filters or reweights data based on this verification, +and distills the filtered data. Despite several empirical successes, a +fundamental understanding is still lacking. In this work, we initiate a +comprehensive, modular and controlled study on LLM self-improvement. We provide +a mathematical formulation for self-improvement, which is largely governed by a +quantity which we formalize as the generation-verification gap. Through +experiments with various model families and tasks, we discover a scaling +phenomenon of self-improvement -- a variant of the generation-verification gap +scales monotonically with the model pre-training flops. We also examine when +self-improvement is possible, an iterative self-improvement procedure, and ways +to improve its performance. Our findings not only advance understanding of LLM +self-improvement with practical implications, but also open numerous avenues +for future research into its capabilities and boundaries. + +
+
+ comment: 41 pages, 19 figures +
+
+
+
+
+ + ☆ Interpretable Generalized Additive Models for Datasets with Missing + Values NeurIPS 2024 + + +
+ Many important datasets contain samples that are missing one or more feature +values. Maintaining the interpretability of machine learning models in the +presence of such missing data is challenging. Singly or multiply imputing +missing values complicates the model's mapping from features to labels. On the +other hand, reasoning on indicator variables that represent missingness +introduces a potentially large number of additional terms, sacrificing +sparsity. We solve these problems with M-GAM, a sparse, generalized, additive +modeling approach that incorporates missingness indicators and their +interaction terms while maintaining sparsity through l0 regularization. We show +that M-GAM provides similar or superior accuracy to prior methods while +significantly improving sparsity relative to either imputation or naive +inclusion of indicator variables. + +
+
+ comment: Published in NeurIPS 2024 +
+
+
+
+
+ + ☆ The Space Complexity of Approximating Logistic Loss + + +
+ We provide space complexity lower bounds for data structures that approximate +logistic loss up to $\epsilon$-relative error on a logistic regression problem +with data $\mathbf{X} \in \mathbb{R}^{n \times d}$ and labels $\mathbf{y} \in +\{-1,1\}^d$. The space complexity of existing coreset constructions depend on a +natural complexity measure $\mu_\mathbf{y}(\mathbf{X})$, first defined in +(Munteanu, 2018). We give an $\tilde{\Omega}(\frac{d}{\epsilon^2})$ space +complexity lower bound in the regime $\mu_\mathbf{y}(\mathbf{X}) = O(1)$ that +shows existing coresets are optimal in this regime up to lower order factors. +We also prove a general $\tilde{\Omega}(d\cdot \mu_\mathbf{y}(\mathbf{X}))$ +space lower bound when $\epsilon$ is constant, showing that the dependency on +$\mu_\mathbf{y}(\mathbf{X})$ is not an artifact of mergeable coresets. Finally, +we refute a prior conjecture that $\mu_\mathbf{y}(\mathbf{X})$ is hard to +compute by providing an efficient linear programming formulation, and we +empirically compare our algorithm to prior approximate methods. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2303.14284 +
+
+
+
+
+ + ☆ Sharp-It: A Multi-view to Multi-view Diffusion Model for 3D Synthesis + and Manipulation + + +
+ Advancements in text-to-image diffusion models have led to significant +progress in fast 3D content creation. One common approach is to generate a set +of multi-view images of an object, and then reconstruct it into a 3D model. +However, this approach bypasses the use of a native 3D representation of the +object and is hence prone to geometric artifacts and limited in controllability +and manipulation capabilities. An alternative approach involves native 3D +generative models that directly produce 3D representations. These models, +however, are typically limited in their resolution, resulting in lower quality +3D objects. In this work, we bridge the quality gap between methods that +directly generate 3D representations and ones that reconstruct 3D objects from +multi-view images. We introduce a multi-view to multi-view diffusion model +called Sharp-It, which takes a 3D consistent set of multi-view images rendered +from a low-quality object and enriches its geometric details and texture. The +diffusion model operates on the multi-view set in parallel, in the sense that +it shares features across the generated views. A high-quality 3D model can then +be reconstructed from the enriched multi-view set. By leveraging the advantages +of both 2D and 3D approaches, our method offers an efficient and controllable +method for high-quality 3D content creation. We demonstrate that Sharp-It +enables various 3D applications, such as fast synthesis, editing, and +controlled generation, while attaining high-quality assets. + +
+
+ comment: Project page at https://yiftachede.github.io/Sharp-It/ +
+
+
+
+
+ + ☆ The effect of priors on Learning with Restricted Boltzmann Machines + + +
+ Restricted Boltzmann Machines (RBMs) are generative models designed to learn +from data with a rich underlying structure. In this work, we explore a +teacher-student setting where a student RBM learns from examples generated by a +teacher RBM, with a focus on the effect of the unit priors on learning +efficiency. We consider a parametric class of priors that interpolate between +continuous (Gaussian) and binary variables. This approach models various +possible choices of visible units, hidden units, and weights for both the +teacher and student RBMs. + By analyzing the phase diagram of the posterior distribution in both the +Bayes optimal and mismatched regimes, we demonstrate the existence of a triple +point that defines the critical dataset size necessary for learning through +generalization. The critical size is strongly influenced by the properties of +the teacher, and thus the data, but is unaffected by the properties of the +student RBM. Nevertheless, a prudent choice of student priors can facilitate +training by expanding the so-called signal retrieval region, where the machine +generalizes effectively. + +
+
+
+
+
+ + ☆ Medical Multimodal Foundation Models in Clinical Diagnosis and + Treatment: Applications, Challenges, and Future Directions + + +
+ Recent advancements in deep learning have significantly revolutionized the +field of clinical diagnosis and treatment, offering novel approaches to improve +diagnostic precision and treatment efficacy across diverse clinical domains, +thus driving the pursuit of precision medicine. The growing availability of +multi-organ and multimodal datasets has accelerated the development of +large-scale Medical Multimodal Foundation Models (MMFMs). These models, known +for their strong generalization capabilities and rich representational power, +are increasingly being adapted to address a wide range of clinical tasks, from +early diagnosis to personalized treatment strategies. This review offers a +comprehensive analysis of recent developments in MMFMs, focusing on three key +aspects: datasets, model architectures, and clinical applications. We also +explore the challenges and opportunities in optimizing multimodal +representations and discuss how these advancements are shaping the future of +healthcare by enabling improved patient outcomes and more efficient clinical +workflows. + +
+
+
+
+
+ + ☆ Improving Dynamic Object Interactions in Text-to-Video Generation with + AI Feedback + + +
+ Large text-to-video models hold immense potential for a wide range of +downstream applications. However, these models struggle to accurately depict +dynamic object interactions, often resulting in unrealistic movements and +frequent violations of real-world physics. One solution inspired by large +language models is to align generated outputs with desired outcomes using +external feedback. This enables the model to refine its responses autonomously, +eliminating extensive manual data collection. In this work, we investigate the +use of feedback to enhance the object dynamics in text-to-video models. We aim +to answer a critical question: what types of feedback, paired with which +specific self-improvement algorithms, can most effectively improve text-video +alignment and realistic object interactions? We begin by deriving a unified +probabilistic objective for offline RL finetuning of text-to-video models. This +perspective highlights how design elements in existing algorithms like KL +regularization and policy projection emerge as specific choices within a +unified framework. We then use derived methods to optimize a set of text-video +alignment metrics (e.g., CLIP scores, optical flow), but notice that they often +fail to align with human perceptions of generation quality. To address this +limitation, we propose leveraging vision-language models to provide more +nuanced feedback specifically tailored to object dynamics in videos. Our +experiments demonstrate that our method can effectively optimize a wide variety +of rewards, with binary AI feedback driving the most significant improvements +in video quality for dynamic interactions, as confirmed by both AI and human +evaluations. Notably, we observe substantial gains when using reward signals +derived from AI feedback, particularly in scenarios involving complex +interactions between multiple objects and realistic depictions of objects +falling. + +
+
+ comment: Website: https://sites.google.com/view/aif-dynamic-t2v/ +
+
+
+
+
+ + ☆ Wasserstein Markets for Differentially-Private Data + + +
+ Data is an increasingly vital component of decision making processes across +industries. However, data access raises privacy concerns motivating the need +for privacy-preserving techniques such as differential privacy. Data markets +provide a means to enable wider access as well as determine the appropriate +privacy-utility trade-off. Existing data market frameworks either require a +trusted third party to perform computationally expensive valuations or are +unable to capture the combinatorial nature of data value and do not +endogenously model the effect of differential privacy. This paper addresses +these shortcomings by proposing a valuation mechanism based on the Wasserstein +distance for differentially-private data, and corresponding procurement +mechanisms by leveraging incentive mechanism design theory, for task-agnostic +data procurement, and task-specific procurement co-optimisation. The mechanisms +are reformulated into tractable mixed-integer second-order cone programs, which +are validated with numerical studies. + +
+
+ comment: 35 pages, 15 figures +
+
+
+
+
+ + ☆ Interpretable Company Similarity with Sparse Autoencoders + + +
+ Determining company similarity is a vital task in finance, underpinning +hedging, risk management, portfolio diversification, and more. Practitioners +often rely on sector and industry classifications to gauge similarity, such as +SIC-codes and GICS-codes, the former being used by the U.S. Securities and +Exchange Commission (SEC), and the latter widely used by the investment +community. Clustering embeddings of company descriptions has been proposed as a +potential technique for determining company similarity, but the lack of +interpretability in token embeddings poses a significant barrier to adoption in +high-stakes contexts. Sparse Autoencoders have shown promise in enhancing the +interpretability of Large Language Models by decomposing LLM activations into +interpretable features. In this paper, we explore the use of SAE features in +measuring company similarity and benchmark them against (1) SIC codes and (2) +Major Group codes. We conclude that SAE features can reproduce and even surpass +sector classifications in quantifying fundamental characteristics of companies, +evaluated by the correlation of monthly returns, a proxy for similarity, and +PnL from cointegration. + +
+
+
+
+
+ + ☆ CEGI: Measuring the trade-off between efficiency and carbon emissions + for SLMs and VLMs + + +
+                    This paper analyzes the performance of Small Language Models (SLMs) and
+Vision Language Models (VLMs) and evaluates the trade-off between model
+performance and carbon emissions across 4 essential tasks: Image Captioning,
+Visual Question Answering (VQA), Dialogue Summarization and Text-to-SQL
+conversion. Various SLMs and VLMs belonging to the Qwen and LLaMA architecture
+family are chosen and variants based on model size in terms of the number of
+parameters, quantization level and fine-tuning parameters are evaluated. The
+model variant's performance and carbon emissions are calculated. To quantify
+the trade-off between model performance and carbon emissions, we introduce a
+novel metric called CEGI (Carbon Efficient Gain Index). This metric represents
+the carbon emission per unit percentage gain per million trainable parameters.
+This metric provides a normalized measure to compare models' efficiency in
+terms of performance improvement relative to their environmental cost. The
+experiment's outcome demonstrates that fine-tuning SLMs and VLMs can achieve
+performance levels comparable to Large Language Models (LLMs) while producing
+significantly less carbon emissions. Our findings suggest that the marginal
+gains in accuracy from larger models do not justify the substantial increase in
+carbon emissions. Leveraging lower-bit quantization levels, the proposed metric
+further enhances energy efficiency without compromising performance. This study
+highlights the balance between high performance and environmental
+sustainability. It offers a valuable metric for selecting models suitable for
+environmentally-friendly AI development.
+                    
+
+
+
+
+ + ☆ Class-wise Autoencoders Measure Classification Difficulty And Detect + Label Mistakes + + +
+ We introduce a new framework for analyzing classification datasets based on +the ratios of reconstruction errors between autoencoders trained on individual +classes. This analysis framework enables efficient characterization of datasets +on the sample, class, and entire dataset levels. We define reconstruction error +ratios (RERs) that probe classification difficulty and allow its decomposition +into (1) finite sample size and (2) Bayes error and decision-boundary +complexity. Through systematic study across 19 popular visual datasets, we find +that our RER-based dataset difficulty probe strongly correlates with error rate +for state-of-the-art (SOTA) classification models. By interpreting sample-level +classification difficulty as a label mistakenness score, we further find that +RERs achieve SOTA performance on mislabel detection tasks on hard datasets +under symmetric and asymmetric label noise. Our code is publicly available at +https://github.com/voxel51/reconstruction-error-ratios. + +
+
+ comment: 30 pages, 18 figures +
+
+
+
+
+ + ☆ Private Linear Regression with Differential Privacy and PAC Privacy + + +
+ Linear regression is a fundamental tool for statistical analysis, which has +motivated the development of linear regression methods that satisfy provable +privacy guarantees so that the learned model reveals little about any one data +point used to construct it. Most existing privacy-preserving linear regression +methods rely on the well-established framework of differential privacy, while +the newly proposed PAC Privacy has not yet been explored in this context. In +this paper, we systematically compare linear regression models trained with +differential privacy and PAC privacy across three real-world datasets, +observing several key findings that impact the performance of +privacy-preserving linear regression. + +
+
+ comment: 8 pages, 6 figures +
+
+
+
+
+ + ☆ TAB-Fields: A Maximum Entropy Framework for Mission-Aware Adversarial + Planning + + +
+ Autonomous agents operating in adversarial scenarios face a fundamental +challenge: while they may know their adversaries' high-level objectives, such +as reaching specific destinations within time constraints, the exact policies +these adversaries will employ remain unknown. Traditional approaches address +this challenge by treating the adversary's state as a partially observable +element, leading to a formulation as a Partially Observable Markov Decision +Process (POMDP). However, the induced belief-space dynamics in a POMDP require +knowledge of the system's transition dynamics, which, in this case, depend on +the adversary's unknown policy. Our key observation is that while an +adversary's exact policy is unknown, their behavior is necessarily constrained +by their mission objectives and the physical environment, allowing us to +characterize the space of possible behaviors without assuming specific +policies. In this paper, we develop Task-Aware Behavior Fields (TAB-Fields), a +representation that captures adversary state distributions over time by +computing the most unbiased probability distribution consistent with known +constraints. We construct TAB-Fields by solving a constrained optimization +problem that minimizes additional assumptions about adversary behavior beyond +mission and environmental requirements. We integrate TAB-Fields with standard +planning algorithms by introducing TAB-conditioned POMCP, an adaptation of +Partially Observable Monte Carlo Planning. Through experiments in simulation +with underwater robots and hardware implementations with ground robots, we +demonstrate that our approach achieves superior performance compared to +baselines that either assume specific adversary policies or neglect mission +constraints altogether. Evaluation videos and code are available at +https://tab-fields.github.io. + +
+
+
+
+
+ + ☆ Plug-and-Play Half-Quadratic Splitting for Ptychography + + +
+ Ptychography is a coherent diffraction imaging method that uses phase +retrieval techniques to reconstruct complex-valued images. It achieves this by +sequentially illuminating overlapping regions of a sample with a coherent beam +and recording the diffraction pattern. Although this addresses traditional +imaging system challenges, it is computationally intensive and highly sensitive +to noise, especially with reduced illumination overlap. Data-driven +regularisation techniques have been applied in phase retrieval to improve +reconstruction quality. In particular, plug-and-play (PnP) offers flexibility +by integrating data-driven denoisers as implicit priors. In this work, we +propose a half-quadratic splitting framework for using PnP and other +data-driven priors for ptychography. We evaluate our method both on natural +images and real test objects to validate its effectiveness for ptychographic +image reconstruction. + +
+
+
+
+
+ + ☆ Fractional Order Distributed Optimization + + +
+ Distributed optimization is fundamental to modern machine learning +applications like federated learning, but existing methods often struggle with +ill-conditioned problems and face stability-versus-speed tradeoffs. We +introduce fractional order distributed optimization (FrODO); a +theoretically-grounded framework that incorporates fractional-order memory +terms to enhance convergence properties in challenging optimization landscapes. +Our approach achieves provable linear convergence for any strongly connected +network. Through empirical validation, our results suggest that FrODO achieves +up to 4 times faster convergence versus baselines on ill-conditioned problems +and 2-3 times speedup in federated neural network training, while maintaining +stability and theoretical guarantees. + +
+
+
+
+
+ + ☆ Unveiling Concept Attribution in Diffusion Models + + +
+ Diffusion models have shown remarkable abilities in generating realistic and +high-quality images from text prompts. However, a trained model remains +black-box; little do we know about the role of its components in exhibiting a +concept such as objects or styles. Recent works employ causal tracing to +localize layers storing knowledge in generative models without showing how +those layers contribute to the target concept. In this work, we approach the +model interpretability problem from a more general perspective and pose a +question: \textit{``How do model components work jointly to demonstrate +knowledge?''}. We adapt component attribution to decompose diffusion models, +unveiling how a component contributes to a concept. Our framework allows +effective model editing, in particular, we can erase a concept from diffusion +models by removing positive components while remaining knowledge of other +concepts. Surprisingly, we also show there exist components that contribute +negatively to a concept, which has not been discovered in the knowledge +localization approach. Experimental results confirm the role of positive and +negative components pinpointed by our framework, depicting a complete view of +interpreting generative models. Our code is available at +\url{https://github.com/mail-research/CAD-attribution4diffusion} + +
+
+
+
+
+ + ☆ On the Privacy, Security, and Trustworthy for Distributed Wireless Large + AI Model (WLAM) + + +
+                    Combining wireless communication with large artificial intelligence (AI)
+models can open up a myriad of novel application scenarios. In sixth generation
+(6G) networks, ubiquitous communication and computing resources allow large AI
+models to serve democratized large AI model-related services to enable real-time
+applications like autonomous vehicles, smart cities, and Internet of Things
+(IoT) ecosystems. However, the security considerations and sustainable
+communication resources limit the deployment of large AI models over
+distributed wireless networks. This paper provides a comprehensive overview of
+privacy, security, and trustworthiness for distributed wireless large AI model
+(WLAM). In particular, a detailed privacy and security analysis for
+distributed WLAM is first presented. The classifications and theoretical findings
+about privacy and security in distributed WLAM are discussed. Then the
+trustworthiness and ethics for implementing distributed WLAM are described.
+Finally, the comprehensive applications of distributed WLAM are provided in the
+aspect of electromagnetic signal processing.
+                    
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ☆ Defending Against Diverse Attacks in Federated Learning Through + Consensus-Based Bi-Level Optimization + + +
+ Adversarial attacks pose significant challenges in many machine learning +applications, particularly in the setting of distributed training and federated +learning, where malicious agents seek to corrupt the training process with the +goal of jeopardizing and compromising the performance and reliability of the +final models. In this paper, we address the problem of robust federated +learning in the presence of such attacks by formulating the training task as a +bi-level optimization problem. We conduct a theoretical analysis of the +resilience of consensus-based bi-level optimization (CB$^2$O), an interacting +multi-particle metaheuristic optimization method, in adversarial settings. +Specifically, we provide a global convergence analysis of CB$^2$O in mean-field +law in the presence of malicious agents, demonstrating the robustness of +CB$^2$O against a diverse range of attacks. Thereby, we offer insights into how +specific hyperparameter choices enable to mitigate adversarial effects. On the +practical side, we extend CB$^2$O to the clustered federated learning setting +by proposing FedCB$^2$O, a novel interacting multi-particle system, and design +a practical algorithm that addresses the demands of real-world applications. +Extensive experiments demonstrate the robustness of the FedCB$^2$O algorithm +against label-flipping attacks in decentralized clustered federated learning +scenarios, showcasing its effectiveness in practical contexts. + +
+
+
+
+
+ + ☆ Active learning of neural population dynamics using two-photon + holographic optogenetics NeurIPS 2024 + + +
+ Recent advances in techniques for monitoring and perturbing neural +populations have greatly enhanced our ability to study circuits in the brain. +In particular, two-photon holographic optogenetics now enables precise +photostimulation of experimenter-specified groups of individual neurons, while +simultaneous two-photon calcium imaging enables the measurement of ongoing and +induced activity across the neural population. Despite the enormous space of +potential photostimulation patterns and the time-consuming nature of +photostimulation experiments, very little algorithmic work has been done to +determine the most effective photostimulation patterns for identifying the +neural population dynamics. Here, we develop methods to efficiently select +which neurons to stimulate such that the resulting neural responses will best +inform a dynamical model of the neural population activity. Using neural +population responses to photostimulation in mouse motor cortex, we demonstrate +the efficacy of a low-rank linear dynamical systems model, and develop an +active learning procedure which takes advantage of low-rank structure to +determine informative photostimulation patterns. We demonstrate our approach on +both real and synthetic data, obtaining in some cases as much as a two-fold +reduction in the amount of data required to reach a given predictive power. Our +active stimulation design method is based on a novel active learning procedure +for low-rank regression, which may be of independent interest. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ☆ LLMForecaster: Improving Seasonal Event Forecasts with Unstructured + Textual Data NeurIPS + + +
+ Modern time-series forecasting models often fail to make full use of rich
+unstructured information about the time series themselves. This lack of proper
+conditioning can lead to obvious model failures; for example, models may be
+unaware of the details of a particular product, and hence fail to anticipate
+seasonal surges in customer demand in the lead up to major exogenous events
+like holidays for clearly relevant products. To address this shortcoming, this
+paper introduces a novel forecast post-processor -- which we call LLMForecaster
+-- that fine-tunes large language models (LLMs) to incorporate unstructured
+semantic and contextual information and historical data to improve the
+forecasts from an existing demand forecasting pipeline. In an industry-scale
+retail application, we demonstrate that our technique yields statistically
+significant forecast improvements across several sets of products subject to
+holiday-driven demand surges.
+
+
+ comment: Presented at NeurIPS Time Series in the Age of Large Models (2024) +
+
+
+
+
+ + ☆ Cooperative Cruising: Reinforcement Learning based Time-Headway Control + for Increased Traffic Efficiency + + +
+ The proliferation of Connected Automated Vehicles represents an unprecedented +opportunity for improving driving efficiency and alleviating traffic +congestion. However, existing research fails to address realistic multi-lane +highway scenarios without assuming connectivity, perception, and control +capabilities that are typically unavailable in current vehicles. This paper +proposes a novel AI system that is the first to improve highway traffic +efficiency compared with human-like traffic in realistic, simulated multi-lane +scenarios, while relying on existing connectivity, perception, and control +capabilities. At the core of our approach is a reinforcement learning based +controller that dynamically communicates time-headways to automated vehicles +near bottlenecks based on real-time traffic conditions. These desired +time-headways are then used by Adaptive Cruise Control (ACC) systems to adjust +their following distance. By (i) integrating existing traffic estimation +technology and low-bandwidth vehicle-to-infrastructure connectivity, (ii) +leveraging safety-certified ACC systems, and (iii) targeting localized +bottleneck challenges that can be addressed independently in different +locations, we propose a practical, safe, and scalable system that can +positively impact numerous road users. + +
+
+
+
+
+ + CA-MoE: Channel-Adapted MoE for Incremental Weather Forecasting + + +
+ Atmospheric science is intricately connected with other fields, e.g., +geography and aerospace. Most existing approaches involve training a joint +atmospheric and geographic model from scratch, which incurs significant +computational costs and overlooks the potential for incremental learning of +weather variables across different domains. In this paper, we introduce +incremental learning to weather forecasting and propose a novel structure that +allows for the flexible expansion of variables within the model. Specifically, +our method presents a Channel-Adapted MoE (CA-MoE) that employs a +divide-and-conquer strategy. This strategy assigns variable training tasks to +different experts by index embedding and reduces computational complexity +through a channel-wise Top-K strategy. Experiments conducted on the widely +utilized ERA5 dataset reveal that our method, utilizing only approximately 15\% +of trainable parameters during the incremental stage, attains performance that +is on par with state-of-the-art competitors. Notably, in the context of +variable incremental experiments, our method demonstrates negligible issues +with catastrophic forgetting. + +
+
+
+
+
+ + ☆ The Cost of Consistency: Submodular Maximization with Constant Recourse + + +
+ In this work, we study online submodular maximization, and how the +requirement of maintaining a stable solution impacts the approximation. In +particular, we seek bounds on the best-possible approximation ratio that is +attainable when the algorithm is allowed to make at most a constant number of +updates per step. We show a tight information-theoretic bound of $\tfrac{2}{3}$ +for general monotone submodular functions, and an improved (also tight) bound +of $\tfrac{3}{4}$ for coverage functions. Since both these bounds are attained +by non poly-time algorithms, we also give a poly-time randomized algorithm that +achieves a $0.51$-approximation. Combined with an information-theoretic +hardness of $\tfrac{1}{2}$ for deterministic algorithms from prior work, our +work thus shows a separation between deterministic and randomized algorithms, +both information theoretically and for poly-time algorithms. + +
+
+
+
+
+ + ☆ Vector Optimization with Gaussian Process Bandits + + +
+ Learning problems in which multiple conflicting objectives must be considered +simultaneously often arise in various fields, including engineering, drug +design, and environmental management. Traditional methods for dealing with +multiple black-box objective functions, such as scalarization and +identification of the Pareto set under the componentwise order, have +limitations in incorporating objective preferences and exploring the solution +space accordingly. While vector optimization offers improved flexibility and +adaptability via specifying partial orders based on ordering cones, current +techniques designed for sequential experiments either suffer from high sample +complexity or lack theoretical guarantees. To address these issues, we propose +Vector Optimization with Gaussian Process (VOGP), a probably approximately +correct adaptive elimination algorithm that performs black-box vector +optimization using Gaussian process bandits. VOGP allows users to convey +objective preferences through ordering cones while performing efficient +sampling by exploiting the smoothness of the objective function, resulting in a +more effective optimization process that requires fewer evaluations. We +establish theoretical guarantees for VOGP and derive information gain-based and +kernel-specific sample complexity bounds. We also conduct experiments on both +real-world and synthetic datasets to compare VOGP with the state-of-the-art +methods. + +
+
+
+
+
+ + ☆ What should a neuron aim for? Designing local objective functions based + on information theory + + +
+ In modern deep neural networks, the learning dynamics of the individual +neurons is often obscure, as the networks are trained via global optimization. +Conversely, biological systems build on self-organized, local learning, +achieving robustness and efficiency with limited global information. We here +show how self-organization between individual artificial neurons can be +achieved by designing abstract bio-inspired local learning goals. These goals +are parameterized using a recent extension of information theory, Partial +Information Decomposition (PID), which decomposes the information that a set of +information sources holds about an outcome into unique, redundant and +synergistic contributions. Our framework enables neurons to locally shape the +integration of information from various input classes, i.e. feedforward, +feedback, and lateral, by selecting which of the three inputs should contribute +uniquely, redundantly or synergistically to the output. This selection is +expressed as a weighted sum of PID terms, which, for a given problem, can be +directly derived from intuitive reasoning or via numerical optimization, +offering a window into understanding task-relevant local information +processing. Achieving neuron-level interpretability while enabling strong +performance using local learning, our work advances a principled +information-theoretic foundation for local learning strategies. + +
+
+ comment: 24 pages, 11 figures +
+
+
+
+
+ + ☆ OODFace: Benchmarking Robustness of Face Recognition under Common + Corruptions and Appearance Variations + + +
+ With the rise of deep learning, facial recognition technology has seen +extensive research and rapid development. Although facial recognition is +considered a mature technology, we find that existing open-source models and +commercial algorithms lack robustness in certain real-world Out-of-Distribution +(OOD) scenarios, raising concerns about the reliability of these systems. In +this paper, we introduce OODFace, which explores the OOD challenges faced by +facial recognition models from two perspectives: common corruptions and +appearance variations. We systematically design 30 OOD scenarios across 9 major +categories tailored for facial recognition. By simulating these challenges on +public datasets, we establish three robustness benchmarks: LFW-C/V, CFP-FP-C/V, +and YTF-C/V. We then conduct extensive experiments on 19 different facial +recognition models and 3 commercial APIs, along with extended experiments on +face masks, Vision-Language Models (VLMs), and defense strategies to assess +their robustness. Based on the results, we draw several key insights, +highlighting the vulnerability of facial recognition systems to OOD data and +suggesting possible solutions. Additionally, we offer a unified toolkit that +includes all corruption and variation types, easily extendable to other +datasets. We hope that our benchmarks and findings can provide guidance for +future improvements in facial recognition model robustness. + +
+
+
+
+
+ + ☆ COMET:Combined Matrix for Elucidating Targets + + +
+ Identifying the interaction targets of bioactive compounds is a foundational +element for deciphering their pharmacological effects. Target prediction +algorithms equip researchers with an effective tool to rapidly scope and +explore potential targets. Here, we introduce the COMET, a multi-technological +modular target prediction tool that provides comprehensive predictive insights, +including similar active compounds, three-dimensional predicted binding modes, +and probability scores, all within an average processing time of less than 10 +minutes per task. With meticulously curated data, the COMET database +encompasses 990,944 drug-target interaction pairs and 45,035 binding pockets, +enabling predictions for 2,685 targets, which span confirmed and exploratory +therapeutic targets for human diseases. In comparative testing using datasets +from ChEMBL and BindingDB, COMET outperformed five other well-known algorithms, +offering nearly an 80% probability of accurately identifying at least one true +target within the top 15 predictions for a given compound. COMET also features +a user-friendly web server, accessible freely at +https://www.pdbbind-plus.org.cn/comet. + +
+
+
+
+
+ + ☆ DP-2Stage: Adapting Language Models as Differentially Private Tabular + Data Generators + + +
+ Generating tabular data under differential privacy (DP) protection ensures
+theoretical privacy guarantees but poses challenges for training machine
+learning models, primarily due to the need to capture complex structures under
+noisy supervision signals. Recently, pre-trained Large Language Models (LLMs)
+-- even those at the scale of GPT-2 -- have demonstrated great potential in
+synthesizing tabular data. However, their applications under DP constraints
+remain largely unexplored. In this work, we address this gap by applying DP
+techniques to the generation of synthetic tabular data. Our findings show that
+LLMs face difficulties in generating coherent text when fine-tuned with DP, as
+privacy budgets are inefficiently allocated to non-private elements like table
+structures. To overcome this, we propose \ours, a two-stage fine-tuning
+framework for differentially private tabular data generation. The first stage
+involves non-private fine-tuning on a pseudo dataset, followed by DP
+fine-tuning on a private dataset. Our empirical results show that this approach
+improves performance across various settings and metrics compared to directly
+fine-tuned LLMs in DP contexts. We release our code and setup at
+https://github.com/tejuafonja/DP-2Stage.
+
+
+
+
+
+ + ☆ BYE: Build Your Encoder with One Sequence of Exploration Data for + Long-Term Dynamic Scene Understanding + + +
+ Dynamic scene understanding remains a persistent challenge in robotic +applications. Early dynamic mapping methods focused on mitigating the negative +influence of short-term dynamic objects on camera motion estimation by masking +or tracking specific categories, which often fall short in adapting to +long-term scene changes. Recent efforts address object association in long-term +dynamic environments using neural networks trained on synthetic datasets, but +they still rely on predefined object shapes and categories. Other methods +incorporate visual, geometric, or semantic heuristics for the association but +often lack robustness. In this work, we introduce BYE, a class-agnostic, +per-scene point cloud encoder that removes the need for predefined categories, +shape priors, or extensive association datasets. Trained on only a single +sequence of exploration data, BYE can efficiently perform object association in +dynamically changing scenes. We further propose an ensembling scheme combining +the semantic strengths of Vision Language Models (VLMs) with the scene-specific +expertise of BYE, achieving a 7% improvement and a 95% success rate in object +association tasks. Code and dataset are available at +https://byencoder.github.io. + +
+
+
+
+
+ + ☆ Artificial Expert Intelligence through PAC-reasoning + + +
+ Artificial Expert Intelligence (AEI) seeks to transcend the limitations of +both Artificial General Intelligence (AGI) and narrow AI by integrating +domain-specific expertise with critical, precise reasoning capabilities akin to +those of top human experts. Existing AI systems often excel at predefined tasks +but struggle with adaptability and precision in novel problem-solving. To +overcome this, AEI introduces a framework for ``Probably Approximately Correct +(PAC) Reasoning". This paradigm provides robust theoretical guarantees for +reliably decomposing complex problems, with a practical mechanism for +controlling reasoning precision. In reference to the division of human thought +into System 1 for intuitive thinking and System 2 for reflective +reasoning~\citep{tversky1974judgment}, we refer to this new type of reasoning +as System 3 for precise reasoning, inspired by the rigor of the scientific +method. AEI thus establishes a foundation for error-bounded, inference-time +learning. + +
+
+
+
+
+ + ☆ Nature versus nurture in galaxy formation: the effect of environment on + star formation with causal machine learning + + +
+ Understanding how galaxies form and evolve is at the heart of modern +astronomy. With the advent of large-scale surveys and simulations, remarkable +progress has been made in the last few decades. Despite this, the physical +processes behind the phenomena, and particularly their importance, remain far +from known, as correlations have primarily been established rather than the +underlying causality. We address this challenge by applying the causal +inference framework. Specifically, we tackle the fundamental open question of +whether galaxy formation and evolution depends more on nature (i.e., internal +processes) or nurture (i.e., external processes), by estimating the causal +effect of environment on star-formation rate in the IllustrisTNG simulations. +To do so, we develop a comprehensive causal model and employ cutting-edge +techniques from epidemiology to overcome the long-standing problem of +disentangling nature and nurture. We find that the causal effect is negative +and substantial, with environment suppressing the SFR by a maximal factor of +$\sim100$. While the overall effect at $z=0$ is negative, in the early +universe, environment is discovered to have a positive impact, boosting star +formation by a factor of $\sim10$ at $z\sim1$ and by even greater amounts at +higher redshifts. Furthermore, we show that: (i) nature also plays an important +role, as ignoring it underestimates the causal effect in intermediate-density +environments by a factor of $\sim2$, (ii) controlling for the stellar mass at a +snapshot in time, as is common in the literature, is not only insufficient to +disentangle nature and nurture but actually has an adverse effect, though (iii) +stellar mass is an adequate proxy of the effects of nature. Finally, this work +may prove a useful blueprint for extracting causal insights in other fields +that deal with dynamical systems with closed feedback loops, such as the +Earth's climate. + +
+
+ comment: 16 pages, 4 figures +
+
+
+
+
+ + ☆ Improved Localized Machine Unlearning Through the Lens of Memorization + + +
+ Machine unlearning refers to removing the influence of a specified subset of +training data from a machine learning model, efficiently, after it has already +been trained. This is important for key applications, including making the +model more accurate by removing outdated, mislabeled, or poisoned data. In this +work, we study localized unlearning, where the unlearning algorithm operates on +a (small) identified subset of parameters. Drawing inspiration from the +memorization literature, we propose an improved localization strategy that +yields strong results when paired with existing unlearning algorithms. We also +propose a new unlearning algorithm, Deletion by Example Localization (DEL), +that resets the parameters deemed-to-be most critical according to our +localization strategy, and then finetunes them. Our extensive experiments on +different datasets, forget sets and metrics reveal that DEL sets a new +state-of-the-art for unlearning metrics, against both localized and +full-parameter methods, while modifying a small subset of parameters, and +outperforms the state-of-the-art localized unlearning in terms of test accuracy +too. + +
+
+
+
+
+ + ☆ Transformer-based Koopman Autoencoder for Linearizing Fisher's Equation + + +
+ A Transformer-based Koopman autoencoder is proposed for linearizing Fisher's
+reaction-diffusion equation. The primary focus of this study is on using deep
+learning techniques to find complex spatiotemporal patterns in the
+reaction-diffusion system. The emphasis is on not just solving the equation but
+also transforming the system's dynamics into a more comprehensible, linear
+form. Global coordinate transformations are achieved through the autoencoder,
+which learns to capture the underlying dynamics by training on a dataset with
+60,000 initial conditions. Extensive testing on multiple datasets was used to
+assess the efficacy of the proposed model, demonstrating its ability to
+accurately predict the system's evolution as well as to generalize. We provide
+a thorough comparison study, comparing our suggested design to a few other
+comparable methods using experiments on various PDEs, such as the
+Kuramoto-Sivashinsky equation and Burgers' equation. Results show improved
+accuracy, highlighting the capabilities of the Transformer-based Koopman
+autoencoder. The proposed architecture is significantly ahead of other
+architectures, in terms of solving different types of PDEs using a single
+architecture. Our method relies entirely on the data, without requiring any
+knowledge of the underlying equations. This makes it applicable to even the
+datasets where the governing equations are not known.
+
+
+
+
+
+ + ☆ Time-Series-Informed Closed-loop Learning for Sequential Decision Making + and Control + + +
+ Closed-loop performance of sequential decision making algorithms, such as +model predictive control, depends strongly on the parameters of cost functions, +models, and constraints. Bayesian optimization is a common approach to learning +these parameters based on closed-loop experiments. However, traditional +Bayesian optimization approaches treat the learning problem as a black box, +ignoring valuable information and knowledge about the structure of the +underlying problem, resulting in slow convergence and high experimental +resource use. We propose a time-series-informed optimization framework that +incorporates intermediate performance evaluations from early iterations of each +experimental episode into the learning procedure. Additionally, probabilistic +early stopping criteria are proposed to terminate unpromising experiments, +significantly reducing experimental time. Simulation results show that our +approach achieves baseline performance with approximately half the resources. +Moreover, with the same resource budget, our approach outperforms the baseline +in terms of final closed-loop performance, highlighting its efficiency in +sequential decision making scenarios. + +
+
+ comment: 12 pages, 3 figures, submitted to L4DC 2025 +
+
+
+
+
+ + ☆ VISTA: A Panoramic View of Neural Representations + + +
+ We present VISTA (Visualization of Internal States and Their Associations), a +novel pipeline for visually exploring and interpreting neural network +representations. VISTA addresses the challenge of analyzing vast +multidimensional spaces in modern machine learning models by mapping +representations into a semantic 2D space. The resulting collages visually +reveal patterns and relationships within internal representations. We +demonstrate VISTA's utility by applying it to sparse autoencoder latents +uncovering new properties and interpretations. We review the VISTA methodology, +present findings from our case study ( https://got.drib.net/latents/ ), and +discuss implications for neural network interpretability across various domains +of machine learning. + +
+
+
+
+
+ + ☆ Leveraging Ensemble-Based Semi-Supervised Learning for Illicit Account + Detection in Ethereum DeFi Transactions + + +
+ The advent of smart contracts has enabled the rapid rise of Decentralized +Finance (DeFi) on the Ethereum blockchain, offering substantial rewards in +financial innovation and inclusivity. However, this growth has also introduced +significant security risks, including the proliferation of illicit accounts +involved in fraudulent activities. Traditional detection methods are limited by +the scarcity of labeled data and the evolving tactics of malicious actors. In +this paper, we propose a novel Self-Learning Ensemble-based Illicit account +Detection (SLEID) framework to address these challenges. SLEID employs an +Isolation Forest for initial outlier detection and a self-training mechanism to +iteratively generate pseudo-labels for unlabeled accounts, thereby enhancing +detection accuracy. Extensive experiments demonstrate that SLEID significantly +outperforms traditional supervised approaches and recent semi-supervised +models, achieving superior precision, recall, and F1-scores, particularly in +detecting illicit accounts. Compared to state-of-the-art methods, our approach +achieves better detection performance while reducing reliance on labeled data. +The results affirm SLEID's efficacy as a robust solution for safeguarding the +DeFi ecosystem and mitigating risks posed by malicious accounts. + +
+
+ comment: 12 pages, 6 figures +
+
+
+
+
+ + ☆ 3D Face Reconstruction From Radar Images + + +
+ The 3D reconstruction of faces gains wide attention in computer vision and is +used in many fields of application, for example, animation, virtual reality, +and even forensics. This work is motivated by monitoring patients in sleep +laboratories. Due to their unique characteristics, sensors from the radar +domain have advantages compared to optical sensors, namely penetration of +electrically non-conductive materials and independence of light. These +advantages of radar signals unlock new applications and require adaptation of +3D reconstruction frameworks. We propose a novel model-based method for 3D +reconstruction from radar images. We generate a dataset of synthetic radar +images with a physics-based but non-differentiable radar renderer. This dataset +is used to train a CNN-based encoder to estimate the parameters of a 3D +morphable face model. Whilst the encoder alone already leads to strong +reconstructions of synthetic data, we extend our reconstruction in an +Analysis-by-Synthesis fashion to a model-based autoencoder. This is enabled by +learning the rendering process in the decoder, which acts as an object-specific +differentiable radar renderer. Subsequently, the combination of both network +parts is trained to minimize both, the loss of the parameters and the loss of +the resulting reconstructed radar image. This leads to the additional benefit, +that at test time the parameters can be further optimized by finetuning the +autoencoder unsupervised on the image loss. We evaluated our framework on +generated synthetic face images as well as on real radar images with 3D ground +truth of four individuals. + +
+
+
+
+
+ + ☆ OMENN: One Matrix to Explain Neural Networks + + +
+ Deep Learning (DL) models are often black boxes, making their decision-making +processes difficult to interpret. This lack of transparency has driven +advancements in eXplainable Artificial Intelligence (XAI), a field dedicated to +clarifying the reasoning behind DL model predictions. Among these, +attribution-based methods such as LRP and GradCAM are widely used, though they +rely on approximations that can be imprecise. + To address these limitations, we introduce One Matrix to Explain Neural +Networks (OMENN), a novel post-hoc method that represents a neural network as a +single, interpretable matrix for each specific input. This matrix is +constructed through a series of linear transformations that represent the +processing of the input by each successive layer in the neural network. As a +result, OMENN provides locally precise, attribution-based explanations of the +input across various modern models, including ViTs and CNNs. We present a +theoretical analysis of OMENN based on dynamic linearity property and validate +its effectiveness with extensive tests on two XAI benchmarks, demonstrating +that OMENN is competitive with state-of-the-art methods. + +
+
+ comment: Under review, code will be released after acceptance +
+
+
+
+
+ + ☆ HERO: Hint-Based Efficient and Reliable Query Optimizer VLDB 2025 + + +
+ We propose a novel model for learned query optimization which provides query +hints leading to better execution plans. The model addresses the three key +challenges in learned hint-based query optimization: reliable hint +recommendation (ensuring non-degradation of query latency), efficient hint +exploration, and fast inference. We provide an in-depth analysis of existing +NN-based approaches to hint-based optimization and experimentally confirm the +named challenges for them. Our alternative solution consists of a new inference +schema based on an ensemble of context-aware models and a graph storage for +reliable hint suggestion and fast inference, and a budget-controlled training +procedure with a local search algorithm that solves the issue of exponential +search space exploration. In experiments on standard benchmarks, our model +demonstrates optimization capability close to the best achievable with +coarse-grained hints. Controlling the degree of parallelism (query dop) in +addition to operator-related hints enables our model to achieve 3x latency +improvement on JOB benchmark which sets a new standard for optimization. Our +model is interpretable and easy to debug, which is particularly important for +deployment in production. + +
+
+ comment: Submitted to VLDB 2025; 13 pages; 13 figures +
+
+
+
+
+ + ☆ LoRA Diffusion: Zero-Shot LoRA Synthesis for Diffusion Model + Personalization + + +
+ Low-Rank Adaptation (LoRA) and other parameter-efficient fine-tuning (PEFT) +methods provide low-memory, storage-efficient solutions for personalizing +text-to-image models. However, these methods offer little to no improvement in +wall-clock training time or the number of steps needed for convergence compared +to full model fine-tuning. While PEFT methods assume that shifts in generated +distributions (from base to fine-tuned models) can be effectively modeled +through weight changes in a low-rank subspace, they fail to leverage knowledge +of common use cases, which typically focus on capturing specific styles or +identities. Observing that desired outputs often comprise only a small subset +of the possible domain covered by LoRA training, we propose reducing the search +space by incorporating a prior over regions of interest. We demonstrate that +training a hypernetwork model to generate LoRA weights can achieve competitive +quality for specific domains while enabling near-instantaneous conditioning on +user input, in contrast to traditional training methods that require thousands +of steps. + +
+
+ comment: 9 pages, 6 figures +
+
+
+
+
+ + ☆ Federated Analytics in Practice: Engineering for Privacy, Scalability + and Practicality + + +
+ Cross-device Federated Analytics (FA) is a distributed computation paradigm +designed to answer analytics queries about and derive insights from data held +locally on users' devices. On-device computations combined with other privacy +and security measures ensure that only minimal data is transmitted off-device, +achieving a high standard of data protection. Despite FA's broad relevance, the +applicability of existing FA systems is limited by compromised accuracy; lack +of flexibility for data analytics; and an inability to scale effectively. In +this paper, we describe our approach to combine privacy, scalability, and +practicality to build and deploy a system that overcomes these limitations. Our +FA system leverages trusted execution environments (TEEs) and optimizes the use +of on-device computing resources to facilitate federated data processing across +large fleets of devices, while ensuring robust, defensible, and verifiable +privacy safeguards. We focus on federated analytics (statistics and +monitoring), in contrast to systems for federated learning (ML workloads), and +we flag the key differences. + +
+
+
+
+
+ + ☆ An Adaptive Grasping Force Tracking Strategy for Nonlinear and + Time-Varying Object Behaviors + + +
+ Accurate grasp force control is one of the key skills for ensuring successful +and damage-free robotic grasping of objects. Although existing methods have +conducted in-depth research on slip detection and grasping force planning, they +often overlook the issue of adaptive tracking of the actual force to the target +force when handling objects with different material properties. The optimal +parameters of a force tracking controller are significantly influenced by the +object's stiffness, and many adaptive force tracking algorithms rely on +stiffness estimation. However, real-world objects often exhibit viscous, +plastic, or other more complex nonlinear time-varying behaviors, and existing +studies provide insufficient support for these materials in terms of stiffness +definition and estimation. To address this, this paper introduces the concept +of generalized stiffness, extending the definition of stiffness to nonlinear +time-varying grasp system models, and proposes an online generalized stiffness +estimator based on Long Short-Term Memory (LSTM) networks. Based on generalized +stiffness, this paper proposes an adaptive parameter adjustment strategy using +a PI controller as an example, enabling dynamic force tracking for objects with +varying characteristics. Experimental results demonstrate that the proposed +method achieves high precision and short probing time, while showing better +adaptability to non-ideal objects compared to existing methods. The method +effectively solves the problem of grasp force tracking in unknown, nonlinear, +and time-varying grasp systems, enhancing the robotic grasping ability in +unstructured environments. + +
+
+
+
+
+ + ☆ Sample Efficient Robot Learning in Supervised Effect Prediction Tasks + + +
+ In self-supervised robot learning, robots actively explore their environments
+and generate data by acting on entities in the environment. Therefore, an
+exploration policy is desired that ensures sample efficiency to minimize robot
+execution costs while still providing accurate learning. For this purpose, the
+robotic community has adopted Intrinsic Motivation (IM)-based approaches such
+as Learning Progress (LP). On the machine learning front, Active Learning (AL)
+has been used successfully, especially for classification tasks. In this work,
+we develop a novel AL framework geared towards robotics regression tasks, such
+as action-effect prediction and, more generally, for world model learning,
+which we call MUSEL - Model Uncertainty for Sample Efficient Learning. MUSEL
+aims to extract model uncertainty from the total uncertainty estimate given by
+a suitable learning engine by making use of learning progress and input
+diversity and use it to improve sample efficiency beyond the state-of-the-art
+action-effect prediction methods. We demonstrate the feasibility of our model
+by using a Stochastic Variational Gaussian Process (SVGP) as the learning
+engine and testing the system on a set of robotic experiments in simulation.
+The efficacy of MUSEL is demonstrated by comparing its performance to standard
+methods used in robot action-effect learning. In a robotic tabletop environment
+in which a robot manipulator is tasked with learning the effect of its actions,
+the experiments show that MUSEL facilitates higher accuracy in learning action
+effects while ensuring sample efficiency.
 + 
</p>
+
+ comment: 18 pages, 18 figures +
+
+
+
+
+ + ☆ Efficient Model Compression Techniques with FishLeg NeurIPS 2024 + + +
+ In many domains, the most successful AI models tend to be the largest, indeed +often too large to be handled by AI players with limited computational +resources. To mitigate this, a number of compression methods have been +developed, including methods that prune the network down to high sparsity +whilst retaining performance. The best-performing pruning techniques are often +those that use second-order curvature information (such as an estimate of the +Fisher information matrix) to score the importance of each weight and to +predict the optimal compensation for weight deletion. However, these methods +are difficult to scale to high-dimensional parameter spaces without making +heavy approximations. Here, we propose the FishLeg surgeon (FLS), a new +second-order pruning method based on the Fisher-Legendre (FishLeg) optimizer. +At the heart of FishLeg is a meta-learning approach to amortising the action of +the inverse FIM, which brings a number of advantages. Firstly, the +parameterisation enables the use of flexible tensor factorisation techniques to +improve computational and memory efficiency without sacrificing much accuracy, +alleviating challenges associated with scalability of most second-order pruning +methods. Secondly, directly estimating the inverse FIM leads to less +sensitivity to the amplification of stochasticity during inversion, thereby +resulting in more precise estimates. Thirdly, our approach also allows for +progressive assimilation of the curvature into the parameterisation. In the +gradual pruning regime, this results in a more efficient estimate refinement as +opposed to re-estimation. We find that FishLeg achieves higher or comparable +performance against two common baselines in the area, most notably in the high +sparsity regime when considering a ResNet18 model on CIFAR-10 (84% accuracy at +95% sparsity vs 60% for OBS) and TinyIM (53% accuracy at 80% sparsity vs 48% +for OBS). + +
+
+ comment: Published in NeurIPS 2024 - Neural Compression Workshop, 13 pages, 6 + figures +
+
+
+
+
+ + ☆ Switchable deep beamformer for high-quality and real-time passive + acoustic mapping + + +
+ Passive acoustic mapping (PAM) is a promising tool for monitoring acoustic +cavitation activities in the applications of ultrasound therapy. Data-adaptive +beamformers for PAM have better image quality compared to the time exposure +acoustics (TEA) algorithms. However, the computational cost of data-adaptive +beamformers is considerably expensive. In this work, we develop a deep +beamformer based on a generative adversarial network, which can switch between +different transducer arrays and reconstruct high-quality PAM images directly +from radio frequency ultrasound signals with low computational cost. The deep +beamformer was trained on the dataset consisting of simulated and experimental +cavitation signals of single and multiple microbubble clouds measured by +different (linear and phased) arrays covering 1-15 MHz. We compared the +performance of the deep beamformer to TEA and three different data-adaptive +beamformers using the simulated and experimental test dataset. Compared with +TEA, the deep beamformer reduced the energy spread area by 18.9%-65.0% and +improved the image signal-to-noise ratio by 9.3-22.9 dB in average for the +different arrays in our data. Compared to the data-adaptive beamformers, the +deep beamformer reduced the computational cost by three orders of magnitude +achieving 10.5 ms image reconstruction speed in our data, while the image +quality was as good as that of the data-adaptive beamformers. These results +demonstrated the potential of the deep beamformer for high-resolution +monitoring of microbubble cavitation activities for ultrasound therapy. + +
+
+
+
+
+ + ☆ Optimizing Plastic Waste Collection in Water Bodies Using Heterogeneous + Autonomous Surface Vehicles with Deep Reinforcement Learning + + +
+ This paper presents a model-free deep reinforcement learning framework for +informative path planning with heterogeneous fleets of autonomous surface +vehicles to locate and collect plastic waste. The system employs two teams of +vehicles: scouts and cleaners. Coordination between these teams is achieved +through a deep reinforcement approach, allowing agents to learn strategies to +maximize cleaning efficiency. The primary objective is for the scout team to +provide an up-to-date contamination model, while the cleaner team collects as +much waste as possible following this model. This strategy leads to +heterogeneous teams that optimize fleet efficiency through inter-team +cooperation supported by a tailored reward function. Different trainings of the +proposed algorithm are compared with other state-of-the-art heuristics in two +distinct scenarios, one with high convexity and another with narrow corridors +and challenging access. According to the obtained results, it is demonstrated +that deep reinforcement learning based algorithms outperform other benchmark +heuristics, exhibiting superior adaptability. In addition, training with greedy +actions further enhances performance, particularly in scenarios with intricate +layouts. + +
+
+ comment: This article is currently under revision for the Robotics and + Automation Letters (IEEE) +
+
+
+
+
+ + ☆ Noisy Ostracods: A Fine-Grained, Imbalanced Real-World Dataset for + Benchmarking Robust Machine Learning and Label Correction Methods + + +
+ We present the Noisy Ostracods, a noisy dataset for genus and species
+classification of crustacean ostracods with specialists' annotations. Over the
+71466 specimens collected, 5.58% of them are estimated to be noisy (possibly
+problematic) at genus level. The dataset is created to address a real-world
+challenge: creating a clean fine-grained taxonomy dataset. The Noisy Ostracods
+dataset has diverse noises from multiple sources. Firstly, the noise is
+open-set, including new classes discovered during curation that were not part
+of the original annotation. The dataset has pseudo-classes, where annotators
+misclassified samples that should belong to an existing class into a new
+pseudo-class. The Noisy Ostracods dataset is highly imbalanced with an imbalance
+factor $\rho$ = 22429. This presents a unique challenge for robust machine
+learning methods, as existing approaches have not been extensively evaluated on
+fine-grained classification tasks with such diverse real-world noise. Initial
+experiments using current robust learning techniques have not yielded
+significant performance improvements on the Noisy Ostracods dataset compared to
+cross-entropy training on the raw, noisy data. On the other hand, noise
+detection methods have underperformed in error hit rate compared to naive
+cross-validation ensembling for identifying problematic labels. These findings
+suggest that the fine-grained, imbalanced nature, and complex noise
+characteristics of the dataset present considerable challenges for existing
+noise-robust algorithms. By openly releasing the Noisy Ostracods dataset, our
+goal is to encourage further research into the development of noise-resilient
+machine learning methods capable of effectively handling diverse, real-world
+noise in fine-grained classification tasks. The dataset, along with its
+evaluation protocols, can be accessed at
+https://github.com/H-Jamieu/Noisy_ostracods.
 + 
</p>
+
+ comment: Initial submit +
+
+
+
+
+ + ☆ Enhanced Photovoltaic Power Forecasting: An iTransformer and LSTM-Based + Model Integrating Temporal and Covariate Interactions + + +
+ Accurate photovoltaic (PV) power forecasting is critical for integrating
+renewable energy sources into the grid, optimizing real-time energy management,
+and ensuring energy reliability amidst increasing demand. However, existing
+models often struggle with effectively capturing the complex relationships
+between target variables and covariates, as well as the interactions between
+temporal dynamics and multivariate data, leading to suboptimal forecasting
+accuracy. To address these challenges, we propose a novel model architecture
+that leverages the iTransformer for feature extraction from target variables
+and employs long short-term memory (LSTM) to extract features from covariates.
+A cross-attention mechanism is integrated to fuse the outputs of both models,
+followed by a Kolmogorov-Arnold network (KAN) mapping for enhanced
+representation. The effectiveness of the proposed model is validated using
+publicly available datasets from Australia, with experiments conducted across
+four seasons. Results demonstrate that the proposed model effectively captures
+seasonal variations in PV power generation and improves forecasting accuracy.
 + 
</p>
+
+
+
+
+ + ☆ CADMR: Cross-Attention and Disentangled Learning for Multimodal + Recommender Systems + + +
+ The increasing availability and diversity of multimodal data in recommender +systems offer new avenues for enhancing recommendation accuracy and user +satisfaction. However, these systems must contend with high-dimensional, sparse +user-item rating matrices, where reconstructing the matrix with only small +subsets of preferred items for each user poses a significant challenge. To +address this, we propose CADMR, a novel autoencoder-based multimodal +recommender system framework. CADMR leverages multi-head cross-attention +mechanisms and Disentangled Learning to effectively integrate and utilize +heterogeneous multimodal data in reconstructing the rating matrix. Our approach +first disentangles modality-specific features while preserving their +interdependence, thereby learning a joint latent representation. The multi-head +cross-attention mechanism is then applied to enhance user-item interaction +representations with respect to the learned multimodal item latent +representations. We evaluate CADMR on three benchmark datasets, demonstrating +significant performance improvements over state-of-the-art methods. + +
+
+
+
+
+ + ☆ Initial Study On Improving Segmentation By Combining Preoperative CT And + Intraoperative CBCT Using Synthetic Data + + +
+ Computer-Assisted Interventions enable clinicians to perform precise, +minimally invasive procedures, often relying on advanced imaging methods. +Cone-beam computed tomography (CBCT) can be used to facilitate +computer-assisted interventions, despite often suffering from artifacts that +pose challenges for accurate interpretation. While the degraded image quality +can affect image analysis, the availability of high quality, preoperative scans +offers potential for improvements. Here we consider a setting where +preoperative CT and intraoperative CBCT scans are available, however, the +alignment (registration) between the scans is imperfect to simulate a real +world scenario. We propose a multimodal learning method that fuses roughly +aligned CBCT and CT scans and investigate the effect on segmentation +performance. For this experiment we use synthetically generated data containing +real CT and synthetic CBCT volumes with corresponding voxel annotations. We +show that this fusion setup improves segmentation performance in $18$ out of +$20$ investigated setups. + +
+
+ comment: Accepted at BVM 2025. arXiv admin note: text overlap with + arXiv:2406.11650 +
+
+
+
+
+ + ☆ Deep Matrix Factorization with Adaptive Weights for Multi-View + Clustering + + +
+ Recently, deep matrix factorization has been established as a powerful model
+for unsupervised tasks, achieving promising results, especially for multi-view
+clustering. However, existing methods often lack effective feature selection
+mechanisms and rely on empirical hyperparameter selection. To address these
+issues, we introduce a novel Deep Matrix Factorization with Adaptive Weights
+for Multi-View Clustering (DMFAW). Our method simultaneously incorporates
+feature selection and generates local partitions, enhancing clustering results.
+Notably, the feature weights are controlled and adjusted by a parameter that
+is dynamically updated using a Control Theory-inspired mechanism, which not only
+improves the model's stability and adaptability to diverse datasets but also
+accelerates convergence. A late fusion approach is then proposed to align the
+weighted local partitions with the consensus partition. Finally, the
+optimization problem is solved via an alternating optimization algorithm with
+theoretically guaranteed convergence. Extensive experiments on benchmark
+datasets highlight that DMFAW outperforms state-of-the-art methods in terms of
+clustering performance.
 + 
</p>
+
+
+
+
+ + ☆ Conformal Symplectic Optimization for Stable Reinforcement Learning + + +
+ Training deep reinforcement learning (RL) agents necessitates overcoming the +highly unstable nonconvex stochastic optimization inherent in the +trial-and-error mechanism. To tackle this challenge, we propose a +physics-inspired optimization algorithm called relativistic adaptive gradient +descent (RAD), which enhances long-term training stability. By conceptualizing +neural network (NN) training as the evolution of a conformal Hamiltonian +system, we present a universal framework for transferring long-term stability +from conformal symplectic integrators to iterative NN updating rules, where the +choice of kinetic energy governs the dynamical properties of resulting +optimization algorithms. By utilizing relativistic kinetic energy, RAD +incorporates principles from special relativity and limits parameter updates +below a finite speed, effectively mitigating abnormal gradient influences. +Additionally, RAD models NN optimization as the evolution of a multi-particle +system where each trainable parameter acts as an independent particle with an +individual adaptive learning rate. We prove RAD's sublinear convergence under +general nonconvex settings, where smaller gradient variance and larger batch +sizes contribute to tighter convergence. Notably, RAD degrades to the +well-known adaptive moment estimation (ADAM) algorithm when its speed +coefficient is chosen as one and symplectic factor as a small positive value. +Experimental results show RAD outperforming nine baseline optimizers with five +RL algorithms across twelve environments, including standard benchmarks and +challenging scenarios. Notably, RAD achieves up to a 155.1% performance +improvement over ADAM in Atari games, showcasing its efficacy in stabilizing +and accelerating RL training. + +
+
+
+
+
+ + ☆ Learn More by Using Less: Distributed Learning with Energy-Constrained + Devices + + +
+ Federated Learning (FL) has emerged as a solution for distributed model +training across decentralized, privacy-preserving devices, but the different +energy capacities of participating devices (system heterogeneity) constrain +real-world implementations. These energy limitations not only reduce model +accuracy but also increase dropout rates, impacting on convergence in practical +FL deployments. In this work, we propose LeanFed, an energy-aware FL framework +designed to optimize client selection and training workloads on +battery-constrained devices. LeanFed leverages adaptive data usage by +dynamically adjusting the fraction of local data each device utilizes during +training, thereby maximizing device participation across communication rounds +while ensuring they do not run out of battery during the process. We rigorously +evaluate LeanFed against traditional FedAvg on CIFAR-10 and CIFAR-100 datasets, +simulating various levels of data heterogeneity and device participation rates. +Results show that LeanFed consistently enhances model accuracy and stability, +particularly in settings with high data heterogeneity and limited battery life, +by mitigating client dropout and extending device availability. This approach +demonstrates the potential of energy-efficient, privacy-preserving FL in +real-world, large-scale applications, setting a foundation for robust and +sustainable pervasive AI on resource-constrained networks. + +
+
+
+
+
+ + ☆ GQWformer: A Quantum-based Transformer for Graph Representation Learning + + +
+ Graph Transformers (GTs) have demonstrated significant advantages in graph
+representation learning through their global attention mechanisms. However, the
+self-attention mechanism in GTs tends to neglect the inductive biases inherent
+in graph structures, making it challenging to effectively capture essential
+structural information. To address this issue, we propose a novel approach that
+integrates graph inductive bias into self-attention mechanisms by leveraging
+quantum technology for structural encoding. In this paper, we introduce the
+Graph Quantum Walk Transformer (GQWformer), a groundbreaking GNN framework that
+utilizes quantum walks on attributed graphs to generate node quantum states.
+These quantum states encapsulate rich structural attributes and serve as
+inductive biases for the transformer, thereby enabling the generation of more
+meaningful attention scores. By subsequently incorporating a recurrent neural
+network, our design amplifies the model's ability to focus on both local and
+global information. We conducted comprehensive experiments across five publicly
+available datasets to evaluate the effectiveness of our model. These results
+clearly indicate that GQWformer outperforms existing state-of-the-art graph
+classification algorithms. These findings highlight the significant potential
+of integrating quantum computing methodologies with traditional GNNs to advance
+the field of graph representation learning, providing a promising direction for
+future research and applications.
 + 
</p>
+
+
+
+
+ + ☆ Step-by-Step Guidance to Differential Anemia Diagnosis with Real-World + Data and Deep Reinforcement Learning + + +
+ Clinical diagnostic guidelines outline the key questions to answer to reach a +diagnosis. Inspired by guidelines, we aim to develop a model that learns from +electronic health records to determine the optimal sequence of actions for +accurate diagnosis. Focusing on anemia and its sub-types, we employ deep +reinforcement learning (DRL) algorithms and evaluate their performance on both +a synthetic dataset, which is based on expert-defined diagnostic pathways, and +a real-world dataset. We investigate the performance of these algorithms across +various scenarios. Our experimental results demonstrate that DRL algorithms +perform competitively with state-of-the-art methods while offering the +significant advantage of progressively generating pathways to the suggested +diagnosis, providing a transparent decision-making process that can guide and +explain diagnostic reasoning. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2404.05913 +
+
+
+
+
+ + ☆ BOTracle: A framework for Discriminating Bots and Humans ESORICS + + +
+ Bots constitute a significant portion of Internet traffic and are a source of +various issues across multiple domains. Modern bots often become +indistinguishable from real users, as they employ similar methods to browse the +web, including using real browsers. We address the challenge of bot detection +in high-traffic scenarios by analyzing three distinct detection methods. The +first method operates on heuristics, allowing for rapid detection. The second +method utilizes, well known, technical features, such as IP address, window +size, and user agent. It serves primarily for comparison with the third method. +In the third method, we rely solely on browsing behavior, omitting all static +features and focusing exclusively on how clients behave on a website. In +contrast to related work, we evaluate our approaches using real-world +e-commerce traffic data, comprising 40 million monthly page visits. We further +compare our methods against another bot detection approach, Botcha, on the same +dataset. Our performance metrics, including precision, recall, and AUC, reach +98 percent or higher, surpassing Botcha. + +
+
+ comment: Bot Detection; User Behaviour Analysis; Published at ESORICS + International Workshops 2024 +
+
+
+
+
+ + ☆ Diabetic Retinopathy Classification from Retinal Images using Machine + Learning Approaches + + +
+ Diabetic Retinopathy is one of the most familiar diseases and is a diabetes +complication that affects eyes. Initially, diabetic retinopathy may cause no +symptoms or only mild vision problems. Eventually, it can cause blindness. So +early detection of symptoms could help to avoid blindness. In this paper, we +present some experiments on some features of diabetic retinopathy, like +properties of exudates, properties of blood vessels and properties of +microaneurysm. Using the features, we can classify healthy, mild +non-proliferative, moderate non-proliferative, severe non-proliferative and +proliferative stages of DR. Support Vector Machine, Random Forest and Naive +Bayes classifiers are used to classify the stages. Finally, Random Forest is +found to be the best for higher accuracy, sensitivity and specificity of 76.5%, +77.2% and 93.3% respectively. + +
+
+ comment: 5 pages, 9 figures, 2 tables. International Conference on Advanced + Engineering, Technology and Applications (ICAETA-2021), Istanbul, Turkey +
+
+
+
+
+ + ☆ Technical Report on Reinforcement Learning Control on the Lucas-Nülle + Inverted Pendulum + + +
+ The discipline of automatic control is making increased use of concepts that +originate from the domain of machine learning. Herein, reinforcement learning +(RL) takes an elevated role, as it is inherently designed for sequential +decision making, and can be applied to optimal control problems without the +need for a plant system model. To advance education of control engineers and +operators in this field, this contribution targets an RL framework that can be +applied to educational hardware provided by the Lucas-N\"ulle company. +Specifically, the goal of inverted pendulum control is pursued by means of RL, +including both, swing-up and stabilization within a single holistic design +approach. Herein, the actual learning is enabled by separating corresponding +computations from the real-time control computer and outsourcing them to a +different hardware. This distributed architecture, however, necessitates +communication of the involved components, which is realized via CAN bus. The +experimental proof of concept is presented with an applied safeguarding +algorithm that prevents the plant from being operated harmfully during the +trial-and-error training phase. + +
+
+
+
+
+ + ☆ Composing Open-domain Vision with RAG for Ocean Monitoring and + Conservation NeurIPS 2024 + + +
+ Climate change's destruction of marine biodiversity is threatening +communities and economies around the world which rely on healthy oceans for +their livelihoods. The challenge of applying computer vision to niche, +real-world domains such as ocean conservation lies in the dynamic and diverse +environments where traditional top-down learning struggle with long-tailed +distributions, generalization, and domain transfer. Scalable species +identification for ocean monitoring is particularly difficult due to the need +to adapt models to new environments and identify rare or unseen species. To +overcome these limitations, we propose leveraging bottom-up, open-domain +learning frameworks as a resilient, scalable solution for image and video +analysis in marine applications. Our preliminary demonstration uses pretrained +vision-language models (VLMs) combined with retrieval-augmented generation +(RAG) as grounding, leaving the door open for numerous architectural, training +and engineering optimizations. We validate this approach through a preliminary +application in classifying fish from video onboard fishing vessels, +demonstrating impressive emergent retrieval and prediction capabilities without +domain-specific training or knowledge of the task itself. + +
+
+ comment: Accepted to Climate Change AI Workshop at NeurIPS 2024. 9 pages, 6 + figures, 1 table +
+
+
+
+
+ + ☆ Selective Reviews of Bandit Problems in AI via a Statistical View + + +
+ Reinforcement Learning (RL) is a widely researched area in artificial +intelligence that focuses on teaching agents decision-making through +interactions with their environment. A key subset includes stochastic +multi-armed bandit (MAB) and continuum-armed bandit (SCAB) problems, which +model sequential decision-making under uncertainty. This review outlines the +foundational models and assumptions of bandit problems, explores non-asymptotic +theoretical tools like concentration inequalities and minimax regret bounds, +and compares frequentist and Bayesian algorithms for managing +exploration-exploitation trade-offs. We also extend the discussion to $K$-armed +contextual bandits and SCAB, examining their methodologies, regret analyses, +and discussing the relation between the SCAB problems and the functional data +analysis. Finally, we highlight recent advances and ongoing challenges in the +field. + +
+
+ comment: 46 pages, 5 figures, +
+
+
+
+
+ + ☆ On Simplifying Large-Scale Spatial Vectors: Fast, Memory-Efficient, and + Cost-Predictable k-means + + +
+ The k-means algorithm can simplify large-scale spatial vectors, such as 2D +geo-locations and 3D point clouds, to support fast analytics and learning. +However, when processing large-scale datasets, existing k-means algorithms have +been developed to achieve high performance with significant computational +resources, such as memory and CPU usage time. These algorithms, though +effective, are not well-suited for resource-constrained devices. In this paper, +we propose a fast, memory-efficient, and cost-predictable k-means called +Dask-means. We first accelerate k-means by designing a memory-efficient +accelerator, which utilizes an optimized nearest neighbor search over a +memory-tunable index to assign spatial vectors to clusters in batches. We then +design a lightweight cost estimator to predict the memory cost and runtime of +the k-means task, allowing it to request appropriate memory from devices or +adjust the accelerator's required space to meet memory constraints, and ensure +sufficient CPU time for running k-means. Experiments show that when simplifying +datasets with scale such as $10^6$, Dask-means uses less than $30$MB of memory, +achieves over $168$ times speedup compared to the widely-used Lloyd's +algorithm. We also validate Dask-means on mobile devices, where it demonstrates +significant speedup and low memory cost compared to other state-of-the-art +(SOTA) k-means algorithms. Our cost estimator estimates the memory cost with a +difference of less than $3\%$ from the actual ones and predicts runtime with an +MSE up to $33.3\%$ lower than SOTA methods. + +
+
+
+
+
+ + ☆ U-Net in Medical Image Segmentation: A Review of Its Applications Across + Modalities + + +
+ Medical imaging is essential in healthcare to provide key insights into +patient anatomy and pathology, aiding in diagnosis and treatment. Non-invasive +techniques such as X-ray, Magnetic Resonance Imaging (MRI), Computed Tomography +(CT), and Ultrasound (US), capture detailed images of organs, tissues, and +abnormalities. Effective analysis of these images requires precise segmentation +to delineate regions of interest (ROI), such as organs or lesions. Traditional +segmentation methods, relying on manual feature-extraction, are labor-intensive +and vary across experts. Recent advancements in Artificial Intelligence (AI) +and Deep Learning (DL), particularly convolutional models such as U-Net and its +variants (U-Net++ and U-Net 3+), have transformed medical image segmentation +(MIS) by automating the process and enhancing accuracy. These models enable +efficient, precise pixel-wise classification across various imaging modalities, +overcoming the limitations of manual segmentation. This review explores various +medical imaging techniques, examines the U-Net architectures and their +adaptations, and discusses their application across different modalities. It +also identifies common challenges in MIS and proposes potential solutions. + +
+
+
+
+
+ + ☆ ESA: Example Sieve Approach for Multi-Positive and Unlabeled Learning + + +
+ Learning from Multi-Positive and Unlabeled (MPU) data has gradually attracted
+significant attention from practical applications. Unfortunately, the risk of
+MPU also suffers from the shift of minimum risk, particularly when the models
+are very flexible as shown in Fig.\ref{moti}. In this paper, to alleviate the
+shifting of minimum risk problem, we propose an Example Sieve Approach (ESA) to
+select examples for training a multi-class classifier. Specifically, we sieve
+out some examples by utilizing the Certain Loss (CL) value of each example in
+the training stage and analyze the consistency of the proposed risk estimator.
+Besides, we show that the estimation error of the proposed ESA obtains the optimal
+parametric convergence rate. Extensive experiments on various real-world
+datasets show the proposed approach outperforms previous methods.
 + 
</p>
+
+ comment: 12 pages, 6 figures +
+
+
+
+
+ + ☆ Learning from Concealed Labels + + +
+ Annotating data for sensitive labels (e.g., disease, smoking) poses
+potential threats to individual privacy in many real-world scenarios. To cope
+with this problem, we propose a novel setting to protect privacy of each
+instance, namely learning from concealed labels for multi-class classification.
+Concealed labels prevent sensitive labels from appearing in the label set
+during the label collection stage, which specifies none and some random sampled
+insensitive labels as concealed labels set to annotate sensitive data. In this
+paper, an unbiased estimator can be established from concealed data under mild
+assumptions, and the learned multi-class classifier can not only classify the
+instance from insensitive labels accurately but also recognize the instance
+from the sensitive labels. Moreover, we bound the estimation error and show
+that the multi-class classifier achieves the optimal parametric convergence
+rate. Experiments demonstrate the significance and effectiveness of the
+proposed method for concealed labels in synthetic and real-world datasets.
 + 
</p>
+
+ comment: 12 pages, 2 figures +
+
+
+
+
+ + ☆ BANER: Boundary-Aware LLMs for Few-Shot Named Entity Recognition COLING 2025 + + +
+ Despite the recent success of two-stage prototypical networks in few-shot +named entity recognition (NER), challenges such as over/under-detected false +spans in the span detection stage and unaligned entity prototypes in the type +classification stage persist. Additionally, LLMs have not proven to be +effective few-shot information extractors in general. In this paper, we propose +an approach called Boundary-Aware LLMs for Few-Shot Named Entity Recognition to +address these issues. We introduce a boundary-aware contrastive learning +strategy to enhance the LLM's ability to perceive entity boundaries for +generalized entity spans. Additionally, we utilize LoRAHub to align information +from the target domain to the source domain, thereby enhancing adaptive +cross-domain classification capabilities. Extensive experiments across various +benchmarks demonstrate that our framework outperforms prior methods, validating +its effectiveness. In particular, the proposed strategies demonstrate +effectiveness across a range of LLM architectures. The code and data are +released on https://github.com/UESTC-GQJ/BANER. + +
+
+ comment: Appear on COLING 2025 +
+
+
+
+
+ + ☆ Unlocking Tuning-Free Few-Shot Adaptability in Visual Foundation Models + by Recycling Pre-Tuned LoRAs + + +
+ Large Language Models (LLMs) such as ChatGPT demonstrate strong few-shot +adaptability without requiring fine-tuning, positioning them ideal for +data-limited and real-time applications. However, this adaptability has not yet +been replicated in current Visual Foundation Models (VFMs), which require +explicit fine-tuning with sufficient tuning data. Besides, the +pretraining-finetuning paradigm has led to the surge of numerous task-specific +modular components, such as Low-Rank Adaptation (LoRA). For the first time, we +explore the potential of reusing diverse pre-tuned LoRAs without accessing +their original training data, to achieve tuning-free few-shot adaptation in +VFMs. Our framework, LoRA Recycle, distills a meta-LoRA from diverse pre-tuned +LoRAs with a meta-learning objective, using surrogate data generated inversely +from pre-tuned LoRAs themselves. The VFM, once equipped with the meta-LoRA, is +empowered to solve new few-shot tasks in a single forward pass, akin to the +in-context learning of LLMs. Additionally, we incorporate a double-efficient +mechanism tailored to our framework, significantly accelerating the +meta-training process while maintaining or even improving performance. +Extensive experiments across various few-shot classification benchmarks across +both in- and cross-domain scenarios demonstrate the superiority of our +framework. + +
+
+
+
+
+ + ☆ Recovering implicit physics model under real-world constraints ECAI 2024 + + +
+ Recovering a physics-driven model, i.e. a governing set of equations of the +underlying dynamical systems, from the real-world data has been of recent +interest. Most existing methods either operate on simulation data with +unrealistically high sampling rates or require explicit measurements of all +system variables, which is not amenable in real-world deployments. Moreover, +they assume the timestamps of external perturbations to the physical system are +known a priori, without uncertainty, implicitly discounting any sensor +time-synchronization or human reporting errors. In this paper, we propose a +novel liquid time constant neural network (LTC-NN) based architecture to +recover underlying model of physical dynamics from real-world data. The +automatic differentiation property of LTC-NN nodes overcomes problems +associated with low sampling rates, the input dependent time constant in the +forward pass of the hidden layer of LTC-NN nodes creates a massive search space +of implicit physical dynamics, the physics model solver based data +reconstruction loss guides the search for the correct set of implicit dynamics, +and the use of the dropout regularization in the dense layer ensures extraction +of the sparsest model. Further, to account for the perturbation timing error, +we utilize dense layer nodes to search through input shifts that results in the +lowest reconstruction loss. Experiments on four benchmark dynamical systems, +three with simulation data and one with the real-world data show that the +LTC-NN architecture is more accurate in recovering implicit physics model +coefficients than the state-of-the-art sparse model recovery approaches. We +also introduce four additional case studies (total eight) on real-life medical +examples in simulation and with real-world clinical data to show effectiveness +of our approach in recovering underlying model in practice. + +
+
+ comment: This paper is published in ECAI 2024, + https://ebooks.iospress.nl/volumearticle/69651 +
+
+
+
+
+ + ☆ An Automated Data Mining Framework Using Autoencoders for Feature + Extraction and Dimensionality Reduction + + +
+ This study proposes an automated data mining framework based on autoencoders +and experimentally verifies its effectiveness in feature extraction and data +dimensionality reduction. Through the encoding-decoding structure, the +autoencoder can capture the data's potential characteristics and achieve noise +reduction and anomaly detection, providing an efficient and stable solution for +the data mining process. The experiment compared the performance of the +autoencoder with traditional dimensionality reduction methods (such as PCA, FA, +T-SNE, and UMAP). The results showed that the autoencoder performed best in +terms of reconstruction error and root mean square error and could better +retain data structure and enhance the generalization ability of the model. The +autoencoder-based framework not only reduces manual intervention but also +significantly improves the automation of data processing. In the future, with +the advancement of deep learning and big data technology, the autoencoder +method combined with a generative adversarial network (GAN) or graph neural +network (GNN) is expected to be more widely used in the fields of complex data +processing, real-time data analysis and intelligent decision-making. + +
+
+
+
+
+ + ☆ SA-GNAS: Seed Architecture Expansion for Efficient Large-scale Graph + Neural Architecture Search + + +
+ GNAS (Graph Neural Architecture Search) has demonstrated great effectiveness +in automatically designing the optimal graph neural architectures for multiple +downstream tasks, such as node classification and link prediction. However, +most existing GNAS methods cannot efficiently handle large-scale graphs +containing more than million-scale nodes and edges due to the expensive +computational and memory overhead. To scale GNAS on large graphs while +achieving better performance, we propose SA-GNAS, a novel framework based on +seed architecture expansion for efficient large-scale GNAS. Similar to the cell +expansion in biotechnology, we first construct a seed architecture and then +expand the seed architecture iteratively. Specifically, we first propose a +performance ranking consistency-based seed architecture selection method, which +selects the architecture searched on the subgraph that best matches the +original large-scale graph. Then, we propose an entropy minimization-based seed +architecture expansion method to further improve the performance of the seed +architecture. Extensive experimental results on five large-scale graphs +demonstrate that the proposed SA-GNAS outperforms human-designed +state-of-the-art GNN architectures and existing graph NAS methods. Moreover, +SA-GNAS can significantly reduce the search time, showing better search +efficiency. For the largest graph with billion edges, SA-GNAS can achieve 2.8 +times speedup compared to the SOTA large-scale GNAS method GAUSS. Additionally, +since SA-GNAS is inherently parallelized, the search efficiency can be further +improved with more GPUs. SA-GNAS is available at +https://github.com/PasaLab/SAGNAS. + +
+
+
+
+
+ + ☆ Deep Learning, Machine Learning, Advancing Big Data Analytics and + Management + + +
+ Advancements in artificial intelligence, machine learning, and deep learning +have catalyzed the transformation of big data analytics and management into +pivotal domains for research and application. This work explores the +theoretical foundations, methodological advancements, and practical +implementations of these technologies, emphasizing their role in uncovering +actionable insights from massive, high-dimensional datasets. The study presents +a systematic overview of data preprocessing techniques, including data +cleaning, normalization, integration, and dimensionality reduction, to prepare +raw data for analysis. Core analytics methodologies such as classification, +clustering, regression, and anomaly detection are examined, with a focus on +algorithmic innovation and scalability. Furthermore, the text delves into +state-of-the-art frameworks for data mining and predictive modeling, +highlighting the role of neural networks, support vector machines, and ensemble +methods in tackling complex analytical challenges. Special emphasis is placed +on the convergence of big data with distributed computing paradigms, including +cloud and edge computing, to address challenges in storage, computation, and +real-time analytics. The integration of ethical considerations, including data +privacy and compliance with global standards, ensures a holistic perspective on +data management. Practical applications across healthcare, finance, marketing, +and policy-making illustrate the real-world impact of these technologies. +Through comprehensive case studies and Python-based implementations, this work +equips researchers, practitioners, and data enthusiasts with the tools to +navigate the complexities of modern data analytics. It bridges the gap between +theory and practice, fostering the development of innovative solutions for +managing and leveraging data in the era of artificial intelligence. + +
+
+ comment: 174 pages +
+
+
+
+
+ + ☆ Generalizing Weisfeiler-Lehman Kernels to Subgraphs + + +
+ Subgraph representation learning has been effective in solving various +real-world problems. However, current graph neural networks (GNNs) produce +suboptimal results for subgraph-level tasks due to their inability to capture +complex interactions within and between subgraphs. To provide a more expressive +and efficient alternative, we propose WLKS, a Weisfeiler-Lehman (WL) kernel +generalized for subgraphs by applying the WL algorithm on induced $k$-hop +neighborhoods. We combine kernels across different $k$-hop levels to capture +richer structural information that is not fully encoded in existing models. Our +approach can balance expressiveness and efficiency by eliminating the need for +neighborhood sampling. In experiments on eight real-world and synthetic +benchmarks, WLKS significantly outperforms leading approaches on five datasets +while reducing training time, ranging from 0.01x to 0.25x compared to the +state-of-the-art. + +
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ Improved Complexity for Smooth Nonconvex Optimization: A Two-Level + Online Learning Approach with Quasi-Newton Methods + + +
+ We study the problem of finding an $\epsilon$-first-order stationary point +(FOSP) of a smooth function, given access only to gradient information. The +best-known gradient query complexity for this task, assuming both the gradient +and Hessian of the objective function are Lipschitz continuous, is +${O}(\epsilon^{-7/4})$. In this work, we propose a method with a gradient +complexity of ${O}(d^{1/4}\epsilon^{-13/8})$, where $d$ is the problem +dimension, leading to an improved complexity when $d = {O}(\epsilon^{-1/2})$. +To achieve this result, we design an optimization algorithm that, underneath, +involves solving two online learning problems. Specifically, we first +reformulate the task of finding a stationary point for a nonconvex problem as +minimizing the regret in an online convex optimization problem, where the loss +is determined by the gradient of the objective function. Then, we introduce a +novel optimistic quasi-Newton method to solve this online learning problem, +with the Hessian approximation update itself framed as an online learning +problem in the space of matrices. Beyond improving the complexity bound for +achieving an $\epsilon$-FOSP using a gradient oracle, our result provides the +first guarantee suggesting that quasi-Newton methods can potentially outperform +gradient descent-type methods in nonconvex settings. + +
+
+ comment: 35 pages +
+
+
+
+
+ + Towards the efficacy of federated prediction for epidemics on networks + + +
+ Epidemic prediction is of practical significance in public health, enabling +early intervention, resource allocation, and strategic planning. However, +privacy concerns often hinder the sharing of health data among institutions, +limiting the development of accurate prediction models. In this paper, we +develop a general privacy-preserving framework for node-level epidemic +prediction on networks based on federated learning (FL). We frame the +spatio-temporal spread of epidemics across multiple data-isolated subnetworks, +where each node state represents the aggregate epidemic severity within a +community. Then, both the pure temporal LSTM model and the spatio-temporal +model i.e., Spatio-Temporal Graph Attention Network (STGAT) are proposed to +address the federated epidemic prediction. Extensive experiments are conducted +on various epidemic processes using a practical airline network, offering a +comprehensive assessment of FL efficacy under diverse scenarios. By introducing +the efficacy energy metric to measure system robustness under various client +configurations, we systematically explore key factors influencing FL +performance, including client numbers, aggregation strategies, graph +partitioning, and missing infectious reports. Numerical results manifest that STGAT +excels in capturing spatio-temporal dependencies in dynamic processes whereas +LSTM performs well in simpler patterns. Moreover, our findings highlight the +importance of balancing feature consistency and volume uniformity among +clients, as well as the prediction dilemma between information richness and +intrinsic stochasticity of dynamic processes. This study offers practical +insights into the efficacy of FL scenario in epidemic management, and demonstrates +the potential of FL to address broader collective dynamics. + +
+
+
+
+
+ + ☆ Jailbreak Defense in a Narrow Domain: Limitations of Existing Methods + and a New Transcript-Classifier Approach NeurIPS 2024 + + +
+ Defending large language models against jailbreaks so that they never engage +in a broadly-defined set of forbidden behaviors is an open problem. In this +paper, we investigate the difficulty of jailbreak-defense when we only want to +forbid a narrowly-defined set of behaviors. As a case study, we focus on +preventing an LLM from helping a user make a bomb. We find that popular +defenses such as safety training, adversarial training, and input/output +classifiers are unable to fully solve this problem. In pursuit of a better +solution, we develop a transcript-classifier defense which outperforms the +baseline defenses we test. However, our classifier defense still fails in some +circumstances, which highlights the difficulty of jailbreak-defense even in a +narrow domain. + +
+
+ comment: Accepted to the AdvML-Frontiers and SoLaR workshops at NeurIPS 2024 +
+
+
+
+
+ + ☆ CausalMob: Causal Human Mobility Prediction with LLMs-derived Human + Intentions toward Public Events KDD 2025 + + +
+ Large-scale human mobility exhibits spatial and temporal patterns that can +assist policymakers in decision making. Although traditional prediction models +attempt to capture these patterns, they are often interfered with by non-periodic public +events, such as disasters and occasional celebrations. Since regular human +mobility patterns are heavily affected by these events, estimating their causal +effects is critical to accurate mobility predictions. Although news articles +provide unique perspectives on these events in an unstructured format, +processing them is a challenge. In this study, we propose a causality-augmented +prediction model, called \textbf{CausalMob}, to analyze the causal effects of +public events. We first utilize large language models (LLMs) to extract human +intentions from news articles and transform them into features that act as +causal treatments. Next, the model learns representations of spatio-temporal +regional covariates from multiple data sources to serve as confounders for +causal inference. Finally, we present a causal effect estimation framework to +ensure event features remain independent of confounders during prediction. +Based on large-scale real-world data, the experimental results show that the +proposed model excels in human mobility prediction, outperforming +state-of-the-art models. + +
+
+ comment: Accepted by KDD 2025 +
+
+
+
+
+ + ☆ Failure Probability Estimation for Black-Box Autonomous Systems using + State-Dependent Importance Sampling Proposals + + +
+ Estimating the probability of failure is a critical step in developing +safety-critical autonomous systems. Direct estimation methods such as Monte +Carlo sampling are often impractical due to the rarity of failures in these +systems. Existing importance sampling approaches do not scale to sequential +decision-making systems with large state spaces and long horizons. We propose +an adaptive importance sampling algorithm to address these limitations. Our +method minimizes the forward Kullback-Leibler divergence between a +state-dependent proposal distribution and a relaxed form of the optimal +importance sampling distribution. Our method uses Markov score ascent methods +to estimate this objective. We evaluate our approach on four sequential systems +and show that it provides more accurate failure probability estimates than +baseline Monte Carlo and importance sampling techniques. This work is open +sourced. + +
+
+ comment: Submitted to L4DC 2025 +
+
+
+
+
+ + ☆ Revisiting the Initial Steps in Adaptive Gradient Descent Optimization NeurIPS 2024 + + +
+ Adaptive gradient optimization methods, such as Adam, are prevalent in +training deep neural networks across diverse machine learning tasks due to +their ability to achieve faster convergence. However, these methods often +suffer from suboptimal generalization compared to stochastic gradient descent +(SGD) and exhibit instability, particularly when training Transformer models. +In this work, we show the standard initialization of the second-order moment +estimation ($v_0 =0$) as a significant factor contributing to these +limitations. We introduce simple yet effective solutions: initializing the +second-order moment estimation with non-zero values, using either data-driven +or random initialization strategies. Empirical evaluations demonstrate that our +approach not only stabilizes convergence but also enhances the final +performance of adaptive gradient optimizers. Furthermore, by adopting the +proposed initialization strategies, Adam achieves performance comparable to +many recently proposed variants of adaptive gradient optimization methods, +highlighting the practical impact of this straightforward modification. + +
+
+ comment: OPT workshop at NeurIPS 2024 +
+
+
+
+
+ + ☆ SparseGrasp: Robotic Grasping via 3D Semantic Gaussian Splatting from + Sparse Multi-View RGB Images + + +
+ Language-guided robotic grasping is a rapidly advancing field where robots +are instructed using human language to grasp specific objects. However, +existing methods often depend on dense camera views and struggle to quickly +update scenes, limiting their effectiveness in changeable environments. + In contrast, we propose SparseGrasp, a novel open-vocabulary robotic grasping +system that operates efficiently with sparse-view RGB images and handles scene +updates quickly. Our system builds upon and significantly enhances existing +computer vision modules in robotic learning. Specifically, SparseGrasp utilizes +DUSt3R to generate a dense point cloud as the initialization for 3D Gaussian +Splatting (3DGS), maintaining high fidelity even under sparse supervision. +Importantly, SparseGrasp incorporates semantic awareness from recent vision +foundation models. To further improve processing efficiency, we repurpose +Principal Component Analysis (PCA) to compress features from 2D models. +Additionally, we introduce a novel render-and-compare strategy that ensures +rapid scene updates, enabling multi-turn grasping in changeable environments. + Experimental results show that SparseGrasp significantly outperforms +state-of-the-art methods in terms of both speed and adaptability, providing a +robust solution for multi-turn grasping in changeable environments. + +
+
+
+
+
+ + ☆ ShapeWords: Guiding Text-to-Image Synthesis with 3D Shape-Aware Prompts + + +
+ We introduce ShapeWords, an approach for synthesizing images based on 3D +shape guidance and text prompts. ShapeWords incorporates target 3D shape +information within specialized tokens embedded together with the input text, +effectively blending 3D shape awareness with textual context to guide the image +synthesis process. Unlike conventional shape guidance methods that rely on +depth maps restricted to fixed viewpoints and often overlook full 3D structure +or textual context, ShapeWords generates diverse yet consistent images that +reflect both the target shape's geometry and the textual description. +Experimental results show that ShapeWords produces images that are more +text-compliant, aesthetically plausible, while also maintaining 3D shape +awareness. + +
+
+ comment: Project webpage: https://lodurality.github.io/shapewords/ +
+
+
+
+
+ + ♻ ☆ From Isolated Conversations to Hierarchical Schemas: Dynamic Tree Memory + Representation for LLMs + + +
+ Recent advancements in large language models have significantly improved +their context windows, yet challenges in effective long-term memory management +remain. We introduce MemTree, an algorithm that leverages a dynamic, +tree-structured memory representation to optimize the organization, retrieval, +and integration of information, akin to human cognitive schemas. MemTree +organizes memory hierarchically, with each node encapsulating aggregated +textual content, corresponding semantic embeddings, and varying abstraction +levels across the tree's depths. Our algorithm dynamically adapts this memory +structure by computing and comparing semantic embeddings of new and existing +information to enrich the model's context-awareness. This approach allows +MemTree to handle complex reasoning and extended interactions more effectively +than traditional memory augmentation methods, which often rely on flat lookup +tables. Evaluations on benchmarks for multi-turn dialogue understanding and +document question answering show that MemTree significantly enhances +performance in scenarios that demand structured memory management. + +
+
+
+
+
+ + ♻ ☆ Accelerating Proximal Policy Optimization Learning Using Task Prediction + for Solving Environments with Delayed Rewards + + +
+ In this paper, we tackle the challenging problem of delayed rewards in +reinforcement learning (RL). While Proximal Policy Optimization (PPO) has +emerged as a leading Policy Gradient method, its performance can degrade under +delayed rewards. We introduce two key enhancements to PPO: a hybrid policy +architecture that combines an offline policy (trained on expert demonstrations) +with an online PPO policy, and a reward shaping mechanism using Time Window +Temporal Logic (TWTL). The hybrid architecture leverages offline data +throughout training while maintaining PPO's theoretical guarantees. Building on +the monotonic improvement framework of Trust Region Policy Optimization (TRPO), +we prove that our approach ensures improvement over both the offline policy and +previous iterations, with a bounded performance gap of +$(2\varsigma\gamma\alpha^2)/(1-\gamma)^2$, where $\alpha$ is the mixing +parameter, $\gamma$ is the discount factor, and $\varsigma$ bounds the expected +advantage. Additionally, we prove that our TWTL-based reward shaping preserves +the optimal policy of the original problem. TWTL enables formal translation of +temporal objectives into immediate feedback signals that guide learning. We +demonstrate the effectiveness of our approach through extensive experiments on +an inverted pendulum and a lunar lander environments, showing improvements in +both learning speed and final performance compared to standard PPO and +offline-only approaches. + +
+
+
+
+
+ + ♻ ☆ Go beyond End-to-End Training: Boosting Greedy Local Learning with + Context Supply + + +
+ Traditional end-to-end (E2E) training of deep networks necessitates storing +intermediate activations for back-propagation, resulting in a large memory +footprint on GPUs and restricted model parallelization. As an alternative, +greedy local learning partitions the network into gradient-isolated modules and +trains supervisely based on local preliminary losses, thereby providing +asynchronous and parallel training methods that substantially reduce memory +cost. However, empirical experiments reveal that as the number of segmentations +of the gradient-isolated module increases, the performance of the local +learning scheme degrades substantially, severely limiting its expansibility. To +avoid this issue, we theoretically analyze the greedy local learning from the +standpoint of information theory and propose a ContSup scheme, which +incorporates context supply between isolated modules to compensate for +information loss. Experiments on benchmark datasets (i.e. CIFAR, SVHN, STL-10) +achieve SOTA results and indicate that our proposed method can significantly +improve the performance of greedy local learning with minimal memory and +computational overhead, allowing for the boost of the number of isolated +modules. Our codes are available at https://github.com/Tab-ct/ContSup. + +
+
+ comment: 9 figures, 12 tables +
+
+
+
+
+ + ♻ ☆ A Fast Convergence Theory for Offline Decision Making + + +
+ This paper proposes the first generic fast convergence result in general +function approximation for offline decision making problems, which include +offline reinforcement learning (RL) and off-policy evaluation (OPE) as special +cases. To unify different settings, we introduce a framework called Decision +Making with Offline Feedback (DMOF), which captures a wide range of offline +decision making problems. Within this framework, we propose a simple yet +powerful algorithm called Empirical Decision with Divergence (EDD), whose upper +bound can be termed as a coefficient named Empirical Offline Estimation +Coefficient (EOEC). We show that EOEC is instance-dependent and actually +measures the correlation of the problem. When assuming partial coverage in the +dataset, EOEC will reduce in a rate of $1/N$ where $N$ is the size of the +dataset, endowing EDD with a fast convergence guarantee. Finally, we complement +the above results with a lower bound in the DMOF framework, which further +demonstrates the soundness of our theory. + +
+
+
+
+
+ + ♻ ☆ Decoupling Dark Knowledge via Block-wise Logit Distillation for + Feature-level Alignment + + +
+ Knowledge Distillation (KD), a learning manner with a larger teacher network +guiding a smaller student network, transfers dark knowledge from the teacher to +the student via logits or intermediate features, with the aim of producing a +well-performed lightweight model. Notably, many subsequent feature-based KD +methods outperformed the earliest logit-based KD method and iteratively +generated numerous state-of-the-art distillation methods. Nevertheless, recent +work has uncovered the potential of the logit-based method, bringing the simple +KD form based on logits back into the limelight. Features or logits? They +partially implement the KD with entirely distinct perspectives; therefore, +choosing between logits and features is not straightforward. This paper +provides a unified perspective of feature alignment in order to obtain a better +comprehension of their fundamental distinction. Inheriting the design +philosophy and insights of feature-based and logit-based methods, we introduce +a block-wise logit distillation framework to apply implicit logit-based feature +alignment by gradually replacing teacher's blocks as intermediate +stepping-stone models to bridge the gap between the student and the teacher. +Our method obtains comparable or superior results to state-of-the-art +distillation methods. This paper demonstrates the great potential of combining +logit and features, and we hope it will inspire future research to revisit KD +from a higher vantage point. + +
+
+
+
+
+ + ♻ ☆ Neural Thermodynamic Integration: Free Energies from Energy-based + Diffusion Models + + +
+ Thermodynamic integration (TI) offers a rigorous method for estimating +free-energy differences by integrating over a sequence of interpolating +conformational ensembles. However, TI calculations are computationally +expensive and typically limited to coupling a small number of degrees of +freedom due to the need to sample numerous intermediate ensembles with +sufficient conformational-space overlap. In this work, we propose to perform TI +along an alchemical pathway represented by a trainable neural network, which we +term Neural TI. Critically, we parametrize a time-dependent Hamiltonian +interpolating between the interacting and non-interacting systems, and optimize +its gradient using a score matching objective. The ability of the resulting +energy-based diffusion model to sample all intermediate ensembles allows us to +perform TI from a single reference calculation. We apply our method to +Lennard-Jones fluids, where we report accurate calculations of the excess +chemical potential, demonstrating that Neural TI reproduces the underlying +changes in free energy without the need for simulations at interpolating +Hamiltonians. + +
+
+
+
+
+ + ♻ ☆ Denoising: A Powerful Building-Block for Imaging, Inverse Problems, and + Machine Learning + + +
+ Denoising, the process of reducing random fluctuations in a signal to +emphasize essential patterns, has been a fundamental problem of interest since +the dawn of modern scientific inquiry. Recent denoising techniques, +particularly in imaging, have achieved remarkable success, nearing theoretical +limits by some measures. Yet, despite tens of thousands of research papers, the +wide-ranging applications of denoising beyond noise removal have not been fully +recognized. This is partly due to the vast and diverse literature, making a +clear overview challenging. + This paper aims to address this gap. We present a clarifying perspective on +denoisers, their structure, and desired properties. We emphasize the increasing +importance of denoising and showcase its evolution into an essential building +block for complex tasks in imaging, inverse problems, and machine learning. +Despite its long history, the community continues to uncover unexpected and +groundbreaking uses for denoising, further solidifying its place as a +cornerstone of scientific and engineering practice. + +
+
+
+
+
+ + ♻ ☆ Filtered Direct Preference Optimization EMNLP 2024 + + +
+ Reinforcement learning from human feedback (RLHF) plays a crucial role in +aligning language models with human preferences. While the significance of +dataset quality is generally recognized, explicit investigations into its +impact within the RLHF framework, to our knowledge, have been limited. This +paper addresses the issue of text quality within the preference dataset by +focusing on direct preference optimization (DPO), an increasingly adopted +reward-model-free RLHF method. We confirm that text quality significantly +influences the performance of models optimized with DPO more than those +optimized with reward-model-based RLHF. Building on this new insight, we +propose an extension of DPO, termed filtered direct preference optimization +(fDPO). fDPO uses a trained reward model to monitor the quality of texts within +the preference dataset during DPO training. Samples of lower quality are +discarded based on comparisons with texts generated by the model being +optimized, resulting in a more accurate dataset. Experimental results +demonstrate that fDPO enhances the final model performance. Our code is +available at https://github.com/CyberAgentAILab/filtered-dpo. + +
+
+ comment: EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ LumiNet: Latent Intrinsics Meets Diffusion Models for Indoor Scene + Relighting + + +
+ We introduce LumiNet, a novel architecture that leverages generative models +and latent intrinsic representations for effective lighting transfer. Given a +source image and a target lighting image, LumiNet synthesizes a relit version +of the source scene that captures the target's lighting. Our approach makes two +key contributions: a data curation strategy from the StyleGAN-based relighting +model for our training, and a modified diffusion-based ControlNet that +processes both latent intrinsic properties from the source image and latent +extrinsic properties from the target image. We further improve lighting +transfer through a learned adaptor (MLP) that injects the target's latent +extrinsic properties via cross-attention and fine-tuning. + Unlike traditional ControlNet, which generates images with conditional maps +from a single scene, LumiNet processes latent representations from two +different images - preserving geometry and albedo from the source while +transferring lighting characteristics from the target. Experiments demonstrate +that our method successfully transfers complex lighting phenomena including +specular highlights and indirect illumination across scenes with varying +spatial layouts and materials, outperforming existing approaches on challenging +indoor scenes using only images as input. + +
+
+ comment: Project page: https://luminet-relight.github.io +
+
+
+
+
+ + ♻ ☆ Closed-Form Interpretation of Neural Network Latent Spaces with Symbolic + Gradients + + +
+ It has been demonstrated in many scientific fields that artificial neural +networks like autoencoders or Siamese networks encode meaningful concepts in +their latent spaces. However, there does not exist a comprehensive framework +for retrieving this information in a human-readable form without prior +knowledge. In order to extract these concepts, we introduce a framework for +finding closed-form interpretations of neurons in latent spaces of artificial +neural networks. The interpretation framework is based on embedding trained +neural networks into an equivalence class of functions that encode the same +concept. We interpret these neural networks by finding an intersection between +the equivalence class and human-readable equations defined by a symbolic search +space. The approach is demonstrated by retrieving invariants of matrices and +conserved quantities of dynamical systems from latent spaces of Siamese neural +networks. + +
+
+
+
+
+ + ♻ ☆ Burning RED: Unlocking Subtask-Driven Reinforcement Learning and + Risk-Awareness in Average-Reward Markov Decision Processes + + +
+ Average-reward Markov decision processes (MDPs) provide a foundational +framework for sequential decision-making under uncertainty. However, +average-reward MDPs have remained largely unexplored in reinforcement learning +(RL) settings, with the majority of RL-based efforts having been allocated to +episodic and discounted MDPs. In this work, we study a unique structural +property of average-reward MDPs and utilize it to introduce Reward-Extended +Differential (or RED) reinforcement learning: a novel RL framework that can be +used to effectively and efficiently solve various subtasks simultaneously in +the average-reward setting. We introduce a family of RED learning algorithms +for prediction and control, including proven-convergent algorithms for the +tabular case. We then showcase the power of these algorithms by demonstrating +how they can be used to learn a policy that optimizes, for the first time, the +well-known conditional value-at-risk (CVaR) risk measure in a fully-online +manner, without the use of an explicit bi-level optimization scheme or an +augmented state-space. + +
+
+
+
+
+ + ♻ ☆ Introduction to Reinforcement Learning + + +
+ Reinforcement Learning (RL), a subfield of Artificial Intelligence (AI), +focuses on training agents to make decisions by interacting with their +environment to maximize cumulative rewards. This paper provides an overview of +RL, covering its core concepts, methodologies, and resources for further +learning. It offers a thorough explanation of fundamental components such as +states, actions, policies, and reward signals, ensuring readers develop a solid +foundational understanding. Additionally, the paper presents a variety of RL +algorithms, categorized based on the key factors such as model-free, +model-based, value-based, policy-based, and other key factors. Resources for +learning and implementing RL, such as books, courses, and online communities +are also provided. By offering a clear, structured introduction, this paper +aims to simplify the complexities of RL for beginners, providing a +straightforward pathway to understanding. + +
+
+ comment: 19 pages +
+
+
+
+
+ + ♻ ☆ Dynamic-LLaVA: Efficient Multimodal Large Language Models via Dynamic + Vision-language Context Sparsification + + +
+ Multimodal Large Language Models (MLLMs) have achieved remarkable success in +vision understanding, reasoning, and interaction. However, the inference +computation and memory increase progressively with the generation of output +tokens during decoding, directly affecting the efficacy of MLLMs. Existing +methods attempt to reduce the vision context redundancy to achieve efficient +MLLMs. Unfortunately, the efficiency benefits of the vision context reduction +in the prefill stage gradually diminish during the decoding stage. To address +this problem, we propose a dynamic vision-language context sparsification +framework, Dynamic-LLaVA, which dynamically reduces the redundancy of vision +context in the prefill stage and decreases the memory and computation overhead +of the generated language context during decoding. Dynamic-LLaVA designs a +tailored sparsification inference scheme for different inference modes, i.e., +prefill, decoding with and without KV cache, to achieve efficient inference of +MLLMs. In practice, Dynamic-LLaVA can reduce computation consumption by +$\sim$75\% in the prefill stage. Meanwhile, throughout the entire generation +process of MLLMs, Dynamic-LLaVA reduces computation consumption by $\sim$50\% +under decoding without KV cache, while saving $\sim$50\% GPU memory overhead +when decoding with KV cache, due to the vision-language context sparsification. +Extensive experiments also demonstrate that Dynamic-LLaVA achieves efficient +inference for MLLMs with negligible understanding and generation ability +degradation or even performance gains compared to the full-context inference +baselines. Code is available at https://github.com/Osilly/dynamic_llava . + +
+
+ comment: Code is available at https://github.com/Osilly/dynamic_llava +
+
+
+
+
+ + ♻ ☆ Understanding complex crowd dynamics with generative neural simulators + + +
+ Understanding the dynamics of pedestrian crowds is an outstanding challenge +crucial for designing efficient urban infrastructure and ensuring safe crowd +management. To this end, both small-scale laboratory and large-scale real-world +measurements have been used. However, these approaches respectively lack +statistical resolution and parametric controllability, both essential to +discovering physical relationships underlying the complex stochastic dynamics +of crowds. Here, we establish an investigation paradigm that offers +laboratory-like controllability, while ensuring the statistical resolution of +large-scale real-world datasets. Using our data-driven Neural Crowd Simulator +(NeCS), which we train on large-scale data and validate against key statistical +features of crowd dynamics, we show that we can perform effective surrogate +crowd dynamics experiments without training on specific scenarios. We not only +reproduce known experimental results on pairwise avoidance, but also uncover +the vision-guided and topological nature of N-body interactions. These findings +show how virtual experiments based on neural simulation enable data-driven +scientific discovery. + +
+
+ comment: 26 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Paired Autoencoders for Likelihood-free Estimation in Inverse Problems + + +
+ We consider the solution of nonlinear inverse problems where the forward +problem is a discretization of a partial differential equation. Such problems +are notoriously difficult to solve in practice and require minimizing a +combination of a data-fit term and a regularization term. The main +computational bottleneck of typical algorithms is the direct estimation of the +data misfit. Therefore, likelihood-free approaches have become appealing +alternatives. Nonetheless, difficulties in generalization and limitations in +accuracy have hindered their broader utility and applicability. In this work, +we use a paired autoencoder framework as a likelihood-free estimator for +inverse problems. We show that the use of such an architecture allows us to +construct a solution efficiently and to overcome some known open problems when +using likelihood-free estimators. In particular, our framework can assess the +quality of the solution and improve on it if needed. We demonstrate the +viability of our approach using examples from full waveform inversion and +inverse electromagnetic imaging. + +
+
+ comment: 18 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Fast and reliable uncertainty quantification with neural network + ensembles for industrial image classification + + +
+ Image classification with neural networks (NNs) is widely used in industrial +processes, situations where the model likely encounters unknown objects during +deployment, i.e., out-of-distribution (OOD) data. Worryingly, NNs tend to make +confident yet incorrect predictions when confronted with OOD data. To increase +the models' reliability, they should quantify the uncertainty in their own +predictions, communicating when the output should (not) be trusted. Deep +ensembles, composed of multiple independent NNs, have been shown to perform +strongly but are computationally expensive. Recent research has proposed more +efficient NN ensembles, namely the snapshot, batch, and multi-input +multi-output ensemble. This study investigates the predictive and uncertainty +performance of efficient NN ensembles in the context of image classification +for industrial processes. It is the first to provide a comprehensive comparison +and it proposes a novel Diversity Quality metric to quantify the ensembles' +performance on the in-distribution and OOD sets in one single metric. The +results highlight the batch ensemble as a cost-effective and competitive +alternative to the deep ensemble. It matches the deep ensemble in both +uncertainty and accuracy while exhibiting considerable savings in training +time, test time, and memory storage. + +
+
+ comment: Submitted to Annals of Operations Research +
+
+
+
+
+ + ♻ ☆ Learning to Predict Structural Vibrations + + +
+ In mechanical structures like airplanes, cars and houses, noise is generated +and transmitted through vibrations. To take measures to reduce this noise, +vibrations need to be simulated with expensive numerical computations. Deep +learning surrogate models present a promising alternative to classical +numerical simulations as they can be evaluated magnitudes faster, while +trading-off accuracy. To quantify such trade-offs systematically and foster the +development of methods, we present a benchmark on the task of predicting the +vibration of harmonically excited plates. The benchmark features a total of +12,000 plate geometries with varying forms of beadings, material, boundary +conditions, load position and sizes with associated numerical solutions. To +address the benchmark task, we propose a new network architecture, named +Frequency-Query Operator, which predicts vibration patterns of plate geometries +given a specific excitation frequency. Applying principles from operator +learning and implicit models for shape encoding, our approach effectively +addresses the prediction of highly variable frequency response functions +occurring in dynamic systems. To quantify the prediction quality, we introduce +a set of evaluation metrics and evaluate the method on our vibrating-plates +benchmark. Our method outperforms DeepONets, Fourier Neural Operators and more +traditional neural network architectures and can be used for design +optimization. Code, dataset and visualizations: +https://github.com/ecker-lab/Learning_Vibrating_Plates + +
+
+ comment: Accepted at Neurips 2024 +
+
+
+
+
+ + ♻ ☆ SpaCE: The Spatial Confounding Environment + + +
+ Spatial confounding poses a significant challenge in scientific studies +involving spatial data, where unobserved spatial variables can influence both +treatment and outcome, possibly leading to spurious associations. To address +this problem, we introduce SpaCE: The Spatial Confounding Environment, the +first toolkit to provide realistic benchmark datasets and tools for +systematically evaluating causal inference methods designed to alleviate +spatial confounding. Each dataset includes training data, true counterfactuals, +a spatial graph with coordinates, and smoothness and confounding scores +characterizing the effect of a missing spatial confounder. It also includes +realistic semi-synthetic outcomes and counterfactuals, generated using +state-of-the-art machine learning ensembles, following best practices for +causal inference benchmarks. The datasets cover real treatment and covariates +from diverse domains, including climate, health and social sciences. SpaCE +facilitates an automated end-to-end pipeline, simplifying data loading, +experimental setup, and evaluating machine learning and causal inference +models. The SpaCE project provides several dozens of datasets of diverse sizes +and spatial complexity. It is publicly available as a Python package, +encouraging community feedback and contributions. + +
+
+
+
+
+ + ♻ ☆ A Probabilistic Perspective on Unlearning and Alignment for Large + Language Models + + +
+ Comprehensive evaluation of Large Language Models (LLMs) is an open research +problem. Existing evaluations rely on deterministic point estimates generated +via greedy decoding. However, we find that deterministic evaluations fail to +capture the whole output distribution of a model, yielding inaccurate +estimations of model capabilities. This is particularly problematic in critical +contexts such as unlearning and alignment, where precise model evaluations are +crucial. To remedy this, we introduce the first formal probabilistic evaluation +framework in LLMs. Namely, we derive novel metrics with high-probability +guarantees concerning the output distribution of a model. Our metrics are +application-independent and allow practitioners to make more reliable estimates +about model capabilities before deployment. Through a case study focused on +unlearning, we reveal that deterministic evaluations falsely indicate +successful unlearning, whereas our probabilistic evaluations demonstrate that +most if not all of the supposedly unlearned information remains accessible in +these models. Additionally, we propose a novel unlearning loss based on entropy +optimization and adaptive temperature scaling, which significantly improves +unlearning in probabilistic settings on recent benchmarks. Our proposed shift +from point estimates to probabilistic evaluations of output distributions +represents an important step toward comprehensive evaluations of LLMs. Code +available at https://github.com/yascho/probabilistic-unlearning. + +
+
+
+
+
+ + ♻ ☆ Harnessing Preference Optimisation in Protein LMs for Hit Maturation in + Cell Therapy + + +
+ Cell and immunotherapy offer transformative potential for treating diseases +like cancer and autoimmune disorders by modulating the immune system. The +development of these therapies is resource-intensive, with the majority of drug +candidates failing to progress beyond laboratory testing. While recent advances +in machine learning have revolutionised areas such as protein engineering, +applications in immunotherapy remain limited due to the scarcity of +large-scale, standardised datasets and the complexity of cellular systems. In +this work, we address these challenges by leveraging a high-throughput +experimental platform to generate data suitable for fine-tuning protein +language models. We demonstrate how models fine-tuned using a preference task +show surprising correlations to biological assays, and how they can be +leveraged for few-shot hit maturation in CARs. This proof-of-concept presents a +novel pathway for applying ML to immunotherapy and could generalise to other +therapeutic modalities. + +
+
+
+
+
+ + ♻ ☆ Supervised Multiple Kernel Learning approaches for multi-omics data + integration + + +
+ Advances in high-throughput technologies have led to an ever-increasing +availability of omics datasets. The integration of multiple heterogeneous data +sources is currently an issue for biology and bioinformatics. Multiple kernel +learning (MKL) has been shown to be a flexible and valid approach to consider the +diverse nature of multi-omics inputs, despite being an underused tool in +genomic data mining. We provide novel MKL approaches based on different kernel +fusion strategies. To learn from the meta-kernel of input kernels, we adapted +unsupervised integration algorithms for supervised tasks with support vector +machines. We also tested deep learning architectures for kernel fusion and +classification. The results show that MKL-based models can outperform more +complex, state-of-the-art, supervised multi-omics integrative approaches. +Multiple kernel learning offers a natural framework for predictive models in +multi-omics data. It proved to provide a fast and reliable solution that can +compete with and outperform more complex architectures. Our results offer a +direction for bio-data mining research, biomarker discovery and further +development of methods for heterogeneous data integration. + +
+
+
+
+
+ + ♻ ☆ The Descriptive Complexity of Graph Neural Networks + + +
+ We analyse the power of graph neural networks (GNNs) in terms of Boolean +circuit complexity and descriptive complexity. + We prove that the graph queries that can be computed by a polynomial-size +bounded-depth family of GNNs are exactly those definable in the guarded +fragment GFO+C of first-order logic with counting and with built-in relations. +This puts GNNs in the circuit complexity class (non-uniform) $\text{TC}^0$. +Remarkably, the GNN families may use arbitrary real weights and a wide class of +activation functions that includes the standard ReLU, logistic "sigmoid", and +hyperbolic tangent functions. If the GNNs are allowed to use random +initialisation and global readout (both standard features of GNNs widely used +in practice), they can compute exactly the same queries as bounded depth +Boolean circuits with threshold gates, that is, exactly the queries in +$\text{TC}^0$. + Moreover, we show that queries computable by a single GNN with piecewise +linear activations and rational weights are definable in GFO+C without built-in +relations. Therefore, they are contained in uniform $\text{TC}^0$. + +
+
+ comment: Journal version for TheoretiCS +
+
+
+
+
+ + ♻ ☆ Training for Speech Recognition on Coprocessors + + +
+ Automatic Speech Recognition (ASR) has increased in popularity in recent +years. The evolution of processor and storage technologies has enabled more +advanced ASR mechanisms, fueling the development of virtual assistants such as +Amazon Alexa, Apple Siri, Microsoft Cortana, and Google Home. The interest in +such assistants, in turn, has amplified the novel developments in ASR research. +However, despite this popularity, there has not been a detailed training +efficiency analysis of modern ASR systems. This mainly stems from: the +proprietary nature of many modern applications that depend on ASR, like the +ones listed above; the relatively expensive co-processor hardware that is used +to accelerate ASR by big vendors to enable such applications; and the absence +of well-established benchmarks. The goal of this paper is to address the latter +two of these challenges. The paper first describes an ASR model, based on a +deep neural network inspired by recent work in this domain, and our experiences +building it. Then we evaluate this model on three CPU-GPU co-processor +platforms that represent different budget categories. Our results demonstrate +that utilizing hardware acceleration yields good results even without high-end +equipment. While the most expensive platform (10X price of the least expensive +one) converges to the initial accuracy target 10-30% and 60-70% faster than the +other two, the differences among the platforms almost disappear at slightly +higher accuracy targets. In addition, our results further highlight both the +difficulty of evaluating ASR systems due to the complex, long, and resource +intensive nature of the model training in this domain, and the importance of +establishing benchmarks for ASR. + +
+
+ comment: published at ADMS 2020 +
+
+
+
+
+ + ♻ ☆ Enhancing joint automatic chest X-ray diagnosis and clinical visual + attention prediction with multi-stage cooperative learning + + +
+ Purpose: As visual inspection is an inherent process during radiological +screening, the associated eye gaze data can provide valuable insights into +relevant clinical decisions. As deep learning has become the state-of-the-art +for computer-assisted diagnosis, integrating human behavior, such as eye gaze +data, into these systems is instrumental to help align machine predictions with +clinical diagnostic criteria, thus enhancing the quality of automatic +radiological diagnosis. Methods: We propose a novel deep learning framework for +joint disease diagnosis and prediction of corresponding clinical visual +attention maps for chest X-ray scans. Specifically, we introduce a new +dual-encoder multi-task UNet, which leverages both a DenseNet201 backbone and a +Residual and Squeeze-and-Excitation block-based encoder to extract diverse +features for visual attention map prediction, and a multi-scale feature-fusion +classifier to perform disease classification. To tackle the issue of +asynchronous training schedules of individual tasks in multi-task learning, we +proposed a multi-stage cooperative learning strategy, with contrastive learning +for feature encoder pretraining to boost performance. Results: Our proposed +method is shown to significantly outperform existing techniques for chest X-ray +diagnosis (AUC=0.93) and the quality of visual attention map prediction +(Correlation coefficient=0.58). Conclusion: Benefiting from the proposed +multi-task multi-stage cooperative learning, our technique demonstrates the +benefit of integrating clinicians' eye gaze into clinical AI systems to boost +performance and potentially explainability. + +
+
+
+
+
+ + ♻ ☆ PITN: Physics-Informed Temporal Networks for Cuffless Blood Pressure + Estimation + + +
+ Monitoring blood pressure with non-invasive sensors has gained popularity for +providing comfortable user experiences, one of which is a significant function +of smart wearables. Although providing a comfortable user experience, such +methods suffer from the demand for a significant amount of realistic +data to train an individual model for each subject, especially considering the +invasive or obtrusive BP ground-truth measurements. To tackle this challenge, +we introduce a novel physics-informed temporal network~(PITN) with adversarial +contrastive learning to enable precise BP estimation with very limited data. +Specifically, we first enhance the physics-informed neural network~(PINN) with +the temporal block for investigating BP dynamics' multi-periodicity for +personal cardiovascular cycle modeling and temporal variation. We then employ +adversarial training to generate extra physiological time series data, +improving PITN's robustness in the face of sparse subject-specific training +data. Furthermore, we utilize contrastive learning to capture the +discriminative variations of cardiovascular physiologic phenomena. This +approach aggregates physiological signals with similar blood pressure values in +latent space while separating clusters of samples with dissimilar blood +pressure values. Experiments on three widely-adopted datasets with different +modalities (\emph{i.e.,} bioimpedance, PPG, millimeter-wave) demonstrate the +superiority and effectiveness of the proposed methods over previous +state-of-the-art approaches. The code is available +at~\url{https://github.com/Zest86/ACL-PITN}. + +
+
+ comment: 12 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Flow Matching for Accelerated Simulation of Atomic Transport in + Materials + + +
+ We introduce LiFlow, a generative framework to accelerate molecular dynamics +(MD) simulations for crystalline materials that formulates the task as +conditional generation of atomic displacements. The model uses flow matching, +with a Propagator submodel to generate atomic displacements and a Corrector to +locally correct unphysical geometries, and incorporates an adaptive prior based +on the Maxwell-Boltzmann distribution to account for chemical and thermal +conditions. We benchmark LiFlow on a dataset comprising 25-ps trajectories of +lithium diffusion across 4,186 solid-state electrolyte (SSE) candidates at four +temperatures. The model obtains a consistent Spearman rank correlation of +0.7-0.8 for lithium mean squared displacement (MSD) predictions on unseen +compositions. Furthermore, LiFlow generalizes from short training trajectories +to larger supercells and longer simulations while maintaining high accuracy. +With speed-ups of up to 600,000$\times$ compared to first-principles methods, +LiFlow enables scalable simulations at significantly larger length and time +scales. + +
+
+
+
+
+ + ♻ ☆ Detection and Imputation based Two-Stage Denoising Diffusion Power + System Measurement Recovery under Cyber-Physical Uncertainties + + +
+ Power system cyber-physical uncertainties, including measurement ambiguities +stemming from cyber attacks and data losses, along with system uncertainties +introduced by massive renewables and complex dynamics, reduce the likelihood of +enhancing the quality of measurements. Fortunately, denoising diffusion models +exhibit powerful learning and generation abilities for the complex underlying +physics of the real world. To this end, this paper proposes an improved +detection and imputation based two-stage denoising diffusion model (TSDM) to +identify and reconstruct the measurements with various cyber-physical +uncertainties. The first stage of the model comprises a classifier-guided +conditional anomaly detection component, while the second stage involves +diffusion-based measurement imputation component. Moreover, the proposed TSDM +adopts optimal variance to accelerate the diffusion generation process with +subsequence sampling. Extensive numerical case studies demonstrate that the +proposed TSDM can accurately recover power system measurements despite +renewables-induced strong randomness and highly nonlinear dynamics. +Additionally, the proposed TSDM has stronger robustness compared to existing +reconstruction networks and exhibits lower computational complexity than +general denoising diffusion models. + +
+
+
+
+
+ + ♻ ☆ Latent Diffusion Model-Enabled Low-Latency Semantic Communication in the + Presence of Semantic Ambiguities and Wireless Channel Noises + + +
+ Deep learning (DL)-based Semantic Communications (SemCom) is becoming +critical to maximize overall efficiency of communication networks. +Nevertheless, SemCom is sensitive to wireless channel uncertainties, source +outliers, and suffer from poor generalization bottlenecks. To address the +mentioned challenges, this paper develops a latent diffusion model-enabled +SemCom system with three key contributions, i.e., i) to handle potential +outliers in the source data, semantic errors obtained by projected gradient +descent based on the vulnerabilities of DL models, are utilized to update the +parameters and obtain an outlier-robust encoder, ii) a lightweight single-layer +latent space transformation adapter completes one-shot learning at the +transmitter and is placed before the decoder at the receiver, enabling +adaptation for out-of-distribution data and enhancing human-perceptual quality, +and iii) an end-to-end consistency distillation (EECD) strategy is used to +distill the diffusion models trained in latent space, enabling deterministic +single or few-step low-latency denoising in various noisy channels while +maintaining high semantic quality. Extensive numerical experiments across +different datasets demonstrate the superiority of the proposed SemCom system, +consistently proving its robustness to outliers, the capability to transmit +data with unknown distributions, and the ability to perform real-time channel +denoising tasks while preserving high human perceptual quality, outperforming +the existing denoising approaches in semantic metrics like learned perceptual +image path similarity (LPIPS). + +
+
+
+
+
+ + ♻ ☆ Interpolation and differentiation of alchemical degrees of freedom in + machine learning interatomic potentials + + +
+ Machine learning interatomic potentials (MLIPs) have become a workhorse of +modern atomistic simulations, and recently published universal MLIPs, +pre-trained on large datasets, have demonstrated remarkable accuracy and +generalizability. However, the computational cost of MLIPs limits their +applicability to chemically disordered systems requiring large simulation cells +or to sample-intensive statistical methods. Here, we report the use of +continuous and differentiable alchemical degrees of freedom in atomistic +materials simulations, exploiting the fact that graph neural network MLIPs +represent discrete elements as real-valued tensors. The proposed method +introduces alchemical atoms with corresponding weights into the input graph, +alongside modifications to the message-passing and readout mechanisms of MLIPs, +and allows smooth interpolation between the compositional states of materials. +The end-to-end differentiability of MLIPs enables efficient calculation of the +gradient of energy with respect to the compositional weights. With this +modification, we propose methodologies for optimizing the composition of solid +solutions towards target macroscopic properties, characterizing order and +disorder in multicomponent oxides, and conducting alchemical free energy +simulations to quantify the free energy of vacancy formation and composition +changes. The approach offers an avenue for extending the capabilities of +universal MLIPs in the modeling of compositional disorder and characterizing +the phase stability of complex materials systems. + +
+
+
+
+
+ + ♻ ☆ Governance of Generative Artificial Intelligence for Companies + + +
+ Generative Artificial Intelligence (GenAI), specifically large language +models like ChatGPT, has swiftly entered organizations without adequate +governance, posing both opportunities and risks. Despite extensive debates on +GenAI's transformative nature and regulatory measures, limited research +addresses organizational governance, encompassing technical and business +perspectives. Although numerous frameworks for governance of AI exist, it is +not clear to what extent they apply to GenAI. Our review paper fills this gap +by surveying recent works with the purpose of better understanding fundamental +characteristics of GenAI and adjusting prior frameworks specifically towards +GenAI governance within companies. To do so, it extends Nickerson's framework +development processes to include prior conceptualizations. Our framework +outlines the scope, objectives, and governance mechanisms tailored to harness +business opportunities as well as mitigate risks associated with GenAI +integration. Our research contributes a focused approach to GenAI governance, +offering practical insights for companies navigating the challenges of GenAI +adoption and highlighting research gaps. + +
+
+
+
+
+ + ♻ ☆ LLM-ABBA: Understanding time series via symbolic approximation + + +
+ The success of large language models (LLMs) for time series has been
+demonstrated in previous work. Utilizing a symbolic time series representation,
+one can efficiently bridge the gap between LLMs and time series. However, the
+remaining challenge is to exploit the semantic information hidden in time
+series by using symbols or existing tokens of LLMs, while aligning the
+embedding space of LLMs according to the hidden information of time series. The
+symbolic time series approximation (STSA) method called adaptive Brownian
+bridge-based symbolic aggregation (ABBA) shows outstanding efficacy in
+preserving salient time series features by modeling time series patterns in
+terms of amplitude and period while using existing tokens of LLMs.
+ In this paper, we introduce a method, called LLM-ABBA, that integrates ABBA
+into large language models for various downstream time series tasks. By
+symbolizing time series, LLM-ABBA compares favorably to the recent
+state-of-the-art (SOTA) in UCR and three medical time series classification
+tasks. Meanwhile, a fixed-polygonal chain trick in ABBA is introduced to
+avoid obvious drifting during prediction tasks by significantly mitigating
+the effects of cumulative error arising from misused symbols during the
+transition from symbols to numerical values. In time series regression tasks,
+LLM-ABBA achieves the new SOTA on Time Series Extrinsic Regression (TSER)
+benchmarks. LLM-ABBA also shows competitive prediction capability compared to
+recent SOTA time series prediction results. We believe this framework can also
+seamlessly extend to other time series tasks.
+
+
+
+
+
+
+ + ♻ ☆ BInD: Bond and Interaction-generating Diffusion Model for + Multi-objective Structure-based Drug Design + + +
+ A remarkable advance in geometric deep generative models with accumulated +structural data enables structure-based drug design (SBDD) with target protein +information only. However, most existing models struggle to address +multi-objectives simultaneously while performing well only in their specialized +tasks. Here, we present BInD, a diffusion model with knowledge-based guidance +for multi-objective SBDD. BInD is designed to co-generate molecules and their +interactions with a target protein to consider all key objectives equally well, +including target-specific interactions, molecular properties, and local +geometry. Comprehensive evaluations show that BInD achieves robust performance +for all objectives while outperforming or matching state-of-the-art methods for +each. Finally, we propose a train-free optimization method empowered by +retrieving target-specific interactions, highlighting the role of non-covalent +interactions in achieving higher selectivity and binding affinities to a target +protein. + +
+
+
+
+
+ + ♻ ☆ Puzzle: Distillation-Based NAS for Inference-Optimized LLMs + + +
+ Large language models (LLMs) have demonstrated remarkable capabilities, but +their adoption is limited by high computational costs during inference. While +increasing parameter counts enhances accuracy, it also widens the gap between +state-of-the-art capabilities and practical deployability. We present Puzzle, a +framework to accelerate LLM inference on specific hardware while preserving +their capabilities. Through an innovative application of neural architecture +search (NAS) at an unprecedented scale, Puzzle systematically optimizes models +with tens of billions of parameters under hardware constraints. Our approach +utilizes blockwise local knowledge distillation (BLD) for parallel architecture +exploration and employs mixed-integer programming for precise constraint +optimization. + We demonstrate the real-world impact of our framework through +Llama-3.1-Nemotron-51B-Instruct (Nemotron-51B), a publicly available model +derived from Llama-3.1-70B-Instruct. Nemotron-51B achieves a 2.17x inference +throughput speedup, fitting on a single NVIDIA H100 GPU while preserving 98.4% +of the original model's capabilities. Nemotron-51B currently stands as the most +accurate language model capable of inference on a single GPU with large batch +sizes. Remarkably, this transformation required just 45B training tokens, +compared to over 15T tokens used for the 70B model it was derived from. This +establishes a new paradigm where powerful models can be optimized for efficient +deployment with only negligible compromise of their capabilities, demonstrating +that inference performance, not parameter count alone, should guide model +selection. With the release of Nemotron-51B and the presentation of the Puzzle +framework, we provide practitioners immediate access to state-of-the-art +language modeling capabilities at significantly reduced computational costs. + +
+
+
+
+
+ + ♻ ☆ Re-examining learning linear functions in context + + +
+ In-context learning (ICL) is an attractive method of solving a wide range of
+problems. Inspired by Garg et al. (2022), we look closely at ICL in a variety
+of train and test settings for several transformer models of different sizes
+trained from scratch. Our study complements prior work by pointing out several
+systematic failures of these models to generalize to data not in the training
+distribution, thereby showing some limitations of ICL. We find that models
+adopt a strategy for this task that is very different from standard solutions.
+
+
+
+
+
+
+ + ♻ ☆ Feudal Graph Reinforcement Learning + + +
+ Graph-based representations and message-passing modular policies constitute +prominent approaches to tackling composable control problems in reinforcement +learning (RL). However, as shown by recent graph deep learning literature, such +local message-passing operators can create information bottlenecks and hinder +global coordination. The issue becomes more serious in tasks requiring +high-level planning. In this work, we propose a novel methodology, named Feudal +Graph Reinforcement Learning (FGRL), that addresses such challenges by relying +on hierarchical RL and a pyramidal message-passing architecture. In particular, +FGRL defines a hierarchy of policies where high-level commands are propagated +from the top of the hierarchy down through a layered graph structure. The +bottom layers mimic the morphology of the physical system, while the upper +layers correspond to higher-order sub-modules. The resulting agents are then +characterized by a committee of policies where actions at a certain level set +goals for the level below, thus implementing a hierarchical decision-making +structure that can naturally implement task decomposition. We evaluate the +proposed framework on a graph clustering problem and MuJoCo locomotion tasks; +simulation results show that FGRL compares favorably against relevant +baselines. Furthermore, an in-depth analysis of the command propagation +mechanism provides evidence that the introduced message-passing scheme favors +learning hierarchical decision-making policies. + +
+
+
+
+
+ + ♻ ☆ OceanCastNet: A Deep Learning Ocean Wave Model with Energy Conservation + + +
+ Traditional wave forecasting models, although based on energy conservation +equations, are computationally expensive. On the other hand, existing deep +learning geophysical fluid models, while computationally efficient, often +suffer from issues such as energy dissipation in long-term forecasts. This +paper proposes a novel energy-balanced deep learning wave forecasting model +called OceanCastNet (OCN). By incorporating wind fields at the current, +previous, and future time steps, as well as wave fields at the current and +previous time steps as input variables, OCN maintains energy balance within the +model. Furthermore, the model employs adaptive Fourier operators as its core +components and designs a masked loss function to better handle the impact of +land-sea boundaries. A series of experiments on the ERA5 dataset demonstrate +that OCN can achieve short-term forecast accuracy comparable to traditional +models while exhibiting an understanding of the wave generation process. In +comparative experiments under both normal and extreme conditions, OCN +consistently outperforms the widely used WaveWatch III model in the industry. +Even after long-term forecasting, OCN maintains a stable and energy-rich state. +By further constructing a simple meteorological model, OCN-wind, which +considers energy balance, this paper confirms the importance of energy +constraints for improving the long-term forecast performance of deep learning +meteorological models. This finding provides new ideas for future research on +deep learning geophysical fluid models. + +
+
+
+
+
+ + ♻ ☆ FairML: A Julia Package for Fair Classification + + +
+ In this paper, we propose FairML.jl, a Julia package providing a framework
+for fair classification in machine learning. In this framework, the fair
+learning process is divided into three stages. Each stage aims to reduce
+unfairness, such as disparate impact and disparate mistreatment, in the final
+prediction. For the preprocessing stage, we present a resampling method that
+addresses unfairness coming from data imbalances. The in-processing phase
+consists of a classification method. This can be either one coming from the
+MLJ.jl package, or a user-defined one. For this phase, we incorporate fair ML
+methods that can handle unfairness to a certain degree through their
+optimization process. In the post-processing, we discuss the choice of the
+cut-off value for fair prediction. With simulations, we show the performance of
+the single phases and their combinations.
+
+
+
+ comment: 25 pages, 8 figures +
+
+
+
+
+
+ ♻ ☆ ASTM: Autonomous Smart Traffic Management System Using Artificial
+ Intelligence CNN and LSTM
+
+
+
+ In the modern world, the development of Artificial Intelligence (AI) has +contributed to improvements in various areas, including automation, computer +vision, fraud detection, and more. AI can be leveraged to enhance the +efficiency of Autonomous Smart Traffic Management (ASTM) systems and reduce +traffic congestion rates. This paper presents an Autonomous Smart Traffic +Management (STM) system that uses AI to improve traffic flow rates. The system +employs the YOLO V5 Convolutional Neural Network to detect vehicles in traffic +management images. Additionally, it predicts the number of vehicles for the +next 12 hours using a Recurrent Neural Network with Long Short-Term Memory +(RNN-LSTM). The Smart Traffic Management Cycle Length Analysis manages the +traffic cycle length based on these vehicle predictions, aided by AI. From the +results of the RNN-LSTM model for predicting vehicle numbers over the next 12 +hours, we observe that the model predicts traffic with a Mean Squared Error +(MSE) of 4.521 vehicles and a Root Mean Squared Error (RMSE) of 2.232 vehicles. +After simulating the STM system in the CARLA simulation environment, we found +that the Traffic Management Congestion Flow Rate with ASTM (21 vehicles per +minute) is 50\% higher than the rate without STM (around 15 vehicles per +minute). Additionally, the Traffic Management Vehicle Pass Delay with STM (5 +seconds per vehicle) is 70\% lower than without STM (around 12 seconds per +vehicle). These results demonstrate that the STM system using AI can increase +traffic flow by 50\% and reduce vehicle pass delays by 70\%. + +
+
+ comment: In process to IEEE Intelligent Vehicle Symposium 2025 +
+
+
+
+
+ + ♻ ☆ Equation-informed data-driven identification of flow budgets and + dynamics + + +
+ Computational Fluid Dynamics (CFD) is an indispensable method of fluid +modelling in engineering applications, reducing the need for physical +prototypes and testing for tasks such as design optimisation and performance +analysis. Depending on the complexity of the system under consideration, models +ranging from low to high fidelity can be used for prediction, allowing +significant speed-up. However, the choice of model requires information about +the actual dynamics of the flow regime. Correctly identifying the +regions/clusters of flow that share the same dynamics has been a challenging +research topic to date. In this study, we propose a novel hybrid approach to +flow clustering. It consists of characterising each sample point of the system +with equation-based features, i.e. features are budgets that represent the +contribution of each term from the original governing equation to the local +dynamics at each sample point. This was achieved by applying the Sparse +Identification of Nonlinear Dynamical systems (SINDy) method pointwise to time +evolution data. The method proceeds with equation-based clustering using the +Girvan-Newman algorithm. This allows the detection of communities that share +the same physical dynamics. The algorithm is implemented in both Eulerian and +Lagrangian frameworks. In the Lagrangian, i.e. dynamic approach, the clustering +is performed on the trajectory of each point, allowing the change of clusters +to be represented also in time. The performance of the algorithm is first +tested on a flow around a cylinder. The construction of the dynamic clusters in +this test case clearly shows the evolution of the wake from the steady state +solution through the transient to the oscillatory solution. Dynamic clustering +was then successfully tested on turbulent flow data. Two distinct and +well-defined clusters were identified and their temporal evolution was +reconstructed. + +
+
+
+
+
+ + ♻ ☆ Bigger, Regularized, Optimistic: scaling for compute and + sample-efficient continuous control NeurIPS 2024 + + +
+ Sample efficiency in Reinforcement Learning (RL) has traditionally been +driven by algorithmic enhancements. In this work, we demonstrate that scaling +can also lead to substantial improvements. We conduct a thorough investigation +into the interplay of scaling model capacity and domain-specific RL +enhancements. These empirical findings inform the design choices underlying our +proposed BRO (Bigger, Regularized, Optimistic) algorithm. The key innovation +behind BRO is that strong regularization allows for effective scaling of the +critic networks, which, paired with optimistic exploration, leads to superior +performance. BRO achieves state-of-the-art results, significantly outperforming +the leading model-based and model-free algorithms across 40 complex tasks from +the DeepMind Control, MetaWorld, and MyoSuite benchmarks. BRO is the first +model-free algorithm to achieve near-optimal policies in the notoriously +challenging Dog and Humanoid tasks. + +
+
+ comment: NeurIPS 2024 Spotlight +
+
+
+
+
+ + ♻ ☆ Multi-objective Deep Learning: Taxonomy and Survey of the State of the + Art + + +
+ Simultaneously considering multiple objectives in machine learning has been a +popular approach for several decades, with various benefits for multi-task +learning, the consideration of secondary goals such as sparsity, or +multicriteria hyperparameter tuning. However - as multi-objective optimization +is significantly more costly than single-objective optimization - the recent +focus on deep learning architectures poses considerable additional challenges +due to the very large number of parameters, strong nonlinearities and +stochasticity. This survey covers recent advancements in the area of +multi-objective deep learning. We introduce a taxonomy of existing methods - +based on the type of training algorithm as well as the decision maker's needs - +before listing recent advancements, and also successful applications. All three +main learning paradigms supervised learning, unsupervised learning and +reinforcement learning are covered, and we also address the recently very +popular area of generative modeling. + +
+
+
+
+
+ + ♻ ☆ Normalizing self-supervised learning for provably reliable Change Point + Detection + + +
+ Change point detection (CPD) methods aim to identify abrupt shifts in the +distribution of input data streams. Accurate estimators for this task are +crucial across various real-world scenarios. Yet, traditional unsupervised CPD +techniques face significant limitations, often relying on strong assumptions or +suffering from low expressive power due to inherent model simplicity. In +contrast, representation learning methods overcome these drawbacks by offering +flexibility and the ability to capture the full complexity of the data without +imposing restrictive assumptions. However, these approaches are still emerging +in the CPD field and lack robust theoretical foundations to ensure their +reliability. Our work addresses this gap by integrating the expressive power of +representation learning with the groundedness of traditional CPD techniques. We +adopt spectral normalization (SN) for deep representation learning in CPD tasks +and prove that the embeddings after SN are highly informative for CPD. Our +method significantly outperforms current state-of-the-art methods during the +comprehensive evaluation via three standard CPD datasets. + +
+
+
+
+
+ + ♻ ☆ Samba: Simple Hybrid State Space Models for Efficient Unlimited Context + Language Modeling + + +
+ Efficiently modeling sequences with infinite context length has long been a +challenging problem. Previous approaches have either suffered from quadratic +computational complexity or limited extrapolation ability in length +generalization. In this work, we present Samba, a simple hybrid architecture +that layer-wise combines Mamba, a selective State Space Model (SSM), with +Sliding Window Attention (SWA). Samba selectively compresses a given sequence +into recurrent hidden states while still maintaining the ability to precisely +recall recent memories with the attention mechanism. We scale Samba up to 3.8B +parameters with 3.2T training tokens and demonstrate that it significantly +outperforms state-of-the-art models across a variety of benchmarks. Pretrained +on sequences of 4K length, Samba shows improved perplexity in context lengths +of up to 1M in zero-shot. When finetuned on 4K-length sequences, Samba +efficiently extrapolates to a 256K context length with perfect memory recall on +the Passkey Retrieval task, and exhibits superior retrieval extrapolation on +the challenging Phonebook task compared to full-attention models. As a +linear-time sequence model, Samba achieves a 3.73x higher throughput compared +to Transformers with grouped-query attention for user prompts of 128K length, +and a 3.64x speedup when generating 64K tokens with unlimited streaming. Our +code for training on open source data is publicly available at +https://github.com/microsoft/Samba. + +
+
+
+
+
+ + ♻ ☆ Learning from Reduced Labels for Long-Tailed Data + + +
+ Long-tailed data is prevalent in real-world classification tasks and heavily
+relies on supervised information, which makes the annotation process
+exceptionally labor-intensive and time-consuming. Unfortunately, despite being
+a common approach to mitigate labeling costs, existing weakly supervised
+learning methods struggle to adequately preserve supervised information for
+tail samples, resulting in a decline in accuracy for the tail classes. To
+alleviate this problem, we introduce a novel weakly supervised labeling setting
+called Reduced Label. The proposed labeling setting not only avoids the decline
+of supervised information for the tail samples, but also decreases the labeling
+costs associated with long-tailed data. Additionally, we propose a
+straightforward and highly efficient unbiased framework with strong theoretical
+guarantees to learn from these Reduced Labels. Extensive experiments conducted
+on benchmark datasets including ImageNet validate the effectiveness of our
+approach, surpassing the performance of state-of-the-art weakly supervised
+methods.
+
+
+
+ comment: 11 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Demystifying Language Model Forgetting with Low-rank Example + Associations + + +
+ Large Language models (LLMs) suffer from forgetting of upstream data when +fine-tuned. Despite efforts on mitigating forgetting, few have investigated +whether, and how forgotten upstream examples are dependent on and associated +with newly learned tasks. Insights on such associations enable efficient and +targeted mitigation of forgetting. In this paper, we empirically analyze +forgetting (measured in log-perplexity increase) that occurs in $N$ upstream +examples of language modeling or instruction-tuning after fine-tuning LLMs on +one of $M$ new tasks, visualized in $M\times N$ matrices. We demonstrate that +the matrices display simple low-rank patterns, often well-approximated with +multiplicative scalar effects of upstream examples and newly learned tasks. We +also examine fine-grained associations with visualization and statistics. +Leveraging the low-rank nature of the associations, we predict forgetting of +upstream examples when fine-tuning on unseen tasks with matrix completion over +the empirical associations. This enables fast identification of most forgotten +examples without expensive inference on the entire upstream data. The approach, +despite simplicity, outperforms prior approaches that learn semantic +relationships of learned tasks and upstream examples with LMs for predicting +forgetting. We demonstrate the practical utility of our analysis by showing +statistically significantly reduced forgetting as we upweight predicted +examples for replay at fine-tuning. Project page: +https://inklab.usc.edu/lm-forgetting-prediction/ + +
+
+ comment: 10 pages; preprint +
+
+
+
+
+ + ♻ ☆ AutoGuide: Automated Generation and Selection of Context-Aware + Guidelines for Large Language Model Agents + + +
+ Recent advances in large language models (LLMs) have empowered AI agents +capable of performing various sequential decision-making tasks. However, +effectively guiding LLMs to perform well in unfamiliar domains like web +navigation, where they lack sufficient knowledge, has proven to be difficult +with the demonstration-based in-context learning paradigm. In this paper, we +introduce a novel framework, called AutoGuide, which addresses this limitation +by automatically generating context-aware guidelines from offline experiences. +Importantly, each context-aware guideline is expressed in concise natural +language and follows a conditional structure, clearly describing the context +where it is applicable. As a result, our guidelines facilitate the provision of +relevant knowledge for the agent's current decision-making process, overcoming +the limitations of the conventional demonstration-based learning paradigm. Our +evaluation demonstrates that AutoGuide significantly outperforms competitive +baselines in complex benchmark domains, including real-world web navigation. + +
+
+
+
+
+ + ♻ ☆ Center-Sensitive Kernel Optimization for Efficient On-Device Incremental + Learning + + +
+ To facilitate the evolution of edge intelligence in ever-changing +environments, we study on-device incremental learning constrained in limited +computation resource in this paper. Current on-device training methods just +focus on efficient training without considering the catastrophic forgetting, +preventing the model getting stronger when continually exploring the world. To +solve this problem, a direct solution is to involve the existing incremental +learning mechanisms into the on-device training framework. Unfortunately, such +a manner cannot work well as those mechanisms usually introduce large +additional computational cost to the network optimization process, which would +inevitably exceed the memory capacity of the edge devices. To address this +issue, this paper makes an early effort to propose a simple but effective +edge-friendly incremental learning framework. Based on an empirical study on +the knowledge intensity of the kernel elements of the neural network, we find +that the center kernel is the key for maximizing the knowledge intensity for +learning new data, while freezing the other kernel elements would get a good +balance on the model's capacity for overcoming catastrophic forgetting. Upon +this finding, we further design a center-sensitive kernel optimization +framework to largely alleviate the cost of the gradient computation and +back-propagation. Besides, a dynamic channel element selection strategy is also +proposed to facilitate a sparse orthogonal gradient projection for further +reducing the optimization complexity, upon the knowledge explored from the new +task data. Extensive experiments validate our method is efficient and +effective, e.g., our method achieves average accuracy boost of 38.08% with even +less memory and approximate computation compared to existing on-device training +methods, indicating its significant potential for on-device incremental +learning. + +
+
+
+
+
+ + ♻ ☆ VISION-XL: High Definition Video Inverse Problem Solver using Latent + Image Diffusion Models + + +
+ In this paper, we propose a novel framework for solving high-definition video +inverse problems using latent image diffusion models. Building on recent +advancements in spatio-temporal optimization for video inverse problems using +image diffusion models, our approach leverages latent-space diffusion models to +achieve enhanced video quality and resolution. To address the high +computational demands of processing high-resolution frames, we introduce a +pseudo-batch consistent sampling strategy, allowing efficient operation on a +single GPU. Additionally, to improve temporal consistency, we present +batch-consistent inversion, an initialization technique that incorporates +informative latents from the measurement frame. By integrating with SDXL, our +framework achieves state-of-the-art video reconstruction across a wide range of +spatio-temporal inverse problems, including complex combinations of frame +averaging and various spatial degradations, such as deblurring, +super-resolution, and inpainting. Unlike previous methods, our approach +supports multiple aspect ratios (landscape, vertical, and square) and delivers +HD-resolution reconstructions (exceeding 1280x720) in under 2.5 minutes on a +single NVIDIA 4090 GPU. + +
+
+ comment: Project page: https://vision-xl.github.io/ +
+
+
+
+
+ + ♻ ☆ Classical integrability in the presence of a cosmological constant: + analytic and machine learning results + + +
+ We study the integrability of two-dimensional theories that are obtained by a +dimensional reduction of certain four-dimensional gravitational theories +describing the coupling of Maxwell fields and neutral scalar fields to gravity +in the presence of a potential for the neutral scalar fields. For a certain +solution subspace, we demonstrate partial integrability by showing that a +subset of the equations of motion in two dimensions are the compatibility +conditions for a linear system. Subsequently, we study the integrability of +these two-dimensional models from a complementary one-dimensional point of +view, framed in terms of Liouville integrability. In this endeavour, we employ +various machine learning techniques to systematise our search for numerical Lax +pair matrices for these models, as well as conserved currents expressed as +functions of phase space variables. + +
+
+ comment: 38 pages, 9 figures, typographical corrections and assorted + improvements +
+
+
+
+
+ + ♻ ☆ Practical Parallel Algorithms for Non-Monotone Submodular Maximization AAAI-2023 + + +
+ Submodular maximization has found extensive applications in various domains +within the field of artificial intelligence, including but not limited to +machine learning, computer vision, and natural language processing. With the +increasing size of datasets in these domains, there is a pressing need to +develop efficient and parallelizable algorithms for submodular maximization. +One measure of the parallelizability of a submodular maximization algorithm is +its adaptive complexity, which indicates the number of sequential rounds where +a polynomial number of queries to the objective function can be executed in +parallel. In this paper, we study the problem of non-monotone submodular +maximization subject to a knapsack constraint, and propose the first +combinatorial algorithm achieving an $(8+\epsilon)$-approximation under +$\mathcal{O}(\log n)$ adaptive complexity, which is \textit{optimal} up to a +factor of $\mathcal{O}(\log\log n)$. Moreover, we also propose the first +algorithm with both provable approximation ratio and sublinear adaptive +complexity for the problem of non-monotone submodular maximization subject to a +$k$-system constraint. As a by-product, we show that our two algorithms can +also be applied to the special case of submodular maximization subject to a +cardinality constraint, and achieve performance bounds comparable with those of +state-of-the-art algorithms. Finally, the effectiveness of our approach is +demonstrated by extensive experiments on real-world applications. + +
+
+ comment: Part of the contribution appears in AAAI-2023 +
+
+
+
+
+ + ♻ ☆ Guardian of the Ensembles: Introducing Pairwise Adversarially Robust + Loss for Resisting Adversarial Attacks in DNN Ensembles WACV 2025 + + +
+ Adversarial attacks rely on transferability, where an adversarial example +(AE) crafted on a surrogate classifier tends to mislead a target classifier. +Recent ensemble methods demonstrate that AEs are less likely to mislead +multiple classifiers in an ensemble. This paper proposes a new ensemble +training using a Pairwise Adversarially Robust Loss (PARL) that by construction +produces an ensemble of classifiers with diverse decision boundaries. PARL +utilizes outputs and gradients of each layer with respect to network parameters +in every classifier within the ensemble simultaneously. PARL is demonstrated to +achieve higher robustness against black-box transfer attacks than previous +ensemble methods as well as adversarial training without adversely affecting +clean example accuracy. Extensive experiments using standard Resnet20, +WideResnet28-10 classifiers demonstrate the robustness of PARL against +state-of-the-art adversarial attacks. While maintaining similar clean accuracy +and lesser training time, the proposed architecture has a 24.8% increase in +robust accuracy ($\epsilon$ = 0.07) from the state-of-the art method. + +
+
+ comment: Accepted at IEEE/CVF Winter Conference on Applications of Computer + Vision (WACV 2025) +
+
+
+
+
+ + ♻ ☆ Bidirectional Decoding: Improving Action Chunking via Closed-Loop + Resampling + + +
+ Predicting and executing a sequence of actions without intermediate +replanning, known as action chunking, is increasingly used in robot learning +from human demonstrations. Yet, its reported effects on the learned policy are +inconsistent: some studies find it crucial for achieving strong results, while +others observe decreased performance. In this paper, we first dissect how +action chunking impacts the divergence between a learner and a demonstrator. We +find that action chunking allows the learner to better capture the temporal +dependencies in demonstrations but at the cost of reduced reactivity in +stochastic environments. To address this tradeoff, we propose Bidirectional +Decoding (BID), a test-time inference algorithm that bridges action chunking +with closed-loop operations. BID samples multiple predictions at each time step +and searches for the optimal one based on two criteria: (i) backward coherence, +which favors samples that align with previous decisions; (ii) forward contrast, +which seeks samples of high likelihood for future plans. By coupling decisions +within and across action chunks, BID promotes consistency over time while +maintaining reactivity to unexpected changes. Experimental results show that +BID boosts the performance of two state-of-the-art generative policies across +seven simulation benchmarks and two real-world tasks. Code and videos are +available at https://bid-robot.github.io. + +
+
+ comment: Project website: https://bid-robot.github.io/ +
+
+
+
+
+ + ♻ ☆ CultureLLM: Incorporating Cultural Differences into Large Language + Models NeurIPS 2024 + + +
+ Large language models (LLMs) are reported to be partial to certain cultures +owing to the training data dominance from the English corpora. Since +multilingual cultural data are often expensive to collect, existing efforts +handle this by prompt engineering or culture-specific pre-training. However, +they might overlook the knowledge deficiency of low-resource culture and +require extensive computing resources. In this paper, we propose CultureLLM, a +cost-effective solution to incorporate cultural differences into LLMs. +CultureLLM adopts World Value Survey (WVS) as seed data and generates +semantically equivalent training data via the proposed semantic data +augmentation. Using only 50 seed samples from WVS with augmented data, we +fine-tune culture-specific LLMs and one unified model (CultureLLM-One) for 9 +cultures covering rich and low-resource languages. Extensive experiments on 60 +culture-related datasets demonstrate that CultureLLM significantly outperforms +various counterparts such as GPT-3.5 (by 8.1%) and Gemini Pro (by 9.5%) with +comparable performance to GPT-4 or even better. Our human study shows that the +generated samples are semantically equivalent to the original samples, +providing an effective solution for LLMs augmentation. Code is released at +https://github.com/Scarelette/CultureLLM. + +
+
+ comment: NeurIPS 2024; Code is at https://github.com/Scarelette/CultureLLM +
+
+
+
+
+ + ♻ ☆ Harmful Fine-tuning Attacks and Defenses for Large Language Models: A + Survey + + +
+ Recent research demonstrates that the nascent fine-tuning-as-a-service +business model exposes serious safety concerns -- fine-tuning over a few +harmful data uploaded by the users can compromise the safety alignment of the +model. The attack, known as harmful fine-tuning attack, has raised a broad +research interest among the community. However, as the attack is still new, +\textbf{we observe that there are general misunderstandings within the research +community.} To clear up these concerns, this paper provides a comprehensive +overview of three aspects of harmful fine-tuning: attack settings, defense +design and evaluation methodology. Specifically, we first present the threat +model of the problem, and introduce the harmful fine-tuning attack and its +variants. Then we systematically survey the existing literature on +attacks/defenses/mechanical analysis of the problem. Finally, we introduce the +evaluation methodology and outline future research directions that might +contribute to the development of the field. Additionally, we present a list of +questions of interest, which might be useful to refer to when reviewers in the +peer review process question the realism of the experiment/attack/defense +setting. A curated list of relevant papers is maintained and made accessible +at: +https://github.com/git-disl/awesome_LLM-harmful-fine-tuning-papers. + +
+
+
+
+
+ + ♻ ☆ Intelligent Go-Explore: Standing on the Shoulders of Giant Foundation + Models + + +
+ Go-Explore is a powerful family of algorithms designed to solve +hard-exploration problems built on the principle of archiving discovered +states, and iteratively returning to and exploring from the most promising +states. This approach has led to superhuman performance across a wide variety +of challenging problems including Atari games and robotic control, but requires +manually designing heuristics to guide exploration (i.e., determine which +states to save and explore from, and what actions to consider next), which is +time-consuming and infeasible in general. To resolve this, we propose +Intelligent Go-Explore (IGE) which greatly extends the scope of the original +Go-Explore by replacing these handcrafted heuristics with the intelligence and +internalized human notions of interestingness captured by giant pretrained +foundation models (FMs). This provides IGE with a human-like ability to +instinctively identify how interesting or promising any new state is (e.g., +discovering new objects, locations, or behaviors), even in complex environments +where heuristics are hard to define. Moreover, IGE offers the exciting +opportunity to recognize and capitalize on serendipitous discoveries-states +encountered during exploration that are valuable in terms of exploration, yet +where what makes them interesting was not anticipated by the human user. We +evaluate our algorithm on a diverse range of language and vision-based tasks +that require search and exploration. Across these tasks, IGE strongly exceeds +classic reinforcement learning and graph search baselines, and also succeeds +where prior state-of-the-art FM agents like Reflexion completely fail. Overall, +Intelligent Go-Explore combines the tremendous strengths of FMs and the +powerful Go-Explore algorithm, opening up a new frontier of research into +creating more generally capable agents with impressive exploration +capabilities. + +
+
+
+
+
+ + ♻ ☆ FlickerFusion: Intra-trajectory Domain Generalizing Multi-Agent RL NeurIPS '24 + + +
+ Multi-agent reinforcement learning has demonstrated significant potential in +addressing complex cooperative tasks across various real-world applications. +However, existing MARL approaches often rely on the restrictive assumption that +the number of entities (e.g., agents, obstacles) remains constant between +training and inference. This overlooks scenarios where entities are dynamically +removed or added during the inference trajectory -- a common occurrence in +real-world environments like search and rescue missions and dynamic combat +situations. In this paper, we tackle the challenge of intra-trajectory dynamic +entity composition under zero-shot out-of-domain (OOD) generalization, where +such dynamic changes cannot be anticipated beforehand. Our empirical studies +reveal that existing MARL methods suffer significant performance degradation +and increased uncertainty in these scenarios. In response, we propose +FlickerFusion, a novel OOD generalization method that acts as a universally +applicable augmentation technique for MARL backbone methods. FlickerFusion +stochastically drops out parts of the observation space, emulating being +in-domain when inferenced OOD. The results show that FlickerFusion not only +achieves superior inference rewards but also uniquely reduces uncertainty +vis-\`a-vis the backbone, compared to existing methods. Benchmarks, +implementations, and model weights are organized and open-sourced at +flickerfusion305.github.io, accompanied by ample demo video renderings. + +
+
+ comment: NeurIPS '24 Open-World Agents Workshop +
+
+
+
+
+ + ♻ ☆ Privacy-Preserving Federated Learning via Homomorphic Adversarial + Networks + + +
+ Privacy-preserving federated learning (PPFL) aims to train a global model for +multiple clients while maintaining their data privacy. However, current PPFL +protocols exhibit one or more of the following insufficiencies: considerable +degradation in accuracy, the requirement for sharing keys, and cooperation +during the key generation or decryption processes. As a mitigation, we develop +the first protocol that utilizes neural networks to implement PPFL, as well as +incorporating an Aggregatable Hybrid Encryption scheme tailored to the needs of +PPFL. We name these networks as Homomorphic Adversarial Networks (HANs) which +demonstrate that neural networks are capable of performing tasks similar to +multi-key homomorphic encryption (MK-HE) while solving the problems of key +distribution and collaborative decryption. Our experiments show that HANs are +robust against privacy attacks. Compared with non-private federated learning, +experiments conducted on multiple datasets demonstrate that HANs exhibit a +negligible accuracy loss (at most 1.35%). Compared to traditional MK-HE +schemes, HANs increase encryption aggregation speed by 6,075 times while +incurring a 29.2 times increase in communication overhead. + +
+
+
+
+
+ + ♻ ☆ Investigating Privacy Leakage in Dimensionality Reduction Methods via + Reconstruction Attack + + +
+ This study investigates privacy leakage in dimensionality reduction methods +through a novel machine learning-based reconstruction attack. Employing an +informed adversary threat model, we develop a neural network capable of +reconstructing high-dimensional data from low-dimensional embeddings. + We evaluate six popular dimensionality reduction techniques: PCA, sparse +random projection (SRP), multidimensional scaling (MDS), Isomap, t-SNE, and +UMAP. Using both MNIST and NIH Chest X-ray datasets, we perform a qualitative +analysis to identify key factors affecting reconstruction quality. Furthermore, +we assess the effectiveness of an additive noise mechanism in mitigating these +reconstruction attacks. Our experimental results on both datasets reveal that +the attack is effective against deterministic methods (PCA and Isomap), but +ineffective against methods that employ random initialization (SRP, MDS, t-SNE +and UMAP). When adding the images with large noises before performing PCA or +Isomap, the attack produced severely distorted reconstructions. In contrast, +for the other four methods, the reconstructions still show some recognizable +features, though they bear little resemblance to the original images. + +
+
+ comment: Major revision +
+
+
+
+
+ + ♻ ☆ A Physics-embedded Deep Learning Framework for Cloth Simulation + + +
+ Delicate cloth simulations have long been desired in computer graphics. +Various methods were proposed to improve engaged force interactions, collision +handling, and numerical integrations. Deep learning has the potential to +achieve fast and real-time simulation, but common neural network structures +often demand many parameters to capture cloth dynamics. This paper proposes a +physics-embedded learning framework that directly encodes physical features of +cloth simulation. The convolutional neural network is used to represent spatial +correlations of the mass-spring system, after which three branches are designed +to learn linear, nonlinear, and time derivative features of cloth physics. The +framework can also integrate with other external forces and collision handling +through either traditional simulators or sub neural networks. The model is +tested across different cloth animation cases, without training with new data. +Agreement with baselines and predictive realism successfully validate its +generalization ability. Inference efficiency of the proposed model also +surpasses traditional physics simulation. This framework is also designed to +easily integrate with other visual refinement techniques like wrinkle carving, +which leaves significant chances to incorporate prevailing machine learning +techniques in 3D cloth animation. + +
+
+ comment: updated version +
+
+
+
+
+ + ♻ ☆ Yi-Lightning Technical Report + + +
+ This technical report presents Yi-Lightning, our latest flagship large +language model (LLM). It achieves exceptional performance, ranking 6th overall +on Chatbot Arena, with particularly strong results (2nd to 4th place) in +specialized categories including Chinese, Math, Coding, and Hard Prompts. +Yi-Lightning leverages an enhanced Mixture-of-Experts (MoE) architecture, +featuring advanced expert segmentation and routing mechanisms coupled with +optimized KV-caching techniques. Our development process encompasses +comprehensive pre-training, supervised fine-tuning (SFT), and reinforcement +learning from human feedback (RLHF), where we devise deliberate strategies for +multi-stage training, synthetic data construction, and reward modeling. +Furthermore, we implement RAISE (Responsible AI Safety Engine), a +four-component framework to address safety issues across pre-training, +post-training, and serving phases. Empowered by our scalable super-computing +infrastructure, all these innovations substantially reduce training, deployment +and inference costs while maintaining high-performance standards. With further +evaluations on public academic benchmarks, Yi-Lightning demonstrates +competitive performance against top-tier LLMs, while we observe a notable +disparity between traditional, static benchmark results and real-world, dynamic +human preferences. This observation prompts a critical reassessment of +conventional benchmarks' utility in guiding the development of more intelligent +and powerful AI systems for practical applications. Yi-Lightning is now +available through our developer platform at https://platform.lingyiwanwu.com. + +
+
+
+
+
+ + ♻ ☆ A Comprehensive Study of Shapley Value in Data Analytics + + +
+ Over the recent years, Shapley value (SV), a solution concept from +cooperative game theory, has found numerous applications in data analytics +(DA). This paper provides the first comprehensive study of SV used throughout +the DA workflow, which involves three main steps: data fabric, data +exploration, and result reporting. We summarize existing versatile forms of SV +used in these steps by a unified definition and clarify the essential +functionalities that SV can provide for data scientists. We categorize the arts +in this field based on the technical challenges they tackled, which include +computation efficiency, approximation error, privacy preservation, and +appropriate interpretations. We discuss these challenges and analyze the +corresponding solutions. We also implement SVBench, the first open-sourced +benchmark for developing SV applications, and conduct experiments on six DA +tasks to validate our analysis and discussions. Based on the qualitative and +quantitative results, we identify the limitations of current efforts for +applying SV to DA and highlight the directions of future research and +engineering. + +
+
+
+
+
+ + ♻ ☆ FSMLP: Modelling Channel Dependencies With Simplex Theory Based + Multi-Layer Perceptions In Frequency Domain + + +
+ Time series forecasting (TSF) plays a crucial role in various domains, +including web data analysis, energy consumption prediction, and weather +forecasting. While Multi-Layer Perceptrons (MLPs) are lightweight and effective +for capturing temporal dependencies, they are prone to overfitting when used to +model inter-channel dependencies. In this paper, we investigate the overfitting +problem in channel-wise MLPs using Rademacher complexity theory, revealing that +extreme values in time series data exacerbate this issue. To mitigate this +issue, we introduce a novel Simplex-MLP layer, where the weights are +constrained within a standard simplex. This strategy encourages the model to +learn simpler patterns and thereby reducing overfitting to extreme values. +Based on the Simplex-MLP layer, we propose a novel \textbf{F}requency +\textbf{S}implex \textbf{MLP} (FSMLP) framework for time series forecasting, +comprising of two kinds of modules: \textbf{S}implex +\textbf{C}hannel-\textbf{W}ise MLP (SCWM) and \textbf{F}requency +\textbf{T}emporal \textbf{M}LP (FTM). The SCWM effectively leverages the +Simplex-MLP to capture inter-channel dependencies, while the FTM is a simple +yet efficient temporal MLP designed to extract temporal information from the +data. Our theoretical analysis shows that the upper bound of the Rademacher +Complexity for Simplex-MLP is lower than that for standard MLPs. Moreover, we +validate our proposed method on seven benchmark datasets, demonstrating +significant improvements in forecasting accuracy and efficiency, while also +showcasing superior scalability. Additionally, we demonstrate that Simplex-MLP +can improve other methods that use channel-wise MLP to achieve less overfitting +and improved performance. Code are available +\href{https://github.com/FMLYD/FSMLP}{\textcolor{red}{here}}. + +
+
+
+
+
+ + ♻ ☆ NüshuRescue: Revitalization of the endangered Nüshu Language with AI COLING 2025 + + +
+ The preservation and revitalization of endangered and extinct languages is a +meaningful endeavor, conserving cultural heritage while enriching fields like +linguistics and anthropology. However, these languages are typically +low-resource, making their reconstruction labor-intensive and costly. This +challenge is exemplified by N\"ushu, a rare script historically used by Yao +women in China for self-expression within a patriarchal society. To address +this challenge, we introduce N\"ushuRescue, an AI-driven framework designed to +train large language models (LLMs) on endangered languages with minimal data. +N\"ushuRescue automates evaluation and expands target corpora to accelerate +linguistic revitalization. As a foundational component, we developed NCGold, a +500-sentence N\"ushu-Chinese parallel corpus, the first publicly available +dataset of its kind. Leveraging GPT-4-Turbo, with no prior exposure to N\"ushu +and only 35 short examples from NCGold, N\"ushuRescue achieved 48.69\% +translation accuracy on 50 withheld sentences and generated NCSilver, a set of +98 newly translated modern Chinese sentences of varying lengths. A sample of +both NCGold and NCSilver is included in the Supplementary Materials. +Additionally, we developed FastText-based and Seq2Seq models to further support +research on N\"ushu. N\"ushuRescue provides a versatile and scalable tool for +the revitalization of endangered languages, minimizing the need for extensive +human input. + +
+
+ comment: Accepted to COLING 2025 +
+
+
+
+
+ + ♻ ☆ CPRM: A LLM-based Continual Pre-training Framework for Relevance + Modeling in Commercial Search + + +
+ Relevance modeling between queries and items stands as a pivotal component in +commercial search engines, directly affecting the user experience. Given the +remarkable achievements of large language models (LLMs) in various natural +language processing (NLP) tasks, LLM-based relevance modeling is gradually +being adopted within industrial search systems. Nevertheless, foundational LLMs +lack domain-specific knowledge and do not fully exploit the potential of +in-context learning. Furthermore, structured item text remains underutilized, +and there is a shortage in the supply of corresponding queries and background +knowledge. We thereby propose CPRM (Continual Pre-training for Relevance +Modeling), a framework designed for the continual pre-training of LLMs to +address these issues. Our CPRM framework includes three modules: 1) employing +both queries and multi-field item to jointly pre-train for enhancing domain +knowledge, 2) applying in-context pre-training, a novel approach where LLMs are +pre-trained on a sequence of related queries or items, and 3) conducting +reading comprehension on items to produce associated domain knowledge and +background information (e.g., generating summaries and corresponding queries) +to further strengthen LLMs. Results on offline experiments and online A/B +testing demonstrate that our model achieves convincing performance compared to +strong baselines. + +
+
+
+
+
+ + ♻ ☆ Interventional Causal Discovery in a Mixture of DAGs NeurIPS 2024 + + +
+ Causal interactions among a group of variables are often modeled by a single +causal graph. In some domains, however, these interactions are best described +by multiple co-existing causal graphs, e.g., in dynamical systems or genomics. +This paper addresses the hitherto unknown role of interventions in learning +causal interactions among variables governed by a mixture of causal systems, +each modeled by one directed acyclic graph (DAG). Causal discovery from +mixtures is fundamentally more challenging than single-DAG causal discovery. +Two major difficulties stem from (i)~an inherent uncertainty about the +skeletons of the component DAGs that constitute the mixture and (ii)~possibly +cyclic relationships across these component DAGs. This paper addresses these +challenges and aims to identify edges that exist in at least one component DAG +of the mixture, referred to as the true edges. First, it establishes matching +necessary and sufficient conditions on the size of interventions required to +identify the true edges. Next, guided by the necessity results, an adaptive +algorithm is designed that learns all true edges using $O(n^2)$ interventions, +where $n$ is the number of nodes. Remarkably, the size of the interventions is +optimal if the underlying mixture model does not contain cycles across its +components. More generally, the gap between the intervention size used by the +algorithm and the optimal size is quantified. It is shown to be bounded by the +cyclic complexity number of the mixture model, defined as the size of the +minimal intervention that can break the cycles in the mixture, which is upper +bounded by the number of cycles among the ancestors of a node. + +
+
+ comment: NeurIPS 2024 camera-ready version +
+
+
+
+
+ + ♻ ☆ DFRot: Achieving Outlier-Free and Massive Activation-Free for Rotated + LLMs with Refined Rotation + + +
+ Rotating the activation and weight matrices to reduce the influence of +outliers in large language models (LLMs) has recently attracted significant +attention, particularly in the context of model quantization. Prior studies +have shown that in low-precision quantization scenarios, such as 4-bit weights +and 4-bit activations (W4A4), randomized Hadamard transforms can achieve +significantly higher accuracy than randomized orthogonal transforms. Notably, +the reason behind this phenomena remains unknown. In this paper, we find that +these transformations show substantial improvement in eliminating outliers for +common tokens and achieve similar quantization error. The primary reason for +the accuracy difference lies in the fact that randomized Hadamard transforms +can slightly reduce the quantization error for tokens with massive activations +while randomized orthogonal transforms increase the quantization error. Due to +the extreme rarity of these tokens and their critical impact on model accuracy, +we consider this a long-tail optimization problem, and therefore construct a +simple yet effective method: a weighted loss function. Additionally, we propose +an optimization strategy for the rotation matrix that involves alternating +optimization of quantization parameters while employing orthogonal Procrustes +transforms to refine the rotation matrix. This makes the distribution of the +rotated activation values more conducive to quantization, especially for tokens +with massive activations. Our method enhances the Rotated LLMs by achieving +dual free, Outlier-Free and Massive Activation-Free, dubbed as DFRot. Extensive +experiments demonstrate the effectiveness and efficiency of DFRot. By tuning +the rotation matrix using just a single sample, DFRot achieves a perplexity +improvement of 0.25 and 0.21 on W4A4KV4 and W4A4KV16, respectively, for +LLaMA3-8B, a model known for its quantization challenges. + +
+
+ comment: 24 pages, 38 figures, source code + \url{https://github.com/JingyangXiang/DFRot} +
+
+
+
+
+ + ♻ ☆ Towards Universal Mesh Movement Networks NeurIPS 2024 + + +
+ Solving complex Partial Differential Equations (PDEs) accurately and +efficiently is an essential and challenging problem in all scientific and +engineering disciplines. Mesh movement methods provide the capability to +improve the accuracy of the numerical solution without increasing the overall +mesh degree of freedom count. Conventional sophisticated mesh movement methods +are extremely expensive and struggle to handle scenarios with complex boundary +geometries. However, existing learning-based methods require re-training from +scratch given a different PDE type or boundary geometry, which limits their +applicability, and also often suffer from robustness issues in the form of +inverted elements. In this paper, we introduce the Universal Mesh Movement +Network (UM2N), which -- once trained -- can be applied in a non-intrusive, +zero-shot manner to move meshes with different size distributions and +structures, for solvers applicable to different PDE types and boundary +geometries. UM2N consists of a Graph Transformer (GT) encoder for extracting +features and a Graph Attention Network (GAT) based decoder for moving the mesh. +We evaluate our method on advection and Navier-Stokes based examples, as well +as a real-world tsunami simulation case. Our method outperforms existing +learning-based mesh movement methods in terms of the benchmarks described +above. In comparison to the conventional sophisticated Monge-Amp\`ere +PDE-solver based method, our approach not only significantly accelerates mesh +movement, but also proves effective in scenarios where the conventional method +fails. Our project page is at https://erizmr.github.io/UM2N/. + +
+
+ comment: Accepted at NeurIPS 2024 as a spotlight paper +
+
+
+
+
+ + ♻ ☆ HLSFactory: A Framework Empowering High-Level Synthesis Datasets for + Machine Learning and Beyond + + +
+ Machine learning (ML) techniques have been applied to high-level synthesis +(HLS) flows for quality-of-result (QoR) prediction and design space exploration +(DSE). Nevertheless, the scarcity of accessible high-quality HLS datasets and +the complexity of building such datasets present challenges. Existing datasets +have limitations in terms of benchmark coverage, design space enumeration, +vendor extensibility, or lack of reproducible and extensible software for +dataset construction. Many works also lack user-friendly ways to add more +designs, limiting wider adoption of such datasets. In response to these +challenges, we introduce HLSFactory, a comprehensive framework designed to +facilitate the curation and generation of high-quality HLS design datasets. +HLSFactory has three main stages: 1) a design space expansion stage to +elaborate single HLS designs into large design spaces using various +optimization directives across multiple vendor tools, 2) a design synthesis +stage to execute HLS and FPGA tool flows concurrently across designs, and 3) a +data aggregation stage for extracting standardized data into packaged datasets +for ML usage. This tripartite architecture ensures broad design space coverage +via design space expansion and supports multiple vendor tools. Users can +contribute to each stage with their own HLS designs and synthesis results and +extend the framework itself with custom frontends and tool flows. We also +include an initial set of built-in designs from common HLS benchmarks curated +open-source HLS designs. We showcase the versatility and multi-functionality of +our framework through seven case studies: I) ML model for QoR prediction; II) +Design space sampling; III) Fine-grained parallelism backend speedup; IV) +Targeting Intel's HLS flow; V) Adding new auxiliary designs; VI) Integrating +published HLS data; VII) HLS tool version regression benchmarking. + +
+
+ comment: MLCAD 2024 version of the paper. New case study with ML QoR + prediction. Artifact evaluation details included +
+
+
+
+
+
+
+
+ + Multimedia 4 + +
+
+
+ + ☆ AV-Odyssey Bench: Can Your Multimodal LLMs Really Understand + Audio-Visual Information? + + +
+ Recently, multimodal large language models (MLLMs), such as GPT-4o, Gemini +1.5 Pro, and Reka Core, have expanded their capabilities to include vision and +audio modalities. While these models demonstrate impressive performance across +a wide range of audio-visual applications, our proposed DeafTest reveals that +MLLMs often struggle with simple tasks humans find trivial: 1) determining +which of two sounds is louder, and 2) determining which of two sounds has a +higher pitch. Motivated by these observations, we introduce AV-Odyssey Bench, a +comprehensive audio-visual benchmark designed to assess whether those MLLMs can +truly understand the audio-visual information. This benchmark encompasses 4,555 +carefully crafted problems, each incorporating text, visual, and audio +components. To successfully infer answers, models must effectively leverage +clues from both visual and audio inputs. To ensure precise and objective +evaluation of MLLM responses, we have structured the questions as +multiple-choice, eliminating the need for human evaluation or LLM-assisted +assessment. We benchmark a series of closed-source and open-source models and +summarize the observations. By revealing the limitations of current models, we +aim to provide useful insight for future dataset collection and model +development. + +
+
+ comment: Project page: https://av-odyssey.github.io/ +
+
+
+
+
+ + ☆ Copy-Move Forgery Detection and Question Answering for Remote Sensing + Image + + +
+ This paper introduces the task of Remote Sensing Copy-Move Question Answering +(RSCMQA). Unlike traditional Remote Sensing Visual Question Answering (RSVQA), +RSCMQA focuses on interpreting complex tampering scenarios and inferring +relationships between objects. Based on the practical needs of national defense +security and land resource monitoring, we have developed an accurate and +comprehensive global dataset for remote sensing image copy-move question +answering, named RS-CMQA-2.1M. These images were collected from 29 different +regions across 14 countries. Additionally, we have refined a balanced dataset, +RS-CMQA-B, to address the long-standing issue of long-tail data in the remote +sensing field. Furthermore, we propose a region-discriminative guided +multimodal CMQA model, which enhances the accuracy of answering questions about +tampered images by leveraging prompt about the differences and connections +between the source and tampered domains. Extensive experiments demonstrate that +our method provides a stronger benchmark for RS-CMQA compared to general VQA +and RSVQA models. Our dataset and code are available at +https://github.com/shenyedepisa/RSCMQA. + +
+
+ comment: 7 figs, 7 tables +
+
+
+
+
+ + ☆ It Takes Two: Real-time Co-Speech Two-person's Interaction Generation + via Reactive Auto-regressive Diffusion Model + + +
+ Conversational scenarios are very common in real-world settings, yet existing +co-speech motion synthesis approaches often fall short in these contexts, where +one person's audio and gestures will influence the other's responses. +Additionally, most existing methods rely on offline sequence-to-sequence +frameworks, which are unsuitable for online applications. In this work, we +introduce an audio-driven, auto-regressive system designed to synthesize +dynamic movements for two characters during a conversation. At the core of our +approach is a diffusion-based full-body motion synthesis model, which is +conditioned on the past states of both characters, speech audio, and a +task-oriented motion trajectory input, allowing for flexible spatial control. +To enhance the model's ability to learn diverse interactions, we have enriched +existing two-person conversational motion datasets with more dynamic and +interactive motions. We evaluate our system through multiple experiments to +show it outperforms across a variety of tasks, including single and two-person +co-speech motion generation, as well as interactive motion generation. To the +best of our knowledge, this is the first system capable of generating +interactive full-body motions for two characters from speech in an online +manner. + +
+
+ comment: 15 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Resource-Efficient Reference-Free Evaluation of Audio Captions + + +
+ To establish the trustworthiness of systems that automatically generate text +captions for audio, images and video, existing reference-free metrics rely on +large pretrained models which are impractical to accommodate in +resource-constrained settings. To address this, we propose some metrics to +elicit the model's confidence in its own generation. To assess how well these +metrics replace correctness measures that leverage reference captions, we test +their calibration with correctness measures. We discuss why some of these +confidence metrics align better with certain correctness measures. Further, we +provide insight into why temperature scaling of confidence metrics is +effective. Our main contribution is a suite of well-calibrated lightweight +confidence metrics for reference-free evaluation of captions in +resource-constrained settings. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 83 + +
+
+
+ + ♻ ☆ Moner: Motion Correction in Undersampled Radial MRI with Unsupervised + Neural Representation + + +
+ Motion correction (MoCo) in radial MRI is a challenging problem due to the +unpredictability of subject's motion. Current state-of-the-art (SOTA) MoCo +algorithms often use extensive high-quality MR images to pre-train neural +networks, obtaining excellent reconstructions. However, the need for +large-scale datasets significantly increases costs and limits model +generalization. In this work, we propose Moner, an unsupervised MoCo method +that jointly solves artifact-free MR images and accurate motion from +undersampled, rigid motion-corrupted k-space data, without requiring training +data. Our core idea is to leverage the continuous prior of implicit neural +representation (INR) to constrain this ill-posed inverse problem, enabling +ideal solutions. Specifically, we incorporate a quasi-static motion model into +the INR, granting its ability to correct subject's motion. To stabilize model +optimization, we reformulate radial MRI as a back-projection problem using the +Fourier-slice theorem. Additionally, we propose a novel coarse-to-fine hash +encoding strategy, significantly enhancing MoCo accuracy. Experiments on +multiple MRI datasets show our Moner achieves performance comparable to SOTA +MoCo techniques on in-domain data, while demonstrating significant improvements +on out-of-domain data. + +
+
+
+
+
+ + ♻ ☆ Understanding Generalizability of Diffusion Models Requires Rethinking + the Hidden Gaussian Structure + + +
+ In this work, we study the generalizability of diffusion models by looking +into the hidden properties of the learned score functions, which are +essentially a series of deep denoisers trained on various noise levels. We +observe that as diffusion models transition from memorization to +generalization, their corresponding nonlinear diffusion denoisers exhibit +increasing linearity. This discovery leads us to investigate the linear +counterparts of the nonlinear diffusion models, which are a series of linear +models trained to match the function mappings of the nonlinear diffusion +denoisers. Surprisingly, these linear denoisers are approximately the optimal +denoisers for a multivariate Gaussian distribution characterized by the +empirical mean and covariance of the training dataset. This finding implies +that diffusion models have the inductive bias towards capturing and utilizing +the Gaussian structure (covariance information) of the training dataset for +data generation. We empirically demonstrate that this inductive bias is a +unique property of diffusion models in the generalization regime, which becomes +increasingly evident when the model's capacity is relatively small compared to +the training dataset size. In the case that the model is highly +overparameterized, this inductive bias emerges during the initial training +phases before the model fully memorizes its training data. Our study provides +crucial insights into understanding the notable strong generalization +phenomenon recently observed in real-world diffusion models. + +
+
+
+
+
+ + ♻ ☆ OminiControl: Minimal and Universal Control for Diffusion Transformer + + +
+ In this paper, we introduce OminiControl, a highly versatile and +parameter-efficient framework that integrates image conditions into pre-trained +Diffusion Transformer (DiT) models. At its core, OminiControl leverages a +parameter reuse mechanism, enabling the DiT to encode image conditions using +itself as a powerful backbone and process them with its flexible multi-modal +attention processors. Unlike existing methods, which rely heavily on additional +encoder modules with complex architectures, OminiControl (1) effectively and +efficiently incorporates injected image conditions with only ~0.1% additional +parameters, and (2) addresses a wide range of image conditioning tasks in a +unified manner, including subject-driven generation and spatially-aligned +conditions such as edges, depth, and more. Remarkably, these capabilities are +achieved by training on images generated by the DiT itself, which is +particularly beneficial for subject-driven generation. Extensive evaluations +demonstrate that OminiControl outperforms existing UNet-based and DiT-adapted +models in both subject-driven and spatially-aligned conditional generation. +Additionally, we release our training dataset, Subjects200K, a diverse +collection of over 200,000 identity-consistent images, along with an efficient +data synthesis pipeline to advance research in subject-consistent generation. + +
+
+
+
+
+ + ♻ ☆ GuardSplat: Efficient and Robust Watermarking for 3D Gaussian Splatting + + +
+ 3D Gaussian Splatting (3DGS) has recently created impressive assets for +various applications. However, the copyright of these assets is not well +protected as existing watermarking methods are not suited for 3DGS considering +security, capacity, and invisibility. Besides, these methods often require +hours or even days for optimization, limiting the application scenarios. In +this paper, we propose GuardSplat, an innovative and efficient framework that +effectively protects the copyright of 3DGS assets. Specifically, 1) We first +propose a CLIP-guided Message Decoupling Optimization module for training the +message decoder, leveraging CLIP's aligning capability and rich representations +to achieve a high extraction accuracy with minimal optimization costs, +presenting exceptional capability and efficiency. 2) Then, we propose a +Spherical-harmonic-aware (SH-aware) Message Embedding module tailored for 3DGS, +which employs a set of SH offsets to seamlessly embed the message into the SH +features of each 3D Gaussian while maintaining the original 3D structure. It +enables the 3DGS assets to be watermarked with minimal fidelity trade-offs and +prevents malicious users from removing the messages from the model files, +meeting the demands for invisibility and security. 3) We further propose an +Anti-distortion Message Extraction module to improve robustness against various +visual distortions. Extensive experiments demonstrate that GuardSplat +outperforms the state-of-the-art methods and achieves fast optimization speed. + +
+
+ comment: Project page: https://narcissusex.github.io/GuardSplat and Code: + https://github.com/NarcissusEx/GuardSplat +
+
+
+
+
+ + ♻ ☆ ConvMixFormer- A Resource-efficient Convolution Mixer for + Transformer-based Dynamic Hand Gesture Recognition + + +
+ Transformer models have demonstrated remarkable success in many domains such +as natural language processing (NLP) and computer vision. With the growing +interest in transformer-based architectures, they are now utilized for gesture +recognition. So, we also explore and devise a novel ConvMixFormer architecture +for dynamic hand gestures. The transformers use quadratic scaling of the +attention features with the sequential data, due to which these models are +computationally complex and heavy. We have considered this drawback of the +transformer and designed a resource-efficient model that replaces the +self-attention in the transformer with the simple convolutional layer-based +token mixer. The computational cost and the parameters used for the +convolution-based mixer are comparatively less than the quadratic +self-attention. Convolution-mixer helps the model capture the local spatial +features that self-attention struggles to capture due to their sequential +processing nature. Further, an efficient gate mechanism is employed instead of +a conventional feed-forward network in the transformer to help the model +control the flow of features within different stages of the proposed model. +This design uses fewer learnable parameters which is nearly half the vanilla +transformer that helps in fast and efficient training. The proposed method is +evaluated on NVidia Dynamic Hand Gesture and Briareo datasets and our model has +achieved state-of-the-art results on single and multimodal inputs. We have also +shown the parameter efficiency of the proposed ConvMixFormer model compared to +other methods. The source code is available at +https://github.com/mallikagarg/ConvMixFormer. + +
+
+
+
+
+ + ♻ ☆ Learning Temporally Consistent Video Depth from Video Diffusion Priors + + +
+ This work addresses the challenge of streamed video depth estimation, which +expects not only per-frame accuracy but, more importantly, cross-frame +consistency. We argue that sharing contextual information between frames or +clips is pivotal in fostering temporal consistency. Thus, instead of directly +developing a depth estimator from scratch, we reformulate this predictive task +into a conditional generation problem to provide contextual information within +a clip and across clips. Specifically, we propose a consistent context-aware +training and inference strategy for arbitrarily long videos to provide +cross-clip context. We sample independent noise levels for each frame within a +clip during training while using a sliding window strategy and initializing +overlapping frames with previously predicted frames without adding noise. +Moreover, we design an effective training strategy to provide context within a +clip. Extensive experimental results validate our design choices and +demonstrate the superiority of our approach, dubbed ChronoDepth. Project page: +https://xdimlab.github.io/ChronoDepth/. + +
+
+
+
+
+ + ♻ ☆ PartGS: Learning Part-aware 3D Representations by Fusing 2D Gaussians and + Superquadrics + + +
+ Low-level 3D representations, such as point clouds, meshes, NeRFs, and 3D +Gaussians, are commonly used to represent 3D objects or scenes. However, human +perception typically understands 3D objects at a higher level as a composition +of parts or structures rather than points or voxels. Representing 3D objects or +scenes as semantic parts can benefit further understanding and applications. In +this paper, we introduce $\textbf{PartGS}$, $\textbf{part}$-aware 3D +reconstruction by a hybrid representation of 2D $\textbf{G}$aussians and +$\textbf{S}$uperquadrics, which parses objects or scenes into semantic parts, +digging 3D structural clues from multi-view image inputs. Accurate structured +geometry reconstruction and high-quality rendering are achieved at the same +time. Our method simultaneously optimizes superquadric meshes and Gaussians by +coupling their parameters within our hybrid representation. On one hand, this +hybrid representation inherits the advantage of superquadrics to represent +different shape primitives, supporting flexible part decomposition of scenes. +On the other hand, 2D Gaussians capture complex texture and geometry details, +ensuring high-quality appearance and geometry reconstruction. Our method is +fully unsupervised and outperforms existing state-of-the-art approaches in +extensive experiments on DTU, ShapeNet, and real-life datasets. + +
+
+
+
+
+ + ♻ ☆ DGNN-YOLO: Dynamic Graph Neural Networks with YOLO11 for Small Object + Detection and Tracking in Traffic Surveillance + + +
+ Accurate detection and tracking of small objects such as pedestrians, +cyclists, and motorbikes are critical for traffic surveillance systems, which +are crucial in improving road safety and decision-making in intelligent +transportation systems. However, traditional methods struggle with challenges +such as occlusion, low resolution, and dynamic traffic conditions, +necessitating innovative approaches to address these limitations. This paper +introduces DGNN-YOLO, a novel framework integrating dynamic graph neural +networks (DGNN) with YOLO11 to enhance small object detection and tracking in +traffic surveillance systems. The framework leverages YOLO11's advanced spatial +feature extraction capabilities for precise object detection and incorporates +DGNN to model spatial-temporal relationships for robust real-time tracking +dynamically. By constructing and updating graph structures, DGNN-YOLO +effectively represents objects as nodes and their interactions as edges, +ensuring adaptive and accurate tracking in complex and dynamic environments. +Extensive experiments demonstrate that DGNN-YOLO consistently outperforms +state-of-the-art methods in detecting and tracking small objects under diverse +traffic conditions, achieving the highest precision (0.8382), recall (0.6875), +and mAP@0.5:0.95 (0.6476), showcasing its robustness and scalability, +particularly in challenging scenarios involving small and occluded objects. +This work provides a scalable, real-time traffic surveillance and analysis +solution, significantly contributing to intelligent transportation systems. + +
+
+
+
+
+ + ♻ ☆ Structure-Aware Human Body Reshaping with Adaptive Affinity-Graph + Network + + +
+ Given a source portrait, the automatic human body reshaping task aims at +editing it to an aesthetic body shape. As the technology has been widely used +in media, several methods have been proposed mainly focusing on generating +optical flow to warp the body shape. However, those previous works only +consider the local transformation of different body parts (arms, torso, and +legs), ignoring the global affinity, and limiting the capacity to ensure +consistency and quality across the entire body. In this paper, we propose a +novel Adaptive Affinity-Graph Network (AAGN), which extracts the global +affinity between different body parts to enhance the quality of the generated +optical flow. Specifically, our AAGN primarily introduces the following +designs: (1) we propose an Adaptive Affinity-Graph (AAG) Block that leverages +the characteristic of a fully connected graph. AAG represents different body +parts as nodes in an adaptive fully connected graph and captures all the +affinities between nodes to obtain a global affinity map. The design could +better improve the consistency between body parts. (2) Besides, for +high-frequency details are crucial for photo aesthetics, a Body Shape +Discriminator (BSD) is designed to extract information from both high-frequency +and spatial domain. Particularly, an SRM filter is utilized to extract +high-frequency details, which are combined with spatial features as input to +the BSD. With this design, BSD guides the Flow Generator (FG) to pay attention +to various fine details rather than rigid pixel-level fitting. Extensive +experiments conducted on the BR-5K dataset demonstrate that our framework +significantly enhances the aesthetic appeal of reshaped photos, surpassing all +previous work to achieve state-of-the-art in all evaluation metrics. + +
+
+ comment: 13 pages +
+
+
+
+
+ + ♻ ☆ Invertible Consistency Distillation for Text-Guided Image Editing in + Around 7 Steps + + +
+ Diffusion distillation represents a highly promising direction for achieving +faithful text-to-image generation in a few sampling steps. However, despite +recent successes, existing distilled models still do not provide the full +spectrum of diffusion abilities, such as real image inversion, which enables +many precise image manipulation methods. This work aims to enrich distilled +text-to-image diffusion models with the ability to effectively encode real +images into their latent space. To this end, we introduce invertible +Consistency Distillation (iCD), a generalized consistency distillation +framework that facilitates both high-quality image synthesis and accurate image +encoding in only 3-4 inference steps. Though the inversion problem for +text-to-image diffusion models gets exacerbated by high classifier-free +guidance scales, we notice that dynamic guidance significantly reduces +reconstruction errors without noticeable degradation in generation performance. +As a result, we demonstrate that iCD equipped with dynamic guidance may serve +as a highly effective tool for zero-shot text-guided image editing, competing +with more expensive state-of-the-art alternatives. + +
+
+ comment: Project page: https://yandex-research.github.io/invertible-cd/ +
+
+
+
+
+ + ♻ ☆ Brain Tumour Removing and Missing Modality Generation using 3D WDM + + +
+ This paper presents the second-placed solution for task 8 and the +participation solution for task 7 of BraTS 2024. The adoption of automated +brain analysis algorithms to support clinical practice is increasing. However, +many of these algorithms struggle with the presence of brain lesions or the +absence of certain MRI modalities. The alterations in the brain's morphology +leads to high variability and thus poor performance of predictive models that +were trained only on healthy brains. The lack of information that is usually +provided by some of the missing MRI modalities also reduces the reliability of +the prediction models trained with all modalities. In order to improve the +performance of these models, we propose the use of conditional 3D wavelet +diffusion models. The wavelet transform enabled full-resolution image training +and prediction on a GPU with 48 GB VRAM, without patching or downsampling, +preserving all information for prediction. The code for these tasks is +available at https://github.com/ShadowTwin41/BraTS_2023_2024_solutions. + +
+
+
+
+
+ + ♻ ☆ Deep Learning 2.0: Artificial Neurons That Matter -- Reject Correlation, + Embrace Orthogonality + + +
+ We introduce a yat-product-powered neural network, the Neural Matter Network +(NMN), a breakthrough in deep learning that achieves non-linear pattern +recognition without activation functions. Our key innovation relies on the +yat-product, which naturally induces non-linearity by +projecting inputs into a pseudo-metric space, eliminating the need for +traditional activation functions while maintaining only a softmax layer for +final class probability distribution. This approach simplifies network +architecture and provides unprecedented transparency into the network's +decision-making process. Our comprehensive empirical evaluation across +different datasets demonstrates that NMN consistently outperforms traditional +MLPs. The results challenge the assumption that separate activation functions +are necessary for effective deep-learning models. The implications of this work +extend beyond immediate architectural benefits, by eliminating intermediate +activation functions while preserving non-linear capabilities, yat-MLP +establishes a new paradigm for neural network design that combines simplicity +with effectiveness. Most importantly, our approach provides unprecedented +insights into the traditionally opaque "black-box" nature of neural networks, +offering a clearer understanding of how these models process and classify +information. + +
+
+ comment: fixed proof, added softermax +
+
+
+
+
+ + ♻ ☆ Deepfake for the Good: Generating Avatars through Face-Swapping with + Implicit Deepfake Generation + + +
+ Numerous emerging deep-learning techniques have had a substantial impact on +computer graphics. Among the most promising breakthroughs are the rise of +Neural Radiance Fields (NeRFs) and Gaussian Splatting (GS). NeRFs encode the +object's shape and color in neural network weights using a handful of images +with known camera positions to generate novel views. In contrast, GS provides +accelerated training and inference without a decrease in rendering quality by +encoding the object's characteristics in a collection of Gaussian +distributions. These two techniques have found many use cases in spatial +computing and other domains. On the other hand, the emergence of deepfake +methods has sparked considerable controversy. Deepfakes refers to artificial +intelligence-generated videos that closely mimic authentic footage. Using +generative models, they can modify facial features, enabling the creation of +altered identities or expressions that exhibit a remarkably realistic +appearance to a real person. Despite these controversies, deepfake can offer a +next-generation solution for avatar creation and gaming when of desirable +quality. To that end, we show how to combine all these emerging technologies to +obtain a more plausible outcome. Our ImplicitDeepfake uses the classical +deepfake algorithm to modify all training images separately and then train NeRF +and GS on modified faces. Such simple strategies can produce plausible 3D +deepfake-based avatars. + +
+
+
+
+
+ + ♻ ☆ Efficient Multi-modal Large Language Models via Visual Token Grouping + + +
+ The development of Multi-modal Large Language Models (MLLMs) enhances Large +Language Models (LLMs) with the ability to perceive data formats beyond text, +significantly advancing a range of downstream applications, such as visual +question answering and image captioning. However, the substantial computational +costs associated with processing high-resolution images and videos pose a +barrier to their broader adoption. To address this challenge, compressing +vision tokens in MLLMs has emerged as a promising approach to reduce inference +costs. While existing methods conduct token reduction in the feature alignment +phase. In this paper, we introduce VisToG, a novel grouping mechanism that +leverages the capabilities of pre-trained vision encoders to group similar +image segments without the need for segmentation masks. Specifically, we +concatenate semantic tokens to represent image semantic segments after the +linear projection layer before feeding into the vision encoder. Besides, with +the isolated attention we adopt, VisToG can identify and eliminate redundant +visual tokens utilizing the prior knowledge in the pre-trained vision encoder, +which effectively reduces computational demands. Extensive experiments +demonstrate the effectiveness of VisToG, maintaining 98.1% of the original +performance while achieving a reduction of over 27% inference time. + +
+
+
+
+
+ + ♻ ☆ GLOV: Guided Large Language Models as Implicit Optimizers for Vision + Language Models + + +
+ In this work, we propose a novel method (GLOV) enabling Large Language Models +(LLMs) to act as implicit Optimizers for Vision-Langugage Models (VLMs) to +enhance downstream vision tasks. Our GLOV meta-prompts an LLM with the +downstream task description, querying it for suitable VLM prompts (e.g., for +zero-shot classification with CLIP). These prompts are ranked according to a +purity measure obtained through a fitness function. In each respective +optimization step, the ranked prompts are fed as in-context examples (with +their accuracies) to equip the LLM with the knowledge of the type of text +prompts preferred by the downstream VLM. Furthermore, we also explicitly steer +the LLM generation process in each optimization step by specifically adding an +offset difference vector of the embeddings from the positive and negative +solutions found by the LLM, in previous optimization steps, to the intermediate +layer of the network for the next generation step. This offset vector steers +the LLM generation toward the type of language preferred by the downstream VLM, +resulting in enhanced performance on the downstream vision tasks. We +comprehensively evaluate our GLOV on 16 diverse datasets using two families of +VLMs, i.e., dual-encoder (e.g., CLIP) and encoder-decoder (e.g., LLaVa) models +-- showing that the discovered solutions can enhance the recognition +performance by up to 15.0% and 57.5% (3.8% and 21.6% on average) for these +models. + +
+
+ comment: Code: https://github.com/jmiemirza/GLOV +
+
+
+
+
+ + ♻ ☆ Free-Mask: A Novel Paradigm of Integration Between the Segmentation + Diffusion Model and Image Editing to Improve Segmentation Ability + + +
+ Current semantic segmentation models typically require a substantial amount +of manually annotated data, a process that is both time-consuming and +resource-intensive. Alternatively, leveraging advanced text-to-image models +such as Midjourney and Stable Diffusion has emerged as an efficient strategy, +enabling the automatic generation of synthetic data in place of manual +annotations. However, previous methods have been limited to generating +single-instance images, as the generation of multiple instances with Stable +Diffusion has proven unstable. To address this limitation and expand the scope +and diversity of synthetic datasets, we propose a framework \textbf{Free-Mask} +that combines a Diffusion Model for segmentation with advanced image editing +capabilities, allowing for the integration of multiple objects into images via +text-to-image models. Our method facilitates the creation of highly realistic +datasets that closely emulate open-world environments while generating accurate +segmentation masks. It reduces the labor associated with manual annotation and +also ensures precise mask generation. Experimental results demonstrate that +synthetic data generated by \textbf{Free-Mask} enables segmentation models to +outperform those trained on real data, especially in zero-shot settings. +Notably, \textbf{Free-Mask} achieves new state-of-the-art results on previously +unseen classes in the VOC 2012 benchmark. + +
+
+ comment: 16 pages,5 figures,5 tables +
+
+
+
+
+ + ♻ ☆ Anomaly Detection in Medical Imaging -- A Mini Review SC2021 + + +
+ The increasing digitization of medical imaging enables machine learning based +improvements in detecting, visualizing and segmenting lesions, easing the +workload for medical experts. However, supervised machine learning requires +reliable labelled data, which is often difficult or impossible to collect or +at least time consuming and thereby costly. Therefore methods requiring only +partly labeled data (semi-supervised) or no labeling at all (unsupervised +methods) have been applied more regularly. Anomaly detection is one possible +methodology that is able to leverage semi-supervised and unsupervised methods +to handle medical imaging tasks like classification and segmentation. This +paper uses a semi-exhaustive literature review of relevant anomaly detection +papers in medical imaging to cluster into applications, highlight important +results, establish lessons learned and give further advice on how to approach +anomaly detection in medical imaging. The qualitative analysis is based on +google scholar and 4 different search terms, resulting in 120 different +analysed papers. The main results showed that the current research is mostly +motivated by reducing the need for labelled data. Also, the successful and +substantial amount of research in the brain MRI domain shows the potential for +applications in further domains like OCT and chest X-ray. + +
+
+ comment: Accepted and presented at iDSC2021 edit: During work on this + publication Maximilian Ernst Tschuchnig was affiliated with Salzburg + University of Applied Sciences and University of Salzburg +
+
+
+
+
+ + ♻ ☆ Evaluation of Multi-Scale Multiple Instance Learning to Improve Thyroid + Cancer Classification + + +
+ Thyroid cancer is currently the fifth most common malignancy diagnosed in +women. Since differentiation of cancer sub-types is important for treatment and +current, manual methods are time consuming and subjective, automatic +computer-aided differentiation of cancer types is crucial. Manual +differentiation of thyroid cancer is based on tissue sections, analysed by +pathologists using histological features. Due to the enormous size of gigapixel +whole slide images, holistic classification using deep learning methods is not +feasible. Patch based multiple instance learning approaches, combined with +aggregations such as bag-of-words, is a common approach. This work's +contribution is to extend a patch based state-of-the-art method by generating +and combining feature vectors of three different patch resolutions and +analysing three distinct ways of combining them. The results showed +improvements in one of the three multi-scale approaches, while the others led +to decreased scores. This provides motivation for analysis and discussion of +the individual approaches. + +
+
+ comment: Accepted and presented at IPTA 2022 (Best Paper) edit: During work on + this publication Maximilian Ernst Tschuchnig was affiliated with Salzburg + University of Applied Sciences and University of Salzburg +
+
+
+
+
+ + ♻ ☆ Beyond Gaussians: Fast and High-Fidelity 3D Splatting with Linear + Kernels + + +
+ Recent advancements in 3D Gaussian Splatting (3DGS) have substantially +improved novel view synthesis, enabling high-quality reconstruction and +real-time rendering. However, blurring artifacts, such as floating primitives +and over-reconstruction, remain challenging. Current methods address these +issues by refining scene structure, enhancing geometric representations, +addressing blur in training images, improving rendering consistency, and +optimizing density control, yet the role of kernel design remains +underexplored. We identify the soft boundaries of Gaussian ellipsoids as one of +the causes of these artifacts, limiting detail capture in high-frequency +regions. To bridge this gap, we introduce 3D Linear Splatting (3DLS), which +replaces Gaussian kernels with linear kernels to achieve sharper and more +precise results, particularly in high-frequency regions. Through evaluations on +three datasets, 3DLS demonstrates state-of-the-art fidelity and accuracy, along +with a 30% FPS improvement over baseline 3DGS. The implementation will be made +publicly available upon acceptance. + +
+
+
+
+
+ + ♻ ☆ Multi-task Learning To Improve Semantic Segmentation Of CBCT Scans Using + Image Reconstruction + + +
+ Semantic segmentation is a crucial task in medical image processing, +essential for segmenting organs or lesions such as tumors. In this study we aim +to improve automated segmentation in CBCTs through multi-task learning. To +evaluate effects on different volume qualities, a CBCT dataset is synthesised +from the CT Liver Tumor Segmentation Benchmark (LiTS) dataset. To improve +segmentation, two approaches are investigated. First, we perform multi-task +learning to add morphology based regularization through a volume reconstruction +task. Second, we use this reconstruction task to reconstruct the best quality +CBCT (most similar to the original CT), facilitating denoising effects. We +explore both holistic and patch-based approaches. Our findings reveal that, +especially using a patch-based approach, multi-task learning improves +segmentation in most cases and that these results can further be improved by +our denoising approach. + +
+
+ comment: Accepted and presented at German Conference on Medical Image + Computing (BVM) 2024 edit: During work on this publication Maximilian Ernst + Tschuchnig was affiliated with Salzburg University of Applied Sciences and + University of Salzburg +
+
+
+
+
+ + ♻ ☆ RO-SVD: A Reconfigurable Hardware Copyright Protection Framework for + AIGC Applications + + +
+ The dramatic surge in the utilisation of generative artificial intelligence +(GenAI) underscores the need for a secure and efficient mechanism to +responsibly manage, use and disseminate multi-dimensional data generated by +artificial intelligence (AI). In this paper, we propose a blockchain-based +copyright traceability framework called ring oscillator-singular value +decomposition (RO-SVD), which introduces decomposition computing to approximate +low-rank matrices generated from hardware entropy sources and establishes an +AI-generated content (AIGC) copyright traceability mechanism at the device +level. By leveraging the parallelism and reconfigurability of +field-programmable gate arrays (FPGAs), our framework can be easily constructed +on existing AI-accelerated devices and provide a low-cost solution to emerging +copyright issues of AIGC. We developed a hardware-software (HW/SW) co-design +prototype based on comprehensive analysis and on-board experiments with +multiple AI-applicable FPGAs. Using AI-generated images as a case study, our +framework demonstrated effectiveness and emphasised customisation, +unpredictability, efficiency, management and reconfigurability. To the best of +our knowledge, this is the first practical hardware study discussing and +implementing copyright traceability specifically for AI-generated content. + +
+
+ comment: Accepted on 20 May 2024 as a full paper at ASAP 2024 +
+
+
+
+
+ + ♻ ☆ Multimodal Perception System for Real Open Environment + + +
+ This paper presents a novel multimodal perception system for a real open +environment. The proposed system includes an embedded computation platform, +cameras, ultrasonic sensors, GPS, and IMU devices. Unlike the traditional +frameworks, our system integrates multiple sensors with advanced computer +vision algorithms to help users walk outside reliably. The system can +efficiently complete various tasks, including navigating to specific locations, +passing through obstacle regions, and crossing intersections. Specifically, we +also use ultrasonic sensors and depth cameras to enhance obstacle avoidance +performance. The path planning module is designed to find the locally optimal +route based on various feedback and the user's current state. To evaluate the +performance of the proposed system, we design several experiments under +different scenarios. The results show that the system can help users walk +efficiently and independently in complex situations. + +
+
+
+
+
+ + ♻ ☆ Advances in 3D Neural Stylization: A Survey + + +
+ Modern artificial intelligence offers a novel and transformative approach to +creating digital art across diverse styles and modalities like images, videos +and 3D data, unleashing the power of creativity and revolutionizing the way +that we perceive and interact with visual content. This paper reports on recent +advances in stylized 3D asset creation and manipulation with the expressive +power of neural networks. We establish a taxonomy for neural stylization, +considering crucial design choices such as scene representation, guidance data, +optimization strategies, and output styles. Building on such taxonomy, our +survey first revisits the background of neural stylization on 2D images, and +then presents in-depth discussions on recent neural stylization methods for 3D +data, accompanied by a benchmark evaluating selected mesh and neural field +stylization methods. Based on the insights gained from the survey, we highlight +the practical significance, open challenges, future research, and potential +impacts of neural stylization, which facilitates researchers and practitioners +to navigate the rapidly evolving landscape of 3D content creation using modern +artificial intelligence. + +
+
+ comment: curated list of papers: + https://github.com/chenyingshu/advances_3d_neural_stylization +
+
+
+
+
+ + ♻ ☆ DUSt3R: Geometric 3D Vision Made Easy + + +
+ Multi-view stereo reconstruction (MVS) in the wild requires to first estimate +the camera parameters e.g. intrinsic and extrinsic parameters. These are +usually tedious and cumbersome to obtain, yet they are mandatory to triangulate +corresponding pixels in 3D space, which is the core of all best performing MVS +algorithms. In this work, we take an opposite stance and introduce DUSt3R, a +radically novel paradigm for Dense and Unconstrained Stereo 3D Reconstruction +of arbitrary image collections, i.e. operating without prior information about +camera calibration nor viewpoint poses. We cast the pairwise reconstruction +problem as a regression of pointmaps, relaxing the hard constraints of usual +projective camera models. We show that this formulation smoothly unifies the +monocular and binocular reconstruction cases. In the case where more than two +images are provided, we further propose a simple yet effective global alignment +strategy that expresses all pairwise pointmaps in a common reference frame. We +base our network architecture on standard Transformer encoders and decoders, +allowing us to leverage powerful pretrained models. Our formulation directly +provides a 3D model of the scene as well as depth information, but +interestingly, we can seamlessly recover from it, pixel matches, relative and +absolute camera. Exhaustive experiments on all these tasks showcase that the +proposed DUSt3R can unify various 3D vision tasks and set new SoTAs on +monocular/multi-view depth estimation as well as relative pose estimation. In +summary, DUSt3R makes many geometric 3D vision tasks easy. + +
+
+ comment: fixing the ref for StaticThings3D dataset +
+
+
+
+
+ + ♻ ☆ Pixel-aligned RGB-NIR Stereo Imaging and Dataset for Robot Vision + + +
+ Integrating RGB and NIR stereo imaging provides complementary spectral +information, potentially enhancing robotic 3D vision in challenging lighting +conditions. However, existing datasets and imaging systems lack pixel-level +alignment between RGB and NIR images, posing challenges for downstream vision +tasks. In this paper, we introduce a robotic vision system equipped with +pixel-aligned RGB-NIR stereo cameras and a LiDAR sensor mounted on a mobile +robot. The system simultaneously captures pixel-aligned pairs of RGB stereo +images, NIR stereo images, and temporally synchronized LiDAR points. Utilizing +the mobility of the robot, we present a dataset containing continuous video +frames under diverse lighting conditions. We then introduce two methods that +utilize the pixel-aligned RGB-NIR images: an RGB-NIR image fusion method and a +feature fusion method. The first approach enables existing RGB-pretrained +vision models to directly utilize RGB-NIR information without fine-tuning. The +second approach fine-tunes existing vision models to more effectively utilize +RGB-NIR information. Experimental results demonstrate the effectiveness of +using pixel-aligned RGB-NIR images across diverse lighting conditions. + +
+
+ comment: 8 pages for main article, 32 pages for supplemental document. Fix + typos +
+
+
+
+
+ + ♻ ☆ LV-UNet: A Lightweight and Vanilla Model for Medical Image Segmentation + + +
+ While large models have achieved significant progress in computer vision,
+challenges such as optimization complexity, the intricacy of transformer
+architectures, computational constraints, and practical application demands
+highlight the importance of simpler model designs in medical image
+segmentation. This need is particularly pronounced in mobile medical devices,
+which require lightweight, deployable models with real-time performance.
+However, existing lightweight models often suffer from poor robustness across
+datasets, limiting their widespread adoption. To address these challenges, this
+paper introduces LV-UNet, a lightweight and vanilla model that leverages
+pre-trained MobileNetv3-Large backbones and incorporates fusible modules.
+LV-UNet employs an enhanced deep training strategy and switches to a deployment
+mode during inference by re-parametrization, significantly reducing parameter
+count and computational overhead. Experimental results on ISIC 2016, BUSI,
+CVC-ClinicDB, CVC-ColonDB, and Kvasir-SEG datasets demonstrate a better
+trade-off between performance and the computational load. The code will be
+released at https://github.com/juntaoJianggavin/LV-UNet.
+
+
+
+ comment: Accepted by IEEE BIBM2024 ML4BMI workshop +
+
+
+
+
+ + ♻ ☆ Morph: A Motion-free Physics Optimization Framework for Human Motion + Generation + + +
+ Human motion generation plays a vital role in applications such as digital
+humans and humanoid robot control. However, most existing approaches disregard
+physics constraints, leading to the frequent production of physically
+implausible motions with pronounced artifacts such as floating and foot
+sliding. In this paper, we propose Morph, a Motion-free physics optimization
+framework, comprising a Motion Generator and a Motion Physics Refinement
+module, for enhancing physical plausibility without relying on costly
+real-world motion data. Specifically, the Motion Generator is responsible for
+providing large-scale synthetic motion data, while the Motion Physics
+Refinement Module utilizes these synthetic data to train a motion imitator
+within a physics simulator, enforcing physical constraints to project the noisy
+motions into a physically-plausible space. These physically refined motions, in
+turn, are used to fine-tune the Motion Generator, further enhancing its
+capability. Experiments on both text-to-motion and music-to-dance generation
+tasks demonstrate that our framework achieves state-of-the-art motion
+generation quality while improving physical plausibility drastically.
+
+
+
+ comment: 15 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Multi-View Large Reconstruction Model via Geometry-Aware Positional + Encoding and Attention + + +
+ Despite recent advancements in the Large Reconstruction Model (LRM)
+demonstrating impressive results, when extending its input from single image to
+multiple images, it exhibits inefficiencies, subpar geometric and texture
+quality, as well as slower convergence speed than expected. This is attributed
+to the fact that LRM formulates 3D reconstruction as a naive images-to-3D
+translation problem, ignoring the strong 3D coherence among the input images.
+In this paper, we propose a Multi-view Large Reconstruction Model (M-LRM)
+designed to reconstruct high-quality 3D shapes from multi-views in a 3D-aware
+manner. Specifically, we introduce a multi-view consistent cross-attention
+scheme to enable M-LRM to accurately query information from the input images.
+Moreover, we employ the 3D priors of the input multi-view images to initialize
+the triplane tokens. Compared to previous methods, the proposed M-LRM can
+generate 3D shapes of high fidelity. Experimental studies demonstrate that our
+model achieves a significant performance gain and faster training convergence.
+Project page: https://murphylmf.github.io/M-LRM/.
+
+
+
+
+
+
+ + ♻ ☆ AniFaceDiff: Animating Stylized Avatars via Parametric Conditioned + Diffusion Models + + +
+ Animating stylized avatars with dynamic poses and expressions has attracted +increasing attention for its broad range of applications. Previous research has +made significant progress by training controllable generative models to +synthesize animations based on reference characteristics, pose, and expression +conditions. However, the mechanisms used in these methods to control pose and +expression often inadvertently introduce unintended features from the target +motion, while also causing a loss of expression-related details, particularly +when applied to stylized animation. This paper proposes a new method based on +Stable Diffusion, called AniFaceDiff, incorporating a new conditioning module +for animating stylized avatars. First, we propose a refined spatial +conditioning approach by Facial Alignment to prevent the inclusion of identity +characteristics from the target motion. Then, we introduce an Expression +Adapter that incorporates additional cross-attention layers to address the +potential loss of expression-related information. Our approach effectively +preserves pose and expression from the target video while maintaining input +image consistency. Extensive experiments demonstrate that our method achieves +state-of-the-art results, showcasing superior image quality, preservation of +reference features, and expression accuracy, particularly for out-of-domain +animation across diverse styles, highlighting its versatility and strong +generalization capabilities. This work aims to enhance the quality of virtual +stylized animation for positive applications. To promote responsible use in +virtual environments, we contribute to the advancement of detection for +generative content by evaluating state-of-the-art detectors, highlighting +potential areas for improvement, and suggesting solutions. + +
+
+
+
+
+ + ♻ ☆ A Survey and Benchmark of Automatic Surface Reconstruction from Point + Clouds + + +
+ We present a comprehensive survey and benchmark of both traditional and +learning-based methods for surface reconstruction from point clouds. This task +is particularly challenging for real-world acquisitions due to factors such as +noise, outliers, non-uniform sampling, and missing data. Traditional approaches +often simplify the problem by imposing handcrafted priors on either the input +point clouds or the resulting surface, a process that can require tedious +hyperparameter tuning. In contrast, deep learning models have the capability to +directly learn the properties of input point clouds and desired surfaces from +data. We study the influence of handcrafted and learned priors on the precision +and robustness of surface reconstruction techniques. We evaluate various +time-tested and contemporary methods in a standardized manner. When both +trained and evaluated on point clouds with identical characteristics, the +learning-based models consistently produce higher-quality surfaces compared to +their traditional counterparts -- even in scenarios involving novel shape +categories. However, traditional methods demonstrate greater resilience to the +diverse anomalies commonly found in real-world 3D acquisitions. For the benefit +of the research community, we make our code and datasets available, inviting +further enhancements to learning-based surface reconstruction. This can be +accessed at https://github.com/raphaelsulzer/dsr-benchmark . + +
+
+ comment: 20 pages +
+
+
+
+
+ + ♻ ☆ VQA$^2$: Visual Question Answering for Video Quality Assessment + + +
+ The advent and proliferation of large multi-modal models (LMMs) have
+introduced new paradigms to computer vision, transforming various tasks into a
+unified visual question answering framework. Video Quality Assessment (VQA), a
+classic field in low-level visual perception, focused initially on quantitative
+video quality scoring. However, driven by advances in LMMs, it is now
+progressing toward more holistic visual quality understanding tasks. Recent
+studies in the image domain have demonstrated that Visual Question Answering
+(VQA) can markedly enhance low-level visual quality evaluation. Nevertheless,
+related work has not been explored in the video domain, leaving substantial
+room for improvement. To address this gap, we introduce the VQA2 Instruction
+Dataset - the first visual question answering instruction dataset that focuses
+on video quality assessment. This dataset consists of 3 subsets and covers
+various video types, containing 157,755 instruction question-answer pairs.
+Then, leveraging this foundation, we present the VQA2 series models. The VQA2
+series models interleave visual and motion tokens to enhance the perception of
+spatial-temporal quality details in videos. We conduct extensive experiments on
+video quality scoring and understanding tasks, and results demonstrate that the
+VQA2 series models achieve excellent performance in both tasks. Notably, our
+final model, the VQA2-Assistant, exceeds the renowned GPT-4o in visual quality
+understanding tasks while maintaining strong competitiveness in quality scoring
+tasks. Our work provides a foundation and feasible approach for integrating
+low-level video quality assessment and understanding with LMMs.
+
+
+
+ comment: 23 pages 12 figures +
+
+
+
+
+ + ♻ ☆ Revisiting MAE pre-training for 3D medical image segmentation + + +
+ Self-Supervised Learning (SSL) presents an exciting opportunity to unlock the +potential of vast, untapped clinical datasets, for various downstream +applications that suffer from the scarcity of labeled data. While SSL has +revolutionized fields like natural language processing and computer vision, its +adoption in 3D medical image computing has been limited by three key pitfalls: +Small pre-training dataset sizes, architectures inadequate for 3D medical image +analysis, and insufficient evaluation practices. In this paper, we address +these issues by i) leveraging a large-scale dataset of 39k 3D brain MRI volumes +and ii) using a Residual Encoder U-Net architecture within the state-of-the-art +nnU-Net framework. iii) A robust development framework, incorporating 5 +development and 8 testing brain MRI segmentation datasets, allowed +performance-driven design decisions to optimize the simple concept of Masked +Auto Encoders (MAEs) for 3D CNNs. The resulting model not only surpasses +previous SSL methods but also outperforms the strong nnU-Net baseline by an +average of approximately 3 Dice points setting a new state-of-the-art. Our code +and models are made available here. + +
+
+ comment: Arxiv Preprint. Revised and under review +
+
+
+
+
+ + ♻ ☆ An Architectural Approach to Enhance Deep Long-Tailed Learning + + +
+ Deep long-tailed recognition has been widely studied to address the issue of +imbalanced data distributions in real-world scenarios. However, there has been +insufficient focus on the design of neural architectures, despite empirical +evidence suggesting that architecture can significantly impact performance. In +this paper, we attempt to mitigate long-tailed issues through architectural +improvements. To simplify the design process, we utilize Differential +Architecture Search (DARTS) to achieve this goal. Unfortunately, existing DARTS +methods struggle to perform well in long-tailed scenarios. To tackle this +challenge, we introduce Long-Tailed Differential Architecture Search (LTDAS). +Specifically, we conduct extensive experiments to explore architectural +components that demonstrate better performance on long-tailed data and propose +a new search space based on our observations. This ensures that the +architecture obtained through our search process incorporates superior +components. Additionally, we propose replacing the learnable linear classifier +with an Equiangular Tight Frame (ETF) classifier to further enhance our method. +This classifier effectively alleviates the biased search process and prevents +performance collapse. Extensive experimental evaluations demonstrate that our +approach consistently improves upon existing methods from an orthogonal +perspective and achieves state-of-the-art results with simple enhancements. + +
+
+
+
+
+ + ♻ ☆ Real-time Transformer-based Open-Vocabulary Detection with Efficient + Fusion Head + + +
+ End-to-end transformer-based detectors (DETRs) have shown exceptional
+performance in both closed-set and open-vocabulary object detection (OVD) tasks
+through the integration of language modalities. However, their demanding
+computational requirements have hindered their practical application in
+real-time object detection (OD) scenarios. In this paper, we scrutinize the
+limitations of two leading models in the OVDEval benchmark, OmDet and
+Grounding-DINO, and introduce OmDet-Turbo. This novel transformer-based
+real-time OVD model features an innovative Efficient Fusion Head (EFH) module
+designed to alleviate the bottlenecks observed in OmDet and Grounding-DINO.
+Notably, OmDet-Turbo-Base achieves 100.2 frames per second (FPS) with
+TensorRT and language cache techniques applied. Notably, in zero-shot scenarios
+on COCO and LVIS datasets, OmDet-Turbo achieves performance levels nearly on
+par with current state-of-the-art supervised models. Furthermore, it
+establishes new state-of-the-art benchmarks on ODinW and OVDEval, boasting an
+AP of 30.1 and an NMS-AP of 26.86, respectively. The practicality of
+OmDet-Turbo in industrial applications is underscored by its exceptional
+performance on benchmark datasets and superior inference speed, positioning it
+as a compelling choice for real-time object detection tasks. Code:
+https://github.com/om-ai-lab/OmDet
+
+
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ Anticipating Object State Changes in Long Procedural Videos + + +
+ In this work, we introduce (a) the new problem of anticipating object state +changes in images and videos during procedural activities, (b) new curated +annotation data for object state change classification based on the Ego4D +dataset, and (c) the first method for addressing this challenging problem. +Solutions to this new task have important implications in vision-based scene +understanding, automated monitoring systems, and action planning. The proposed +novel framework predicts object state changes that will occur in the near +future due to yet unseen human actions by integrating learned visual features +that represent recent visual information with natural language (NLP) features +that represent past object state changes and actions. Leveraging the extensive +and challenging Ego4D dataset which provides a large-scale collection of +first-person perspective videos across numerous interaction scenarios, we +introduce an extension noted Ego4D-OSCA that provides new curated annotation +data for the object state change anticipation task (OSCA). An extensive +experimental evaluation is presented demonstrating the proposed method's +efficacy in predicting object state changes in dynamic scenarios. The +performance of the proposed approach also underscores the potential of +integrating video and linguistic cues to enhance the predictive performance of +video understanding systems and lays the groundwork for future research on the +new task of object state change anticipation. The source code and the new +annotation data (Ego4D-OSCA) will be made publicly available. + +
+
+
+
+
+ + ♻ ☆ Improved Multi-Task Brain Tumour Segmentation with Synthetic Data + Augmentation + + +
+ This paper presents the winning solution of task 1 and the third-placed +solution of task 3 of the BraTS challenge. The use of automated tools in +clinical practice has increased due to the development of more and more +sophisticated and reliable algorithms. However, achieving clinical standards +and developing tools for real-life scenarios is a major challenge. To this end, +BraTS has organised tasks to find the most advanced solutions for specific +purposes. In this paper, we propose the use of synthetic data to train +state-of-the-art frameworks in order to improve the segmentation of adult +gliomas in a post-treatment scenario, and the segmentation of meningioma for +radiotherapy planning. Our results suggest that the use of synthetic data leads +to more robust algorithms, although the synthetic data generation pipeline is +not directly suited to the meningioma task. In task 1, we achieved a DSC of +0.7900, 0.8076, 0.7760, 0.8926, 0.7874, 0.8938 and a HD95 of 35.63, 30.35, +44.58, 16.87, 38.19, 17.95 for ET, NETC, RC, SNFH, TC and WT, respectively and, +in task 3, we achieved a DSC of 0.801 and HD95 of 38.26, in the testing phase. +The code for these tasks is available at +https://github.com/ShadowTwin41/BraTS_2023_2024_solutions. + +
+
+
+
+
+ + ♻ ☆ Image Statistics Predict the Sensitivity of Perceptual Quality Metrics + + +
+ Previously, Barlow and Attneave hypothesised a link between biological vision +and information maximisation. Following Shannon, information was defined using +the probability of natural images. Several physiological and psychophysical +phenomena have been derived from principles like info-max, efficient coding, or +optimal denoising. However, it remains unclear how this link is expressed in +mathematical terms from image probability. Classical derivations were subjected +to strong assumptions on the probability models and on the behaviour of the +sensors. Moreover, the direct evaluation of the hypothesis was limited by the +inability of classical image models to deliver accurate estimates of the +probability. Here, we directly evaluate image probabilities using a generative +model for natural images, and analyse how probability-related factors can be +combined to predict the sensitivity of state-of-the-art subjective image +quality metrics, a proxy for human perception. We use information theory and +regression analysis to find a simple model that when combining just two +probability-related factors achieves 0.77 correlation with subjective metrics. +This probability-based model is validated in two ways: through direct +comparison with the opinion of real observers in a subjective quality +experiment, and by reproducing basic trends of classical psychophysical facts +such as the Contrast Sensitivity Function, the Weber-law, and contrast masking. + +
+
+
+
+
+ + ♻ ☆ Object-Size-Driven Design of Convolutional Neural Networks: Virtual Axle + Detection based on Raw Data + + +
+ As infrastructure ages, the need for efficient monitoring methods becomes +increasingly critical. Bridge Weigh-In-Motion (BWIM) systems are crucial for +cost-effective determination of loads and, consequently, the residual service +life of road and railway infrastructure. However, conventional BWIM systems +require additional sensors for axle detection, which must be installed in +potentially inaccessible locations or places that interfere with bridge +operation. + This study presents a novel approach for real-time detection of train axles +using sensors arbitrarily placed on bridges, providing an alternative to +dedicated axle detectors. The developed Virtual Axle Detector with Enhanced +Receptive Field (VADER) has been validated on a single-track railway bridge +using only acceleration measurements, detecting 99.9% of axles with a spatial +error of 3.69cm. Using raw data as input outperformed the state-of-the-art +spectrogram-based method in both speed and memory usage by 99%, thereby making +real-time application feasible for the first time. + Additionally, we introduce the Maximum Receptive Field (MRF) rule, a novel +approach to optimise hyperparameters of Convolutional Neural Networks (CNNs) +based on the size of objects. In this context, the object size relates to the +fundamental frequency of a bridge. The MRF rule effectively narrows the +hyperparameter search space, overcoming the need for extensive hyperparameter +tuning. Since the MRF rule can theoretically be applied to all unstructured +data, it could have implications for a wide range of deep learning problems, +from earthquake prediction to object recognition. + +
+
+
+
+
+ + ♻ ☆ ARTIST: Improving the Generation of Text-rich Images with Disentangled + Diffusion Models and Large Language Models WACV 2025 + + +
+ Diffusion models have demonstrated exceptional capabilities in generating a +broad spectrum of visual content, yet their proficiency in rendering text is +still limited: they often generate inaccurate characters or words that fail to +blend well with the underlying image. To address these shortcomings, we +introduce a novel framework named, ARTIST, which incorporates a dedicated +textual diffusion model to focus on the learning of text structures +specifically. Initially, we pretrain this textual model to capture the +intricacies of text representation. Subsequently, we finetune a visual +diffusion model, enabling it to assimilate textual structure information from +the pretrained textual model. This disentangled architecture design and +training strategy significantly enhance the text rendering ability of the +diffusion models for text-rich image generation. Additionally, we leverage the +capabilities of pretrained large language models to interpret user intentions +better, contributing to improved generation quality. Empirical results on the +MARIO-Eval benchmark underscore the effectiveness of the proposed method, +showing an improvement of up to 15% in various metrics. + +
+
+ comment: Accepted to WACV 2025 +
+
+
+
+
+ + ♻ ☆ DAE-Talker: High Fidelity Speech-Driven Talking Face Generation with + Diffusion Autoencoder + + +
+ While recent research has made significant progress in speech-driven talking +face generation, the quality of the generated video still lags behind that of +real recordings. One reason for this is the use of handcrafted intermediate +representations like facial landmarks and 3DMM coefficients, which are designed +based on human knowledge and are insufficient to precisely describe facial +movements. Additionally, these methods require an external pretrained model for +extracting these representations, whose performance sets an upper bound on +talking face generation. To address these limitations, we propose a novel +method called DAE-Talker that leverages data-driven latent representations +obtained from a diffusion autoencoder (DAE). DAE contains an image encoder that +encodes an image into a latent vector and a DDIM image decoder that +reconstructs the image from it. We train our DAE on talking face video frames +and then extract their latent representations as the training target for a +Conformer-based speech2latent model. This allows DAE-Talker to synthesize full +video frames and produce natural head movements that align with the content of +speech, rather than relying on a predetermined head pose from a template video. +We also introduce pose modelling in speech2latent for pose controllability. +Additionally, we propose a novel method for generating continuous video frames +with the DDIM image decoder trained on individual frames, eliminating the need +for modelling the joint distribution of consecutive frames directly. Our +experiments show that DAE-Talker outperforms existing popular methods in +lip-sync, video fidelity, and pose naturalness. We also conduct ablation +studies to analyze the effectiveness of the proposed techniques and demonstrate +the pose controllability of DAE-Talker. + +
+
+ comment: Accepted to ACM Multimedia 2023 +
+
+
+
+
+ + ♻ ☆ Video-Driven Graph Network-Based Simulators + + +
+ Lifelike visualizations in design, cinematography, and gaming rely on precise +physics simulations, typically requiring extensive computational resources and +detailed physical input. This paper presents a method that can infer a system's +physical properties from a short video, eliminating the need for explicit +parameter input, provided it is close to the training condition. The learned +representation is then used within a Graph Network-based Simulator to emulate +the trajectories of physical systems. We demonstrate that the video-derived +encodings effectively capture the physical properties of the system and +showcase a linear dependence between some of the encodings and the system's +motion. + +
+
+
+
+
+ + ♻ ☆ Enhancing Compositional Text-to-Image Generation with Reliable Random + Seeds + + +
+ Text-to-image diffusion models have demonstrated remarkable capability in
+generating realistic images from arbitrary text prompts. However, they often
+produce inconsistent results for compositional prompts such as "two dogs" or "a
+penguin on the right of a bowl". Understanding these inconsistencies is crucial
+for reliable image generation. In this paper, we highlight the significant role
+of initial noise in these inconsistencies, where certain noise patterns are
+more reliable for compositional prompts than others. Our analyses reveal that
+different initial random seeds tend to guide the model to place objects in
+distinct image areas, potentially adhering to specific patterns of camera
+angles and image composition associated with the seed. To improve the model's
+compositional ability, we propose a method for mining these reliable cases,
+resulting in a curated training set of generated images without requiring any
+manual annotation. By fine-tuning text-to-image models on these generated
+images, we significantly enhance their compositional capabilities. For
+numerical composition, we observe relative increases of 29.3% and 19.5% for
+Stable Diffusion and PixArt-α, respectively. Spatial composition sees
+even larger gains, with 60.7% for Stable Diffusion and 21.1% for
+PixArt-α.
+
+
+
+
+
+
+ + ♻ ☆ Enhancing the automatic segmentation and analysis of 3D liver + vasculature models + + +
+ Surgical assessment of liver cancer patients requires identification of the +vessel trees from medical images. Specifically, the venous trees - the portal +(perfusing) and the hepatic (draining) trees are important for understanding +the liver anatomy and disease state, and perform surgery planning. This +research aims to improve the 3D segmentation, skeletonization, and subsequent +analysis of vessel trees, by creating an automatic pipeline based on deep +learning and image processing techniques. + The first part of this work explores the impact of differentiable +skeletonization methods such as ClDice and morphological skeletonization loss, +on the overall liver vessel segmentation performance. To this aim, it studies +how to improve vessel tree connectivity. + The second part of this study converts a single class vessel segmentation +into multi-class ones, separating the two venous trees. It builds on the +previous two-class vessel segmentation model, which vessel tree outputs might +be entangled, and on connected components and skeleton analyses of the trees. + After providing sub-labeling of the specific anatomical branches of each +venous tree, these algorithms also enable a morphometric analysis of the vessel +trees by extracting various geometrical markers. + In conclusion, we propose a method that successfully improves current +skeletonization methods, for extensive vascular trees that contain vessels of +different calibers. The separation algorithm creates a clean multi-class +segmentation of the vessels, validated by surgeons to provide low error. A new, +publicly shared high-quality liver vessel dataset of 77 cases is thus created. +Finally a method to annotate vessel trees according to anatomy is provided, +enabling a unique liver vessel morphometry analysis. + +
+
+ comment: Internship at Simbiotx +
+
+
+
+
+ + ♻ ☆ MVBoost: Boost 3D Reconstruction with Multi-View Refinement + + +
+ Recent advancements in 3D object reconstruction have been remarkable, yet +most current 3D models rely heavily on existing 3D datasets. The scarcity of +diverse 3D datasets results in limited generalization capabilities of 3D +reconstruction models. In this paper, we propose a novel framework for boosting +3D reconstruction with multi-view refinement (MVBoost) by generating pseudo-GT +data. The key of MVBoost is combining the advantages of the high accuracy of +the multi-view generation model and the consistency of the 3D reconstruction +model to create a reliable data source. Specifically, given a single-view input +image, we employ a multi-view diffusion model to generate multiple views, +followed by a large 3D reconstruction model to produce consistent 3D data. +MVBoost then adaptively refines these multi-view images, rendered from the +consistent 3D data, to build a large-scale multi-view dataset for training a +feed-forward 3D reconstruction model. Additionally, the input view optimization +is designed to optimize the corresponding viewpoints based on the user's input +image, ensuring that the most important viewpoint is accurately tailored to the +user's needs. Extensive evaluations demonstrate that our method achieves +superior reconstruction results and robust generalization compared to prior +works. + +
+
+
+
+
+ + ♻ ☆ Scaling nnU-Net for CBCT Segmentation + + +
+ This paper presents our approach to scaling the nnU-Net framework for +multi-structure segmentation on Cone Beam Computed Tomography (CBCT) images, +specifically in the scope of the ToothFairy2 Challenge. We leveraged the +nnU-Net ResEnc L model, introducing key modifications to patch size, network +topology, and data augmentation strategies to address the unique challenges of +dental CBCT imaging. Our method achieved a mean Dice coefficient of 0.9253 and +HD95 of 18.472 on the test set, securing a mean rank of 4.6 and with it the +first place in the ToothFairy2 challenge. The source code is publicly +available, encouraging further research and development in the field. + +
+
+ comment: Fabian Isensee and Yannick Kirchhoff contributed equally +
+
+
+
+
+ + ♻ ☆ ODE: Open-Set Evaluation of Hallucinations in Multimodal Large Language + Models + + +
+ Hallucination poses a persistent challenge for multimodal large language +models (MLLMs). However, existing benchmarks for evaluating hallucinations are +generally static, which may overlook the potential risk of data contamination. +To address this issue, we propose ODE, an open-set, dynamic protocol designed +to evaluate object hallucinations in MLLMs at both the existence and attribute +levels. ODE employs a graph-based structure to represent real-world object +concepts, their attributes, and the distributional associations between them. +This structure facilitates the extraction of concept combinations based on +diverse distributional criteria, generating varied samples for structured +queries that evaluate hallucinations in both generative and discriminative +tasks. Through the generation of new samples, dynamic concept combinations, and +varied distribution frequencies, ODE mitigates the risk of data contamination +and broadens the scope of evaluation. This protocol is applicable to both +general and specialized scenarios, including those with limited data. +Experimental results demonstrate the effectiveness of our protocol, revealing +that MLLMs exhibit higher hallucination rates when evaluated with ODE-generated +samples, which indicates potential data contamination. Furthermore, these +generated samples aid in analyzing hallucination patterns and fine-tuning +models, offering an effective approach to mitigating hallucinations in MLLMs. + +
+
+
+
+
+ + ♻ ☆ FIRE: A Dataset for Feedback Integration and Refinement Evaluation of + Multimodal Models NeurIPS 2024 + + +
+ Vision language models (VLMs) have achieved impressive progress in diverse +applications, becoming a prevalent research direction. In this paper, we build +FIRE, a feedback-refinement dataset, consisting of 1.1M multi-turn +conversations that are derived from 27 source datasets, empowering VLMs to +spontaneously refine their responses based on user feedback across diverse +tasks. To scale up the data collection, FIRE is collected in two components: +FIRE-100K and FIRE-1M, where FIRE-100K is generated by GPT-4V, and FIRE-1M is +freely generated via models trained on FIRE-100K. Then, we build FIRE-Bench, a +benchmark to comprehensively evaluate the feedback-refining capability of VLMs, +which contains 11K feedback-refinement conversations as the test data, two +evaluation settings, and a model to provide feedback for VLMs. We develop the +FIRE-LLaVA model by fine-tuning LLaVA on FIRE-100K and FIRE-1M, which shows +remarkable feedback-refining capability on FIRE-Bench and outperforms untrained +VLMs by 50%, making more efficient user-agent interactions and underscoring the +significance of the FIRE dataset. + +
+
+ comment: NeurIPS 2024 Dataset & Benchmark Track +
+
+
+
+
+ + ♻ ☆ PAR: Prompt-Aware Token Reduction Method for Efficient Large Multimodal + Models + + +
+ Multimodal large language models (MLLMs) demonstrate strong performance
+across visual tasks, but their efficiency is hindered by significant
+computational and memory demands from processing long contexts in multimodal
+inputs. To address this, we introduce PAR (Prompt-Aware Token Reduction), a
+novel and plug-and-play approach that reduces visual tokens efficiently without
+compromising model performance. Unlike previous methods that rely heavily on
+attention mechanisms and overlooking cross-modal interactions, we use a
+prompt-aware strategy to adaptively identify and cluster essential visual tokens.
+PAR categorizes visual context redundancy into two types: external and
+internal. External redundancy is minimized through semantic retrieval, while
+internal redundancy is addressed using a token routing mechanism. This method
+substantially reduces computational load without requiring additional training
+or complex architectural modifications. \textbf{Experimental results
+demonstrate that across various visual question answering tasks, PAR reduces
+FLOPs by 83\% with a compression ratio of 89\%, while retaining 97\% of
+baseline accuracy.} The adaptive design of PAR achieves a 2x token reduction
+ratio compared to prior approaches, enabling a better balance between
+performance and efficiency.
+
+
+ comment: 10 pages, 5 figures,3 tables +
+
+
+
+
+ + ♻ ☆ Demystify Mamba in Vision: A Linear Attention Perspective NeurIPS 2024 + + +
+ Mamba is an effective state space model with linear computation complexity. +It has recently shown impressive efficiency in dealing with high-resolution +inputs across various vision tasks. In this paper, we reveal that the powerful +Mamba model shares surprising similarities with linear attention Transformer, +which typically underperform conventional Transformer in practice. By exploring +the similarities and disparities between the effective Mamba and subpar linear +attention Transformer, we provide comprehensive analyses to demystify the key +factors behind Mamba's success. Specifically, we reformulate the selective +state space model and linear attention within a unified formulation, rephrasing +Mamba as a variant of linear attention Transformer with six major distinctions: +input gate, forget gate, shortcut, no attention normalization, single-head, and +modified block design. For each design, we meticulously analyze its pros and +cons, and empirically evaluate its impact on model performance in vision tasks. +Interestingly, the results highlight the forget gate and block design as the +core contributors to Mamba's success, while the other four designs are less +crucial. Based on these findings, we propose a Mamba-Inspired Linear Attention +(MILA) model by incorporating the merits of these two key designs into linear +attention. The resulting model outperforms various vision Mamba models in both +image classification and high-resolution dense prediction tasks, while enjoying +parallelizable computation and fast inference speed. Code is available at +https://github.com/LeapLabTHU/MLLA. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ SITReg: Multi-resolution architecture for symmetric, inverse consistent, + and topology preserving image registration + + +
+ Deep learning has emerged as a strong alternative for classical iterative +methods for deformable medical image registration, where the goal is to find a +mapping between the coordinate systems of two images. Popular classical image +registration methods enforce the useful inductive biases of symmetricity, +inverse consistency, and topology preservation by construction. However, while +many deep learning registration methods encourage these properties via loss +functions, no earlier methods enforce all of them by construction. Here, we +propose a novel registration architecture based on extracting multi-resolution +feature representations which is by construction symmetric, inverse consistent, +and topology preserving. We also develop an implicit layer for memory efficient +inversion of the deformation fields. Our method achieves state-of-the-art +registration accuracy on three datasets. The code is available at +https://github.com/honkamj/SITReg. + +
+
+ comment: Accepted for publication at the Journal of Machine Learning for + Biomedical Imaging (MELBA) https://melba-journal.org/2024:026 +
+
+
+
+
+ + ♻ ☆ CoVLA: Comprehensive Vision-Language-Action Dataset for Autonomous + Driving WACV 2025 + + +
+ Autonomous driving, particularly navigating complex and unanticipated +scenarios, demands sophisticated reasoning and planning capabilities. While +Multi-modal Large Language Models (MLLMs) offer a promising avenue for this, +their use has been largely confined to understanding complex environmental +contexts or generating high-level driving commands, with few studies extending +their application to end-to-end path planning. A major research bottleneck is +the lack of large-scale annotated datasets encompassing vision, language, and +action. To address this issue, we propose CoVLA (Comprehensive +Vision-Language-Action) Dataset, an extensive dataset comprising real-world +driving videos spanning more than 80 hours. This dataset leverages a novel, +scalable approach based on automated data processing and a caption generation +pipeline to generate accurate driving trajectories paired with detailed natural +language descriptions of driving environments and maneuvers. This approach +utilizes raw in-vehicle sensor data, allowing it to surpass existing datasets +in scale and annotation richness. Using CoVLA, we investigate the driving +capabilities of MLLMs that can handle vision, language, and action in a variety +of driving scenarios. Our results illustrate the strong proficiency of our +model in generating coherent language and action outputs, emphasizing the +potential of Vision-Language-Action (VLA) models in the field of autonomous +driving. This dataset establishes a framework for robust, interpretable, and +data-driven autonomous driving systems by providing a comprehensive platform +for training and evaluating VLA models, contributing to safer and more reliable +self-driving vehicles. The dataset is released for academic purpose. + +
+
+ comment: WACV 2025, Project Page: https://turingmotors.github.io/covla-ad/ +
+
+
+
+
+ + ♻ ☆ Visual Cue Enhancement and Dual Low-Rank Adaptation for Efficient Visual + Instruction Fine-Tuning + + +
+ Parameter-efficient fine-tuning multimodal large language models (MLLMs) +presents significant challenges, including reliance on high-level visual +features that limit fine-grained detail comprehension, and data conflicts that +arise from task complexity. To address these issues, we propose an efficient +fine-tuning framework with two novel approaches: Vision Cue Enhancement (VCE) +and Dual Low-Rank Adaptation (Dual-LoRA). VCE enhances the vision projector by +integrating multi-level visual cues, improving the model's ability to capture +fine-grained visual features. Dual-LoRA introduces a dual low-rank structure +for instruction tuning, decoupling learning into skill and task spaces to +enable precise control and efficient adaptation across diverse tasks. Our +method simplifies implementation, enhances visual comprehension, and improves +adaptability. Experiments on both downstream tasks and general benchmarks +demonstrate the effectiveness of our proposed approach. + +
+
+
+
+
+ + ♻ ☆ From Pixels to Insights: A Survey on Automatic Chart Understanding in + the Era of Large Foundation Models + + +
+ Data visualization in the form of charts plays a pivotal role in data +analysis, offering critical insights and aiding in informed decision-making. +Automatic chart understanding has witnessed significant advancements with the +rise of large foundation models in recent years. Foundation models, such as +large language models, have revolutionized various natural language processing +tasks and are increasingly being applied to chart understanding tasks. This +survey paper provides a comprehensive overview of the recent developments, +challenges, and future directions in chart understanding within the context of +these foundation models. We review fundamental building blocks crucial for +studying chart understanding tasks. Additionally, we explore various tasks and +their evaluation metrics and sources of both charts and textual inputs. Various +modeling strategies are then examined, encompassing both classification-based +and generation-based approaches, along with tool augmentation techniques that +enhance chart understanding performance. Furthermore, we discuss the +state-of-the-art performance of each task and discuss how we can improve the +performance. Challenges and future directions are addressed, highlighting the +importance of several topics, such as domain-specific charts, lack of efforts +in developing evaluation metrics, and agent-oriented settings. This survey +paper serves as a comprehensive resource for researchers and practitioners in +the fields of natural language processing, computer vision, and data analysis, +providing valuable insights and directions for future research in chart +understanding leveraging large foundation models. The studies mentioned in this +paper, along with emerging new research, will be continually updated at: +https://github.com/khuangaf/Awesome-Chart-Understanding. + +
+
+ comment: IEEE Transactions on Knowledge and Data Engineering (TKDE) +
+
+
+
+
+ + ♻ ☆ Monocular Lane Detection Based on Deep Learning: A Survey + + +
+ Lane detection plays an important role in autonomous driving perception +systems. As deep learning algorithms gain popularity, monocular lane detection +methods based on them have demonstrated superior performance and emerged as a +key research direction in autonomous driving perception. The core designs of +these algorithmic frameworks can be summarized as follows: (1) Task paradigm, +focusing on lane instance-level discrimination; (2) Lane modeling, representing +lanes as a set of learnable parameters in the neural network; (3) Global +context supplementation, enhancing inference on the obscure lanes; (4) +Perspective effect elimination, providing accurate 3D lanes for downstream +applications. From these perspectives, this paper presents a comprehensive +overview of existing methods, encompassing both the increasingly mature 2D lane +detection approaches and the developing 3D lane detection works. Besides, this +paper compares the performance of mainstream methods on different benchmarks +and investigates their inference speed under a unified setting for fair +comparison. Moreover, we present some extended works on lane detection, +including multi-task perception, video lane detection, online high-definition +map construction, and lane topology reasoning, to offer readers a comprehensive +roadmap for the evolution of lane detection. Finally, we point out some +potential future research directions in this field. We exhaustively collect the +papers and codes of existing works at +https://github.com/Core9724/Awesome-Lane-Detection and will keep tracing the +research. + +
+
+
+
+
+ + ♻ ChatRex: Taming Multimodal LLM for Joint Perception and Understanding + + +
+ Perception and understanding are two pillars of computer vision. While
+multimodal large language models (MLLM) have demonstrated remarkable visual
+understanding capabilities, they arguably lack accurate perception abilities,
+e.g. the state-of-the-art model Qwen2-VL only achieves a 43.9 recall rate on
+the COCO dataset, limiting many tasks requiring the combination of perception
+and understanding. In this work, we aim to bridge this perception gap from both
+model designing and data development perspectives. We first introduce ChatRex,
+an MLLM with a decoupled perception design. Instead of having the LLM directly
+predict box coordinates, we feed the output boxes from a universal proposal
+network into the LLM, allowing it to output the corresponding box indices to
+represent its detection results, turning the regression task into a
+retrieval-based task that LLM handles more proficiently. From the data
+perspective, we build a fully automated data engine and construct the
+Rexverse-2M dataset which possesses multiple granularities to support the joint
+training of perception and understanding. After standard two-stage training,
+ChatRex demonstrates strong perception capabilities while preserving multimodal
+understanding performance. The combination of these two capabilities
+simultaneously unlocks many attractive applications, demonstrating the
+complementary roles of both perception and understanding in MLLM. Code is
+available at \url{https://github.com/IDEA-Research/ChatRex}.
+
+
+ comment: 35 pages, 19 figures +
+
+
+
+
+ + ♻ ☆ T2Vid: Translating Long Text into Multi-Image is the Catalyst for + Video-LLMs + + +
+ The success of Multimodal Large Language Models (MLLMs) in the image domain +has garnered wide attention from the research community. Drawing on previous +successful experiences, researchers have recently explored extending the +success to the video understanding realms. Apart from training from scratch, an +efficient way is to utilize the pre-trained image-LLMs, leading to two +mainstream approaches, i.e. zero-shot inference and further fine-tuning with +video data. In this work, our study of these approaches harvests an effective +data augmentation method. We first make a deeper inspection of the zero-shot +inference way and identify two limitations, i.e. limited generalization and +lack of temporal understanding capabilities. Thus, we further investigate the +fine-tuning approach and find a low learning efficiency when simply using all +the video data samples, which can be attributed to a lack of instruction +diversity. Aiming at this issue, we develop a method called T2Vid to synthesize +video-like samples to enrich the instruction diversity in the training corpus. +Integrating these data enables a simple and efficient training scheme, which +achieves performance comparable to or even superior to using full video +datasets by training with just 15% the sample size. Meanwhile, we find that the +proposed scheme can boost the performance of long video understanding without +training with long video samples. We hope our study will spark more thinking +about using MLLMs for video understanding and curation of high-quality data. +The code is released at https://github.com/xjtupanda/T2Vid. + +
+
+ comment: Project page: https://github.com/xjtupanda/T2Vid +
+
+
+
+
+ + ♻ ☆ A design of Convolutional Neural Network model for the Diagnosis of the + COVID-19 + + +
+ With the spread of COVID-19 around the globe over the past year, the usage of
+artificial intelligence (AI) algorithms and image processing methods to analyze
+the X-ray images of patients' chest with COVID-19 has become essential. The
+COVID-19 virus recognition in the lung area of a patient is one of the basic
+and essential needs of clinical centers and hospitals. Most research in this
+field has been devoted to papers on the basis of deep learning methods
+utilizing CNNs (Convolutional Neural Network), which mainly deal with the
+screening of sick and healthy people. In this study, a new structure of a
+19-layer CNN has been recommended for accurate recognition of the COVID-19
+from the X-ray pictures of chest. The offered CNN is developed to serve as a
+precise diagnosis system for a three class (viral pneumonia, Normal, COVID) and
+a four-class classification (Lung opacity, Normal, COVID-19, and pneumonia). A
+comparison is conducted among the outcomes of the offered procedure and some
+popular pretrained networks, including Inception, Alexnet, ResNet50,
+Squeezenet, and VGG19 and based on Specificity, Accuracy, Precision,
+Sensitivity, Confusion Matrix, and F1-score. The experimental results of the
+offered CNN method specify its dominance over the existing published
+procedures. This method can be a useful tool for clinicians in deciding
+properly about COVID-19.
+
+
+ comment: Important mistakes. Also, another author has contributed some to the + revised version. So it is not appropriate for it to be with only my name +
+
+
+
+
+ + ♻ ☆ SL-YOLO: A Stronger and Lighter Drone Target Detection Model + + +
+ Detecting small objects in complex scenes, such as those captured by drones, +is a daunting challenge due to the difficulty in capturing the complex features +of small targets. While the YOLO family has achieved great success in large +target detection, its performance is less than satisfactory when faced with +small targets. Because of this, this paper proposes a revolutionary model +SL-YOLO (Stronger and Lighter YOLO) that aims to break the bottleneck of small +target detection. We propose the Hierarchical Extended Path Aggregation Network +(HEPAN), a pioneering cross-scale feature fusion method that can ensure +unparalleled detection accuracy even in the most challenging environments. At +the same time, without sacrificing detection capabilities, we design the C2fDCB +lightweight module and add the SCDown downsampling module to greatly reduce the +model's parameters and computational complexity. Our experimental results on +the VisDrone2019 dataset reveal a significant improvement in performance, with +mAP@0.5 jumping from 43.0% to 46.9% and mAP@0.5:0.95 increasing from 26.0% to +28.9%. At the same time, the model parameters are reduced from 11.1M to 9.6M, +and the FPS can reach 132, making it an ideal solution for real-time small +object detection in resource-constrained environments. + +
+
+ comment: We are withdrawing this submission to incorporate substantial updates + and improvements to the manuscript, including additional data and analysis +
+
+
+
+
+ + ♻ ☆ GausSurf: Geometry-Guided 3D Gaussian Splatting for Surface + Reconstruction + + +
+ 3D Gaussian Splatting has achieved impressive performance in novel view +synthesis with real-time rendering capabilities. However, reconstructing +high-quality surfaces with fine details using 3D Gaussians remains a +challenging task. In this work, we introduce GausSurf, a novel approach to +high-quality surface reconstruction by employing geometry guidance from +multi-view consistency in texture-rich areas and normal priors in texture-less +areas of a scene. We observe that a scene can be mainly divided into two +primary regions: 1) texture-rich and 2) texture-less areas. To enforce +multi-view consistency at texture-rich areas, we enhance the reconstruction +quality by incorporating a traditional patch-match based Multi-View Stereo +(MVS) approach to guide the geometry optimization in an iterative scheme. This +scheme allows for mutual reinforcement between the optimization of Gaussians +and patch-match refinement, which significantly improves the reconstruction +results and accelerates the training process. Meanwhile, for the texture-less +areas, we leverage normal priors from a pre-trained normal estimation model to +guide optimization. Extensive experiments on the DTU and Tanks and Temples +datasets demonstrate that our method surpasses state-of-the-art methods in +terms of reconstruction quality and computation time. + +
+
+ comment: Project page: https://jiepengwang.github.io/GausSurf/ +
+
+
+
+
+ + ♻ ☆ Neighboring Slice Noise2Noise: Self-Supervised Medical Image Denoising + from Single Noisy Image Volume + + +
+ In the last few years, with the rapid development of deep learning +technologies, supervised methods based on convolutional neural networks have +greatly enhanced the performance of medical image denoising. However, these +methods require large quantities of noisy-clean image pairs for training, which +greatly limits their practicality. Although some researchers have attempted to +train denoising networks using only single noisy images, existing +self-supervised methods, including blind-spot-based and data-splitting-based +methods, heavily rely on the assumption that noise is pixel-wise independent. +However, this assumption often does not hold in real-world medical images. +Therefore, in the field of medical imaging, there remains a lack of simple and +practical denoising methods that can achieve high-quality denoising performance +using only single noisy images. In this paper, we propose a novel +self-supervised medical image denoising method, Neighboring Slice Noise2Noise +(NS-N2N). The proposed method utilizes neighboring slices within a single noisy +image volume to construct weighted training data, and then trains the denoising +network using a self-supervised scheme with regional consistency loss and +inter-slice continuity loss. NS-N2N only requires a single noisy image volume +obtained from one medical imaging procedure to achieve high-quality denoising +of the image volume itself. Extensive experiments demonstrate that the proposed +method outperforms state-of-the-art self-supervised denoising methods in both +denoising performance and processing efficiency. Furthermore, since NS-N2N +operates solely in the image domain, it is free from device-specific issues +such as reconstruction geometry, making it easier to apply in various clinical +practices. + +
+
+
+
+
+ + ♻ ☆ Asynchronous Feedback Network for Perceptual Point Cloud Quality + Assessment + + +
+ Recent years have witnessed the success of the deep learning-based technique
+in research of no-reference point cloud quality assessment (NR-PCQA). For a
+more accurate quality prediction, many previous studies have attempted to
+capture global and local features in a bottom-up manner, but ignored the
+interaction and promotion between them. To solve this problem, we propose a
+novel asynchronous feedback quality prediction network (AFQ-Net). Motivated by
+human visual perception mechanisms, AFQ-Net employs a dual-branch structure to
+deal with global and local features, simulating the left and right hemispheres
+of the human brain, and constructs a feedback module between them.
+Specifically, the input point clouds are first fed into a transformer-based
+global encoder to generate the attention maps that highlight these semantically
+rich regions, followed by being merged into the global feature. Then, we
+utilize the generated attention maps to perform dynamic convolution for
+different semantic regions and obtain the local feature. Finally, a
+coarse-to-fine strategy is adopted to merge the two features into the final
+quality score. We conduct comprehensive experiments on three datasets and
+achieve superior performance over the state-of-the-art approaches on all of
+these datasets. The code will be available at
+https://github.com/zhangyujie-1998/AFQ-Net.
+
+
+
+
+
+ + ♻ ☆ Task-aware Distributed Source Coding under Dynamic Bandwidth + + +
+ Efficient compression of correlated data is essential to minimize +communication overload in multi-sensor networks. In such networks, each sensor +independently compresses the data and transmits them to a central node due to +limited communication bandwidth. A decoder at the central node decompresses and +passes the data to a pre-trained machine learning-based task to generate the +final output. Thus, it is important to compress the features that are relevant +to the task. Additionally, the final performance depends heavily on the total +available bandwidth. In practice, it is common to encounter varying +availability in bandwidth, and higher bandwidth results in better performance +of the task. We design a novel distributed compression framework composed of +independent encoders and a joint decoder, which we call neural distributed +principal component analysis (NDPCA). NDPCA flexibly compresses data from +multiple sources to any available bandwidth with a single model, reducing +computing and storage overhead. NDPCA achieves this by learning low-rank task +representations and efficiently distributing bandwidth among sensors, thus +providing a graceful trade-off between performance and bandwidth. Experiments +show that NDPCA improves the success rate of multi-view robotic arm +manipulation by 9% and the accuracy of object detection tasks on satellite +imagery by 14% compared to an autoencoder with uniform bandwidth allocation. + +
+
+
+
+
+ + ♻ ☆ Critic-V: VLM Critics Help Catch VLM Errors in Multimodal Reasoning + + +
+ Vision-language models (VLMs) have shown remarkable advancements in +multimodal reasoning tasks. However, they still often generate inaccurate or +irrelevant responses due to issues like hallucinated image understandings or +unrefined reasoning paths. To address these challenges, we introduce Critic-V, +a novel framework inspired by the Actor-Critic paradigm to boost the reasoning +capability of VLMs. This framework decouples the reasoning process and critic +process by integrating two independent components: the Reasoner, which +generates reasoning paths based on visual and textual inputs, and the Critic, +which provides constructive critique to refine these paths. In this approach, +the Reasoner generates reasoning responses according to text prompts, which can +evolve iteratively as a policy based on feedback from the Critic. This +interaction process was theoretically driven by a reinforcement learning +framework where the Critic offers natural language critiques instead of scalar +rewards, enabling more nuanced feedback to boost the Reasoner's capability on +complex reasoning tasks. The Critic model is trained using Direct Preference +Optimization (DPO), leveraging a preference dataset of critiques ranked by +Rule-based Reward~(RBR) to enhance its critic capabilities. Evaluation results +show that the Critic-V framework significantly outperforms existing methods, +including GPT-4V, on 5 out of 8 benchmarks, especially regarding reasoning +accuracy and efficiency. Combining a dynamic text-based policy for the Reasoner +and constructive feedback from the preference-optimized Critic enables a more +reliable and context-sensitive multimodal reasoning process. Our approach +provides a promising solution to enhance the reliability of VLMs, improving +their performance in real-world reasoning-heavy multimodal applications such as +autonomous driving and embodied intelligence. + +
+
+ comment: 16 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ AC3D: Analyzing and Improving 3D Camera Control in Video Diffusion + Transformers + + +
+ Numerous works have recently integrated 3D camera control into foundational +text-to-video models, but the resulting camera control is often imprecise, and +video generation quality suffers. In this work, we analyze camera motion from a +first principles perspective, uncovering insights that enable precise 3D camera +manipulation without compromising synthesis quality. First, we determine that +motion induced by camera movements in videos is low-frequency in nature. This +motivates us to adjust train and test pose conditioning schedules, accelerating +training convergence while improving visual and motion quality. Then, by +probing the representations of an unconditional video diffusion transformer, we +observe that they implicitly perform camera pose estimation under the hood, and +only a sub-portion of their layers contain the camera information. This +suggested us to limit the injection of camera conditioning to a subset of the +architecture to prevent interference with other video features, leading to 4x +reduction of training parameters, improved training speed and 10% higher visual +quality. Finally, we complement the typical dataset for camera control learning +with a curated dataset of 20K diverse dynamic videos with stationary cameras. +This helps the model disambiguate the difference between camera and scene +motion, and improves the dynamics of generated pose-conditioned videos. We +compound these findings to design the Advanced 3D Camera Control (AC3D) +architecture, the new state-of-the-art model for generative video modeling with +camera control. + +
+
+ comment: Project Page: https://snap-research.github.io/ac3d/ +
+
+
+
+
+ + ♻ ☆ Two-Stage Approach for Brain MR Image Synthesis: 2D Image Synthesis and + 3D Refinement MICCAI 2024 + + +
+ Despite significant advancements in automatic brain tumor segmentation +methods, their performance is not guaranteed when certain MR sequences are +missing. Addressing this issue, it is crucial to synthesize the missing MR +images that reflect the unique characteristics of the absent modality with +precise tumor representation. Typically, MRI synthesis methods generate partial +images rather than full-sized volumes due to computational constraints. This +limitation can lead to a lack of comprehensive 3D volumetric information and +result in image artifacts during the merging process. In this paper, we propose +a two-stage approach that first synthesizes MR images from 2D slices using a +novel intensity encoding method and then refines the synthesized MRI. The +proposed intensity encoding reduces artifacts when synthesizing MRI on a 2D +slice basis. Then, the \textit{Refiner}, which leverages complete 3D volume +information, further improves the quality of the synthesized images and +enhances their applicability to segmentation methods. Experimental results +demonstrate that the intensity encoding effectively minimizes artifacts in the +synthesized MRI and improves perceptual quality. Furthermore, using the +\textit{Refiner} on synthesized MRI significantly improves brain tumor +segmentation results, highlighting the potential of our approach in practical +applications. + +
+
+ comment: MICCAI 2024 BraSyn Challenge 1st place +
+
+
+
+
+ + ♻ ☆ Imagine and Seek: Improving Composed Image Retrieval with an Imagined + Proxy + + +
+ The Zero-shot Composed Image Retrieval (ZSCIR) requires retrieving images +that match the query image and the relative captions. Current methods focus on +projecting the query image into the text feature space, subsequently combining +them with features of query texts for retrieval. However, retrieving images +only with the text features cannot guarantee detailed alignment due to the +natural gap between images and text. In this paper, we introduce Imagined Proxy +for CIR (IP-CIR), a training-free method that creates a proxy image aligned +with the query image and text description, enhancing query representation in +the retrieval process. We first leverage the large language model's +generalization capability to generate an image layout, and then apply both the +query text and image for conditional generation. The robust query features are +enhanced by merging the proxy image, query image, and text semantic +perturbation. Our newly proposed balancing metric integrates text-based and +proxy retrieval similarities, allowing for more accurate retrieval of the +target image while incorporating image-side information into the process. +Experiments on three public datasets demonstrate that our method significantly +improves retrieval performances. We achieve state-of-the-art (SOTA) results on +the CIRR dataset with a Recall@K of 70.07 at K=10. Additionally, we achieved an +improvement in Recall@10 on the FashionIQ dataset, rising from 45.11 to 45.74, +and improved the baseline performance in CIRCO with a mAPK@10 score, increasing +from 32.24 to 34.26. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ♻ ☆ NoisyNN: Exploring the Impact of Information Entropy Change in Learning + Systems + + +
+ We investigate the impact of entropy change in deep learning systems by noise +injection at different levels, including the embedding space and the image. The +series of models that employ our methodology are collectively known as Noisy +Neural Networks (NoisyNN), with examples such as NoisyViT and NoisyCNN. Noise +is conventionally viewed as a harmful perturbation in various deep learning +architectures, such as convolutional neural networks (CNNs) and vision +transformers (ViTs), as well as different learning tasks like image +classification and transfer learning. However, this work shows noise can be an +effective way to change the entropy of the learning system. We demonstrate that +specific noise can boost the performance of various deep models under certain +conditions. We theoretically prove the enhancement gained from positive noise +by reducing the task complexity defined by information entropy and +experimentally show the significant performance gain in large image datasets, +such as the ImageNet. Herein, we use the information entropy to define the +complexity of the task. We categorize the noise into two types, positive noise +(PN) and harmful noise (HN), based on whether the noise can help reduce the +task complexity. Extensive experiments of CNNs and ViTs have shown performance +improvements by proactively injecting positive noise, where we achieved an +unprecedented top 1 accuracy of 95$\%$ on ImageNet. Both theoretical analysis +and empirical evidence have confirmed that the presence of positive noise, can +benefit the learning process, while the traditionally perceived harmful noise +indeed impairs deep learning models. The different roles of noise offer new +explanations for deep models on specific tasks and provide a new paradigm for +improving model performance. Moreover, it reminds us that we can influence the +performance of learning systems via information entropy change. + +
+
+ comment: Task Entropy, NoisyViT, NoisyCNN +
+
+
+
+
+ + ♻ ☆ Texture, Shape and Order Matter: A New Transformer Design for Sequential + DeepFake Detection WACV 2025 + + +
+ Sequential DeepFake detection is an emerging task that predicts the +manipulation sequence in order. Existing methods typically formulate it as an +image-to-sequence problem, employing conventional Transformer architectures. +However, these methods lack dedicated design and consequently result in limited +performance. As such, this paper describes a new Transformer design, called +TSOM, by exploring three perspectives: Texture, Shape, and Order of +Manipulations. Our method features four major improvements: \ding{182} we +describe a new texture-aware branch that effectively captures subtle +manipulation traces with a Diversiform Pixel Difference Attention module. +\ding{183} Then we introduce a Multi-source Cross-attention module to seek deep +correlations among spatial and sequential features, enabling effective modeling +of complex manipulation traces. \ding{184} To further enhance the +cross-attention, we describe a Shape-guided Gaussian mapping strategy, +providing initial priors of the manipulation shape. \ding{185} Finally, +observing that the subsequent manipulation in a sequence may influence traces +left in the preceding one, we intriguingly invert the prediction order from +forward to backward, leading to notable gains as expected. Extensive +experimental results demonstrate that our method outperforms others by a large +margin, highlighting the superiority of our method. + +
+
+ comment: WACV 2025 +
+
+
+
+
+ + ♻ ☆ AnySynth: Harnessing the Power of Image Synthetic Data Generation for + Generalized Vision-Language Tasks + + +
+ Diffusion models have recently been employed to generate high-quality images,
+reducing the need for manual data collection and improving model generalization
+in tasks such as object detection, instance segmentation, and image perception.
+However, the synthetic framework is usually designed with meticulous human
+effort for each task due to various requirements on image layout, content, and
+annotation formats, restricting the application of synthetic data on more
+general scenarios. In this paper, we propose AnySynth, a unified framework
+integrating adaptable, comprehensive, and highly controllable components
+capable of generating an arbitrary type of synthetic data given diverse
+requirements. Specifically, the Task-Specific Layout Generation Module is first
+introduced to produce reasonable layouts for different tasks by leveraging the
+generation ability of large language models and layout priors of real-world
+images. A Uni-Controlled Image Generation Module is then developed to create
+high-quality synthetic images that are controllable and based on the generated
+layouts. In addition, user-specific reference images and style images can be
+incorporated into the generation to meet task requirements. Finally, the
+Task-Oriented Annotation Module offers precise and detailed annotations for the
+generated images across different tasks. We have validated our framework's
+performance across various tasks, including Few-shot Object Detection,
+Cross-domain Object Detection, Zero-shot Composed Image Retrieval, and
+Multi-modal Image Perception and Grounding. The specific data synthesized by
+our framework significantly improves model performance in these tasks,
+demonstrating the generality and effectiveness of our framework.
+
+</p>
+
+
+
+
+ + ♻ ☆ Aligning Step-by-Step Instructional Diagrams to Video Demonstrations CVPR'23 + + +
+ Multimodal alignment facilitates the retrieval of instances from one modality +when queried using another. In this paper, we consider a novel setting where +such an alignment is between (i) instruction steps that are depicted as +assembly diagrams (commonly seen in Ikea assembly manuals) and (ii) video +segments from in-the-wild videos; these videos comprising an enactment of the +assembly actions in the real world. To learn this alignment, we introduce a +novel supervised contrastive learning method that learns to align videos with +the subtle details in the assembly diagrams, guided by a set of novel losses. +To study this problem and demonstrate the effectiveness of our method, we +introduce a novel dataset: IAW for Ikea assembly in the wild consisting of 183 +hours of videos from diverse furniture assembly collections and nearly 8,300 +illustrations from their associated instruction manuals and annotated for their +ground truth alignments. We define two tasks on this dataset: First, nearest +neighbor retrieval between video segments and illustrations, and, second, +alignment of instruction steps and the segments for each video. Extensive +experiments on IAW demonstrate superior performances of our approach against +alternatives. + +
+
+ comment: Accepted to CVPR'23 +
+
+
+
+
+ + ♻ GameGen-X: Interactive Open-world Game Video Generation + + +
+ We introduce GameGen-X, the first diffusion transformer model specifically +designed for both generating and interactively controlling open-world game +videos. This model facilitates high-quality, open-domain generation by +simulating an extensive array of game engine features, such as innovative +characters, dynamic environments, complex actions, and diverse events. +Additionally, it provides interactive controllability, predicting and altering +future content based on the current clip, thus allowing for gameplay +simulation. To realize this vision, we first collected and built an Open-World +Video Game Dataset from scratch. It is the first and largest dataset for +open-world game video generation and control, which comprises over a million +diverse gameplay video clips sampling from over 150 games with informative +captions from GPT-4o. GameGen-X undergoes a two-stage training process, +consisting of foundation model pre-training and instruction tuning. Firstly, +the model was pre-trained via text-to-video generation and video continuation, +endowing it with the capability for long-sequence, high-quality open-domain +game video generation. Further, to achieve interactive controllability, we +designed InstructNet to incorporate game-related multi-modal control signal +experts. This allows the model to adjust latent representations based on user +inputs, unifying character interaction and scene content control for the first +time in video generation. During instruction tuning, only the InstructNet is +updated while the pre-trained foundation model is frozen, enabling the +integration of interactive controllability without loss of diversity and +quality of generated video content. + +
+
+ comment: Homepage: https://gamegen-x.github.io/ Github: + https://github.com/GameGen-X/GameGen-X +
+
+
+
+
+ + ♻ ☆ Temporally Grounding Instructional Diagrams in Unconstrained Videos WACV'25 + + +
+ We study the challenging problem of simultaneously localizing a sequence of +queries in the form of instructional diagrams in a video. This requires +understanding not only the individual queries but also their +interrelationships. However, most existing methods focus on grounding one query +at a time, ignoring the inherent structures among queries such as the general +mutual exclusiveness and the temporal order. Consequently, the predicted +timespans of different step diagrams may overlap considerably or violate the +temporal order, thus harming the accuracy. In this paper, we tackle this issue +by simultaneously grounding a sequence of step diagrams. Specifically, we +propose composite queries, constructed by exhaustively pairing up the visual +content features of the step diagrams and a fixed number of learnable +positional embeddings. Our insight is that self-attention among composite +queries carrying different content features suppress each other to reduce +timespan overlaps in predictions, while the cross-attention corrects the +temporal misalignment via content and position joint guidance. We demonstrate +the effectiveness of our approach on the IAW dataset for grounding step +diagrams and the YouCook2 benchmark for grounding natural language queries, +significantly outperforming existing methods while simultaneously grounding +multiple queries. + +
+
+ comment: Accepted to WACV'25 +
+
+
+
+
+ + ♻ ☆ DisCoRD: Discrete Tokens to Continuous Motion via Rectified Flow + Decoding + + +
+ Human motion, inherently continuous and dynamic, presents significant +challenges for generative models. Despite their dominance, discrete +quantization methods, such as VQ-VAEs, suffer from inherent limitations, +including restricted expressiveness and frame-wise noise artifacts. Continuous +approaches, while producing smoother and more natural motions, often falter due +to high-dimensional complexity and limited training data. To resolve this +"discord" between discrete and continuous representations, we introduce +DisCoRD: Discrete Tokens to Continuous Motion via Rectified Flow Decoding, a +novel method that decodes discrete motion tokens into continuous motion through +rectified flow. By employing an iterative refinement process in the continuous +space, DisCoRD captures fine-grained dynamics and ensures smoother and more +natural motions. Compatible with any discrete-based framework, our method +enhances naturalness without compromising faithfulness to the conditioning +signals. Extensive evaluations demonstrate that DisCoRD achieves +state-of-the-art performance, with FID of 0.032 on HumanML3D and 0.169 on +KIT-ML. These results solidify DisCoRD as a robust solution for bridging the +divide between discrete efficiency and continuous realism. Our project page is +available at: https://whwjdqls.github.io/discord.github.io/. + +
+
+ comment: 20 pages 18 figures +
+
+
+
+
+ + ♻ ☆ HyperSeg: Towards Universal Visual Segmentation with Large Language + Model + + +
+ This paper aims to address universal segmentation for image and video +perception with the strong reasoning ability empowered by Visual Large Language +Models (VLLMs). Despite significant progress in current unified segmentation +methods, limitations in adaptation to both image and video scenarios, as well +as the complex reasoning segmentation, make it difficult for them to handle +various challenging instructions and achieve an accurate understanding of +fine-grained vision-language correlations. We propose HyperSeg, the first +VLLM-based universal segmentation model for pixel-level image and video +perception, encompassing generic segmentation tasks and more complex reasoning +perception tasks requiring powerful reasoning abilities and world knowledge. +Besides, to fully leverage the recognition capabilities of VLLMs and the +fine-grained visual information, HyperSeg incorporates hybrid entity +recognition and fine-grained visual perceiver modules for various segmentation +tasks. Combined with the temporal adapter, HyperSeg achieves a comprehensive +understanding of temporal information. Experimental results validate the +effectiveness of our insights in resolving universal image and video +segmentation tasks, including the more complex reasoning perception tasks. Our +code is available. + +
+
+
+
+
+ + ♻ ☆ Predicting and Enhancing the Fairness of DNNs with the Curvature of + Perceptual Manifolds CVPR 2023 + + +
+ To address the challenges of long-tailed classification, researchers have
+proposed several approaches to reduce model bias, most of which assume that
+classes with few samples are weak classes. However, recent studies have shown
+that tail classes are not always hard to learn, and model bias has been
+observed on sample-balanced datasets, suggesting the existence of other factors
+that affect model bias. In this work, we first establish a geometric
+perspective for analyzing model fairness and then systematically propose a
+series of geometric measurements for perceptual manifolds in deep neural
+networks. Subsequently, we comprehensively explore the effect of the geometric
+characteristics of perceptual manifolds on classification difficulty and how
+learning shapes the geometric characteristics of perceptual manifolds. An
+unanticipated finding is that the correlation between the class accuracy and
+the separation degree of perceptual manifolds gradually decreases during
+training, while the negative correlation with the curvature gradually
+increases, implying that curvature imbalance leads to model bias. Building upon
+these observations, we propose curvature regularization to facilitate the model
+to learn curvature-balanced and flatter perceptual manifolds. Evaluations on
+multiple long-tailed and non-long-tailed datasets show the excellent
+performance and exciting generality of our approach, especially in achieving
+significant performance improvements based on current state-of-the-art
+techniques. Our work opens up a geometric analysis perspective on model bias
+and reminds researchers to pay attention to model bias on non-long-tailed and
+even sample-balanced datasets.
+
+</p>
+
+ comment: 17pages, Accepted by CVPR 2023, Submitted to TPAMI +
+
+
+
+
+ + ♻ ☆ LeanGaussian: Breaking Pixel or Point Cloud Correspondence in Modeling + 3D Gaussians + + +
+ Recently, Gaussian splatting has demonstrated significant success in novel
+view synthesis. Current methods often regress Gaussians with pixel or point
+cloud correspondence, linking each Gaussian with a pixel or a 3D point. This
+leads to the redundancy of Gaussians being used to overfit the correspondence
+rather than the objects represented by the 3D Gaussians themselves,
+consequently wasting resources and lacking accurate geometries or textures. In
+this paper, we introduce LeanGaussian, a novel approach that treats each query
+in deformable Transformer as one 3D Gaussian ellipsoid, breaking the pixel or
+point cloud correspondence constraints. We leverage deformable decoder to
+iteratively refine the Gaussians layer-by-layer with the image features as keys
+and values. Notably, the center of each 3D Gaussian is defined as 3D reference
+points, which are then projected onto the image for deformable attention in 2D
+space. On both the ShapeNet SRN dataset (category level) and the Google Scanned
+Objects dataset (open-category level, trained with the Objaverse dataset), our
+approach outperforms prior methods by approximately 6.1\%, achieving a PSNR of
+25.44 and 22.36, respectively. Additionally, our method achieves a 3D
+reconstruction speed of 7.2 FPS and rendering speed 500 FPS. The code will be
+released at https://github.com/jwubz123/DIG3D.
+
+</p>
+
+
+
+
+ + ♻ ☆ Towards Complementary Knowledge Distillation for Efficient Dense Image + Prediction + + +
+ It has been revealed that small efficient dense image prediction (EDIP) +models, trained using the knowledge distillation (KD) framework, encounter two +key challenges, including maintaining boundary region completeness and +preserving target region connectivity, despite their favorable capacity to +recognize main object regions. In this work, we propose a complementary +boundary and context distillation (BCD) method within the KD framework for +EDIPs, which facilitates the targeted knowledge transfer from large accurate +teacher models to compact efficient student models. Specifically, the boundary +distillation component focuses on extracting explicit object-level semantic +boundaries from the hierarchical feature maps of the backbone network to +enhance the student model's mask quality in boundary regions. Concurrently, the +context distillation component leverages self-relations as a bridge to transfer +implicit pixel-level contexts from the teacher model to the student model, +ensuring strong connectivity in target regions. Our proposed BCD method is +specifically designed for EDIP tasks and is characterized by its simplicity and +efficiency. Extensive experimental results across semantic segmentation, object +detection, and instance segmentation on various representative datasets +demonstrate that our method can outperform existing methods without requiring +extra supervisions or incurring increased inference costs, resulting in +well-defined object boundaries and smooth connecting regions. + +
+
+ comment: under submission +
+
+
+
+
+ + ♻ ☆ EchoSight: Advancing Visual-Language Models with Wiki Knowledge EMNLP 2024 + + +
+ Knowledge-based Visual Question Answering (KVQA) tasks require answering +questions about images using extensive background knowledge. Despite +significant advancements, generative models often struggle with these tasks due +to the limited integration of external knowledge. In this paper, we introduce +EchoSight, a novel multimodal Retrieval-Augmented Generation (RAG) framework +that enables large language models (LLMs) to answer visual questions requiring +fine-grained encyclopedic knowledge. To strive for high-performing retrieval, +EchoSight first searches wiki articles by using visual-only information, +subsequently, these candidate articles are further reranked according to their +relevance to the combined text-image query. This approach significantly +improves the integration of multimodal knowledge, leading to enhanced retrieval +outcomes and more accurate VQA responses. Our experimental results on the +Encyclopedic VQA and InfoSeek datasets demonstrate that EchoSight establishes +new state-of-the-art results in knowledge-based VQA, achieving an accuracy of +41.8% on Encyclopedic VQA and 31.3% on InfoSeek. + +
+
+ comment: Accepted by EMNLP 2024 findings; Project Page: + https://go2heart.github.io/echosight +
+
+
+
+
+ + ♻ ☆ GlocalCLIP: Object-agnostic Global-Local Prompt Learning for Zero-shot + Anomaly Detection + + +
+ Zero-shot anomaly detection (ZSAD) is crucial for detecting anomalous +patterns in target datasets without using training samples, specifically in +scenarios where there are distributional differences between the target domain +and training data or where data scarcity arises because of restricted access. +Although recently pretrained vision-language models demonstrate strong +zero-shot performance across various visual tasks, they focus on learning class +semantics, which makes their direct application to ZSAD challenging. To address +this scenario, we propose GlocalCLIP, which uniquely separates global and local +prompts and jointly optimizes them. This approach enables the object-agnostic +glocal semantic prompt to effectively capture general normal and anomalous +patterns without dependency on specific objects in the image. We refine the +text prompts for more precise adjustments by utilizing deep-text prompt tuning +in the text encoder. In the vision encoder, we apply V-V attention layers to +capture detailed local image features. Finally, we introduce glocal contrastive +learning to improve the complementary learning of global and local prompts, +effectively detecting anomalous patterns across various domains. The +generalization performance of GlocalCLIP in ZSAD was demonstrated on 15 +real-world datasets from both the industrial and medical domains, achieving +superior performance compared to existing methods. Code will be made available +at https://github.com/YUL-git/GlocalCLIP. + +
+
+ comment: 29 pages, 36 figures +
+
+
+
+
+ + ♻ ☆ Differentiable Voxel-based X-ray Rendering Improves Sparse-View 3D CBCT + Reconstruction + + +
+ We present DiffVox, a self-supervised framework for Cone-Beam Computed +Tomography (CBCT) reconstruction by directly optimizing a voxelgrid +representation using physics-based differentiable X-ray rendering. Further, we +investigate how the different implementations of the X-ray image formation +model in the renderer affect the quality of 3D reconstruction and novel view +synthesis. When combined with our regularized voxel-based learning framework, +we find that using an exact implementation of the discrete Beer-Lambert law for +X-ray attenuation in the renderer outperforms both widely used iterative CBCT +reconstruction algorithms and modern neural field approaches, particularly when +given only a few input views. As a result, we reconstruct high-fidelity 3D CBCT +volumes from fewer X-rays, potentially reducing ionizing radiation exposure and +improving diagnostic utility. Our implementation is available at +https://github.com/hossein-momeni/DiffVox. + +
+
+
+
+
+ + ♻ ☆ Det-SAM2:Technical Report on the Self-Prompting Segmentation Framework + Based on Segment Anything Model 2 + + +
+ Segment Anything Model 2 (SAM2) demonstrates exceptional performance in video +segmentation and refinement of segmentation results. We anticipate that it can +further evolve to achieve higher levels of automation for practical +applications. Building upon SAM2, we conducted a series of practices that +ultimately led to the development of a fully automated pipeline, termed +Det-SAM2, in which object prompts are automatically generated by a detection +model to facilitate inference and refinement by SAM2. This pipeline enables +inference on infinitely long video streams with constant VRAM and RAM usage, +all while preserving the same efficiency and accuracy as the original SAM2. + This technical report focuses on the construction of the overall Det-SAM2 +framework and the subsequent engineering optimization applied to SAM2. We +present a case demonstrating an application built on the Det-SAM2 framework: AI +refereeing in a billiards scenario, derived from our business context. The +project at \url{https://github.com/motern88/Det-SAM2}. + +
+
+
+
+
+ + ♻ ☆ HGCLIP: Exploring Vision-Language Models with Graph Representations for + Hierarchical Understanding COLING 2025 + + +
+ Object categories are typically organized into a multi-granularity taxonomic +hierarchy. When classifying categories at different hierarchy levels, +traditional uni-modal approaches focus primarily on image features, revealing +limitations in complex scenarios. Recent studies integrating Vision-Language +Models (VLMs) with class hierarchies have shown promise, yet they fall short of +fully exploiting the hierarchical relationships. These efforts are constrained +by their inability to perform effectively across varied granularity of +categories. To tackle this issue, we propose a novel framework (HGCLIP) that +effectively combines CLIP with a deeper exploitation of the Hierarchical class +structure via Graph representation learning. We explore constructing the class +hierarchy into a graph, with its nodes representing the textual or image +features of each category. After passing through a graph encoder, the textual +features incorporate hierarchical structure information, while the image +features emphasize class-aware features derived from prototypes through the +attention mechanism. Our approach demonstrates significant improvements on 11 +diverse visual recognition benchmarks. Our codes are fully available at +https://github.com/richard-peng-xia/HGCLIP. + +
+
+ comment: COLING 2025 +
+
+
+
+
+ + ♻ ☆ Interpreting and Improving Attention From the Perspective of Large + Kernel Convolution + + +
+ Attention mechanisms have significantly advanced visual models by capturing
+global context effectively. However, their reliance on large-scale datasets and
+substantial computational resources poses challenges in data-scarce and
+resource-constrained scenarios. Moreover, traditional self-attention mechanisms
+lack inherent spatial inductive biases, making them suboptimal for modeling
+local features critical to tasks involving smaller datasets. In this work, we
+introduce Large Kernel Convolutional Attention (LKCA), a novel formulation that
+reinterprets attention operations as a single large-kernel convolution. This
+design unifies the strengths of convolutional architectures (locality and
+translation invariance) with the global context modeling capabilities of
+self-attention. By embedding these properties into a computationally efficient
+framework, LKCA addresses key limitations of traditional attention mechanisms.
+The proposed LKCA achieves competitive performance across various visual tasks,
+particularly in data-constrained settings. Experimental results on CIFAR-10,
+CIFAR-100, SVHN, and Tiny-ImageNet demonstrate its ability to excel in image
+classification, outperforming conventional attention mechanisms and vision
+transformers in compact model settings. These findings highlight the
+effectiveness of LKCA in bridging local and global feature modeling, offering a
+practical and robust solution for real-world applications with limited data and
+resources.
+
+</p>
+
+
+
+
+
+
+
+ + Artificial Intelligence 60 + +
+
+
+ + ♻ ☆ Compute-Constrained Data Selection + + +
+ Data selection can reduce the amount of training data needed to finetune +LLMs; however, the efficacy of data selection scales directly with its compute. +Motivated by the practical challenge of compute-constrained finetuning, we +consider the setting in which both the cost of selecting data and training are +budgeted for. We first formalize the problem of data selection with a +cost-aware utility function, and model the data selection problem as trading +off initial-selection cost for training gain. We run a comprehensive sweep of +experiments across multiple tasks, varying compute budget by scaling finetuning +tokens, model sizes, and data selection compute. Interestingly we find that +many powerful data selection methods are almost never compute-optimal, and that +cheaper data selection alternatives dominate both from a theoretical and +empirical perspective. For compute-optimal training, we find that perplexity +and gradient data selection require training-to-selection model size ratios of +5x and 10x, respectively. + +
+
+
+
+
+ + ♻ ☆ Inference Scaling fLaws: The Limits of LLM Resampling with Imperfect + Verifiers + + +
+ Recent research has generated hope that inference scaling could allow weaker +language models to match or exceed the accuracy of stronger models, such as by +repeatedly sampling solutions to a coding problem until it passes unit tests. +The central thesis of this paper is that there is no free lunch for inference +scaling: indefinite accuracy improvement through resampling can only be +realized if the "verifier" (in this case, a set of unit tests) is perfect. When +the verifier is imperfect, as it almost always is in domains such as reasoning +or coding (for example, unit tests have imperfect coverage), there is a nonzero +probability of false positives: incorrect solutions that pass the verifier. +Resampling cannot decrease this probability, so it imposes an upper bound to +the accuracy of resampling-based inference scaling even with an infinite +compute budget. We find that there is a very strong correlation between the +model's single-sample accuracy (i.e. accuracy without unit tests) and its false +positive rate on coding benchmarks HumanEval and MBPP, whose unit tests have +limited coverage. Therefore, no amount of inference scaling of weaker models +can enable them to match the single-sample accuracy of a sufficiently strong +model (Fig. 1a). When we consider that false positives have a negative utility +compared to abstaining from producing a solution, it bends the inference +scaling curve further downward. Empirically, we find that the optimal number of +samples can be less than 10 under realistic assumptions (Fig. 1b). Finally, we +show that beyond accuracy, false positives may have other undesirable +qualities, such as poor adherence to coding style conventions. + +
+
+
+
+
+ + ♻ ☆ CREW: Facilitating Human-AI Teaming Research + + +
+ With the increasing deployment of artificial intelligence (AI) technologies, +the potential of humans working with AI agents has been growing at a great +speed. Human-AI teaming is an important paradigm for studying various aspects +when humans and AI agents work together. The unique aspect of Human-AI teaming +research is the need to jointly study humans and AI agents, demanding +multidisciplinary research efforts from machine learning to human-computer +interaction, robotics, cognitive science, neuroscience, psychology, social +science, and complex systems. However, existing platforms for Human-AI teaming +research are limited, often supporting oversimplified scenarios and a single +task, or specifically focusing on either human-teaming research or multi-agent +AI algorithms. We introduce CREW, a platform to facilitate Human-AI teaming +research in real-time decision-making scenarios and engage collaborations from +multiple scientific disciplines, with a strong emphasis on human involvement. +It includes pre-built tasks for cognitive studies and Human-AI teaming with +expandable potentials from our modular design. Following conventional cognitive +neuroscience research, CREW also supports multimodal human physiological signal +recording for behavior analysis. Moreover, CREW benchmarks real-time +human-guided reinforcement learning agents using state-of-the-art algorithms +and well-tuned baselines. With CREW, we were able to conduct 50 human subject +studies within a week to verify the effectiveness of our benchmark. + +
+
+ comment: Our project website is at: http://generalroboticslab.com/CREW +
+
+
+
+
+ + ♻ ☆ Inducing Group Fairness in Prompt-Based Language Model Decisions + + +
+ Classifiers are used throughout industry to enforce policies, ranging from
+the detection of toxic content to age-appropriate content filtering. While
+these classifiers serve important functions, it is also essential that they are
+built in ways that minimize unfair biases for users.
+ One such fairness consideration is called group fairness, which desires that
+different sub-populations of users receive equal treatment. This is a
+well-studied problem in the context of 'classical' classifiers. However, the
+emergence of prompt-based language model (LM) decision making has created new
+opportunities to solve text-based classification tasks, and the fairness
+properties of these new classifiers are not yet well understood. Further, the
+`remediation toolkit' is incomplete for LM-based decision makers and little is
+understood about how to improve decision maker group fairness while maintaining
+classifier performance.
+ This work sets out to add more tools to that toolbox. We introduce
+adaptations of existing effective approaches from the classical classifier
+fairness to the prompt-based classifier space. We also devise simple methods
+that take advantage of the new structure of prompt-based decision makers and
+operate at the prompt level. We compare these approaches empirically on real
+data. Our results suggest that adaptations of approaches that are effective for
+classical classifiers remain effective in the LM-based classifier environment.
+However, there is room for further exploration of prompt-based remediation
+methods (and other remediation methods that take advantage of LM structure).
+
+
+
+
+
+ + ♻ ☆ RIRAG: Regulatory Information Retrieval and Answer Generation + + +
+ Regulatory documents, issued by governmental regulatory bodies, establish
+rules, guidelines, and standards that organizations must adhere to for legal
+compliance. These documents, characterized by their length, complexity and
+frequent updates, are challenging to interpret, requiring significant
+allocation of time and expertise on the part of organizations to ensure ongoing
+compliance. Regulatory Natural Language Processing (RegNLP) is a
+multidisciplinary field aimed at simplifying access to and interpretation of
+regulatory rules and obligations. We introduce a task of generating
+question-passage pairs, where questions are automatically created and paired
+with relevant regulatory passages, facilitating the development of regulatory
+question-answering systems. We create the ObliQA dataset, containing 27,869
+questions derived from the collection of Abu Dhabi Global Markets (ADGM)
+financial regulation documents, design a baseline Regulatory Information
+Retrieval and Answer Generation (RIRAG) system and evaluate it with RePASs, a
+novel evaluation metric that tests whether generated answers accurately capture
+all relevant obligations while avoiding contradictions.
+
+
+
+
+
+ + ♻ ☆ OminiControl: Minimal and Universal Control for Diffusion Transformer + + +
+ In this paper, we introduce OminiControl, a highly versatile and +parameter-efficient framework that integrates image conditions into pre-trained +Diffusion Transformer (DiT) models. At its core, OminiControl leverages a +parameter reuse mechanism, enabling the DiT to encode image conditions using +itself as a powerful backbone and process them with its flexible multi-modal +attention processors. Unlike existing methods, which rely heavily on additional +encoder modules with complex architectures, OminiControl (1) effectively and +efficiently incorporates injected image conditions with only ~0.1% additional +parameters, and (2) addresses a wide range of image conditioning tasks in a +unified manner, including subject-driven generation and spatially-aligned +conditions such as edges, depth, and more. Remarkably, these capabilities are +achieved by training on images generated by the DiT itself, which is +particularly beneficial for subject-driven generation. Extensive evaluations +demonstrate that OminiControl outperforms existing UNet-based and DiT-adapted +models in both subject-driven and spatially-aligned conditional generation. +Additionally, we release our training dataset, Subjects200K, a diverse +collection of over 200,000 identity-consistent images, along with an efficient +data synthesis pipeline to advance research in subject-consistent generation. + +
+
+
+
+
+ + ♻ ☆ What Differentiates Educational Literature? A Multimodal Fusion Approach + of Transformers and Computational Linguistics + + +
+ The integration of new literature into the English curriculum remains a +challenge since educators often lack scalable tools to rapidly evaluate +readability and adapt texts for diverse classroom needs. This study proposes to +address this gap through a multimodal approach that combines transformer-based +text classification with linguistic feature analysis to align texts with UK Key +Stages. Eight state-of-the-art Transformers were fine-tuned on segmented text +data, with BERT achieving the highest unimodal F1 score of 0.75. In parallel, +500 deep neural network topologies were searched for the classification of +linguistic characteristics, achieving an F1 score of 0.392. The fusion of these +modalities shows a significant improvement, with every multimodal approach +outperforming all unimodal models. In particular, the ELECTRA Transformer fused +with the neural network achieved an F1 score of 0.996. Unimodal and multimodal +approaches are shown to have statistically significant differences in all +validation metrics (accuracy, precision, recall, F1 score) except for inference +time. The proposed approach is finally encapsulated in a stakeholder-facing web +application, providing non-technical stakeholder access to real-time insights +on text complexity, reading difficulty, curriculum alignment, and +recommendations for learning age range. The application empowers data-driven +decision making and reduces manual workload by integrating AI-based +recommendations into lesson planning for English literature. + +
+
+
+
+
+ + ♻ ☆ End-to-End Supervised Hierarchical Graph Clustering for Speaker + Diarization + + +
+ Speaker diarization, the task of segmenting an audio recording based on
+speaker identity, constitutes an important speech pre-processing step for
+several downstream applications. The conventional approach to diarization
+involves multiple steps of embedding extraction and clustering, which are often
+optimized in an isolated fashion. While end-to-end diarization systems attempt
+to learn a single model for the task, they are often cumbersome to train and
+require large supervised datasets. In this paper, we propose an end-to-end
+supervised hierarchical clustering algorithm based on graph neural networks
+(GNN), called End-to-end Supervised HierARchical Clustering (E-SHARC). The
+embedding extractor is initialized using a pre-trained x-vector model while the
+GNN model is trained initially using the x-vector embeddings from the
+pre-trained model. Finally, the E-SHARC model uses the front-end mel-filterbank
+features as input and jointly optimizes the embedding extractor and the GNN
+clustering module, performing representation learning, metric learning, and
+clustering with end-to-end optimization. Further, with additional inputs from
+an external overlap detector, the E-SHARC approach is capable of predicting the
+speakers in the overlapping speech regions. The experimental evaluation on
+benchmark datasets like AMI, Voxconverse and DISPLACE, illustrates that the
+proposed E-SHARC framework provides competitive diarization results using graph
+based clustering methods.
+
+
+ comment: 11 pages. Under review IEEE TASLP. © 2024 IEEE
+
+
+
+
+
+ + ♻ ☆ A Conditional Independence Test in the Presence of Discretization + + +
+ Testing conditional independence has many applications, such as in Bayesian +network learning and causal discovery. Different test methods have been +proposed. However, existing methods generally can not work when only +discretized observations are available. Specifically, consider $X_1$, +$\tilde{X}_2$ and $X_3$ are observed variables, where $\tilde{X}_2$ is a +discretization of latent variables $X_2$. Applying existing test methods to the +observations of $X_1$, $\tilde{X}_2$ and $X_3$ can lead to a false conclusion +about the underlying conditional independence of variables $X_1$, $X_2$ and +$X_3$. Motivated by this, we propose a conditional independence test +specifically designed to accommodate the presence of such discretization. To +achieve this, we design the bridge equations to recover the parameter +reflecting the statistical information of the underlying latent continuous +variables. An appropriate test statistic and its asymptotic distribution under +the null hypothesis of conditional independence have also been derived. Both +theoretical results and empirical validation have been provided, demonstrating +the effectiveness of our test methods. + +
+
+
+
+
+ + ♻ ☆ ForecastBench: A Dynamic Benchmark of AI Forecasting Capabilities + + +
+ Forecasts of future events are essential inputs into informed +decision-making. Machine learning (ML) systems have the potential to deliver +forecasts at scale, but there is no framework for evaluating the accuracy of ML +systems on a standardized set of forecasting questions. To address this gap, we +introduce ForecastBench: a dynamic benchmark that evaluates the accuracy of ML +systems on an automatically generated and regularly updated set of 1,000 +forecasting questions. To avoid any possibility of data leakage, ForecastBench +is comprised solely of questions about future events that have no known answer +at the time of submission. We quantify the capabilities of current ML systems +by collecting forecasts from expert (human) forecasters, the general public, +and LLMs on a random subset of questions from the benchmark ($N=200$). While +LLMs have achieved super-human performance on many benchmarks, they perform +less well here: expert forecasters outperform the top-performing LLM (p-value +$<0.01$). We display system and human scores in a public leaderboard at +www.forecastbench.org. + +
+
+
+
+
+ + ♻ ☆ On Meta-Prompting + + +
+ Modern generative language models are capable of interpreting input strings +as instructions, or prompts, and carry out tasks based on them. Many approaches +to prompting and pre-training these models involve the automated generation of +these prompts: meta-prompting, or prompting to obtain prompts. We propose a +theoretical framework based on category theory to generalize and describe them. +This framework is flexible enough to account for stochasticity, and allows us +to obtain formal results around task agnosticity and equivalence of various +meta-prompting approaches. Experimentally, we test our framework in two active +areas of model research: creativity and ideation. We find that user preference +strongly favors (p < 0.01) the prompts generated under meta-prompting, as well +as their corresponding outputs, over a series of hardcoded baseline prompts +that include the original task definition. Using our framework, we argue that +meta-prompting is more effective than basic prompting at generating desirable +outputs. + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ VisScience: An Extensive Benchmark for Evaluating K12 Educational + Multi-modal Scientific Reasoning + + +
+ Multi-modal large language models (MLLMs) have demonstrated promising
+capabilities across various tasks by integrating textual and visual information
+to achieve visual understanding in complex scenarios. Despite the availability
+of several benchmarks aimed at evaluating MLLMs in tasks from visual question
+answering to complex problem-solving, most focus predominantly on mathematics
+or general visual understanding tasks. This reveals a critical gap in current
+benchmarks, which often overlook the inclusion of other key scientific
+disciplines such as physics and chemistry. To address this gap, we meticulously
+construct a comprehensive benchmark, named VisScience, which is utilized to
+assess the multi-modal scientific reasoning across the three disciplines of
+mathematics, physics, and chemistry. This benchmark comprises 3,000 questions
+drawn from K12 education - spanning elementary school through high school -
+equally distributed across three disciplines, with 1,000 questions per
+discipline. The questions within VisScience span 21 distinct subjects and are
+categorized into five difficulty levels, offering a broad spectrum of topics
+within each discipline. With VisScience, we present a detailed evaluation of
+the performance of 25 representative MLLMs in scientific reasoning.
+Experimental results demonstrate that closed-source MLLMs generally outperform
+open-source models. The best performances observed include a 53.4\% accuracy in
+mathematics by Claude3.5-Sonnet, 38.2\% in physics by GPT-4o, and 47.0\% in
+chemistry by Gemini-1.5-Pro. These results underscore the strengths and
+limitations of MLLMs, suggesting areas for future improvement and highlighting
+the importance of developing models that can effectively handle the diverse
+demands of multi-modal scientific reasoning.
+
+
+ comment: 89 pages, 70 figures +
+
+
+
+
+ + ♻ MathGLM-Vision: Solving Mathematical Problems with Multi-Modal Large + Language Model + + +
+ Large language models (LLMs) have demonstrated significant capabilities in
+mathematical reasoning, particularly with text-based mathematical problems.
+However, current multi-modal large language models (MLLMs), especially those
+specialized in mathematics, tend to focus predominantly on solving geometric
+problems but ignore the diversity of visual information available in other
+areas of mathematics. Moreover, the geometric information for these specialized
+mathematical MLLMs is derived from several public datasets, which are typically
+limited in diversity and complexity. To address these limitations, we aim to
+construct a fine-tuning dataset named MathVL, and develop a series of
+specialized mathematical MLLMs termed MathGLM-Vision by conducting Supervised
+Fine-Tuning (SFT) on MathVL with various parameter-scale backbones. To
+extensively evaluate the effectiveness of MathGLM-Vision, we conduct
+experiments on several public benchmarks and our curated MathVL-test consisting
+of 2,000 problems. Experimental results demonstrate that MathGLM-Vision
+achieves significant improvements compared with some existing models, including
+backbone models and open-source mathematical MLLMs. These findings indicate the
+importance of dataset diversity in enhancing the mathematical reasoning
+abilities of MLLMs.
+
+
+ comment: 30 pages,19 figures +
+
+
+
+
+ + ♻ ☆ Continual Learning in the Presence of Repetition CVPR + + +
+ Continual learning (CL) provides a framework for training models in +ever-evolving environments. Although re-occurrence of previously seen objects +or tasks is common in real-world problems, the concept of repetition in the +data stream is not often considered in standard benchmarks for CL. Unlike with +the rehearsal mechanism in buffer-based strategies, where sample repetition is +controlled by the strategy, repetition in the data stream naturally stems from +the environment. This report provides a summary of the CLVision challenge at +CVPR 2023, which focused on the topic of repetition in class-incremental +learning. The report initially outlines the challenge objective and then +describes three solutions proposed by finalist teams that aim to effectively +exploit the repetition in the stream to learn continually. The experimental +results from the challenge highlight the effectiveness of ensemble-based +solutions that employ multiple versions of similar modules, each trained on +different but overlapping subsets of classes. This report underscores the +transformative potential of taking a different perspective in CL by employing +repetition in the data stream to foster innovative strategy design. + +
+
+ comment: Accepted version, to appear in Neural Networks; Challenge Report of + the 4th Workshop on Continual Learning in Computer Vision at CVPR +
+
+
+
+
+ + ♻ ☆ Free-Mask: A Novel Paradigm of Integration Between the Segmentation + Diffusion Model and Image Editing to Improve Segmentation Ability + + +
+ Current semantic segmentation models typically require a substantial amount +of manually annotated data, a process that is both time-consuming and +resource-intensive. Alternatively, leveraging advanced text-to-image models +such as Midjourney and Stable Diffusion has emerged as an efficient strategy, +enabling the automatic generation of synthetic data in place of manual +annotations. However, previous methods have been limited to generating +single-instance images, as the generation of multiple instances with Stable +Diffusion has proven unstable. To address this limitation and expand the scope +and diversity of synthetic datasets, we propose a framework \textbf{Free-Mask} +that combines a Diffusion Model for segmentation with advanced image editing +capabilities, allowing for the integration of multiple objects into images via +text-to-image models. Our method facilitates the creation of highly realistic +datasets that closely emulate open-world environments while generating accurate +segmentation masks. It reduces the labor associated with manual annotation and +also ensures precise mask generation. Experimental results demonstrate that +synthetic data generated by \textbf{Free-Mask} enables segmentation models to +outperform those trained on real data, especially in zero-shot settings. +Notably, \textbf{Free-Mask} achieves new state-of-the-art results on previously +unseen classes in the VOC 2012 benchmark. + +
+
+ comment: 16 pages,5 figures,5 tables +
+
+
+
+
+ + ♻ ☆ InvDesFlow: An AI search engine to explore possible high-temperature + superconductors + + +
+ The discovery of new superconducting materials, particularly those exhibiting +high critical temperature ($T_c$), has been a vibrant area of study within the +field of condensed matter physics. Conventional approaches primarily rely on +physical intuition to search for potential superconductors within the existing +databases. However, the known materials only scratch the surface of the +extensive array of possibilities within the realm of materials. Here, we +develop InvDesFlow, an AI search engine that integrates deep model pre-training +and fine-tuning techniques, diffusion models, and physics-based approaches +(e.g., first-principles electronic structure calculation) for the discovery of +high-$T_c$ superconductors. Utilizing InvDesFlow, we have obtained 74 +dynamically stable materials with critical temperatures predicted by the AI +model to be $T_c \geq$ 15 K based on a very small set of samples. Notably, +these materials are not contained in any existing dataset. Furthermore, we +analyze trends in our dataset and individual materials including B$_4$CN$_3$ +(at 5 GPa) and B$_5$CN$_2$ (at ambient pressure) whose $T_c$s are 24.08 K and +15.93 K, respectively. We demonstrate that AI technique can discover a set of +new high-$T_c$ superconductors, outline its potential for accelerating +discovery of the materials with targeted properties. + +
+
+ comment: 22 pages, 17 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ Moral Alignment for LLM Agents + + +
+ Decision-making agents based on pre-trained Large Language Models (LLMs) are +increasingly being deployed across various domains of human activity. While +their applications are currently rather specialized, several research efforts +are under way to develop more generalist agents. As LLM-based systems become +more agentic, their influence on human activity will grow and the transparency +of this will decrease. Consequently, developing effective methods for aligning +them to human values is vital. + The prevailing practice in alignment often relies on human preference data +(e.g., in RLHF or DPO), in which values are implicit and are essentially +deduced from relative preferences over different model outputs. In this work, +instead of relying on human feedback, we introduce the design of reward +functions that explicitly encode core human values for Reinforcement +Learning-based fine-tuning of foundation agent models. Specifically, we use +intrinsic rewards for the moral alignment of LLM agents. + We evaluate our approach using the traditional philosophical frameworks of +Deontological Ethics and Utilitarianism, quantifying moral rewards for agents +in terms of actions and consequences on the Iterated Prisoner's Dilemma (IPD) +environment. We also show how moral fine-tuning can be deployed to enable an +agent to unlearn a previously developed selfish strategy. Finally, we find that +certain moral strategies learned on the IPD game generalize to several other +matrix game environments. In summary, we demonstrate that fine-tuning with +intrinsic rewards is a promising general solution for aligning LLM agents to +human values, and it might represent a more transparent and cost-effective +alternative to currently predominant alignment techniques. + +
+
+
+
+
+ + ♻ ☆ Enriching Ontologies with Disjointness Axioms using Large Language + Models ISWC 2024 + + +
+ Ontologies often lack explicit disjointness declarations between classes, +despite their usefulness for sophisticated reasoning and consistency checking +in Knowledge Graphs. In this study, we explore the potential of Large Language +Models (LLMs) to enrich ontologies by identifying and asserting class +disjointness axioms. Our approach aims at leveraging the implicit knowledge +embedded in LLMs, using prompt engineering to elicit this knowledge for +classifying ontological disjointness. We validate our methodology on the +DBpedia ontology, focusing on open-source LLMs. Our findings suggest that LLMs, +when guided by effective prompt strategies, can reliably identify disjoint +class relationships, thus streamlining the process of ontology completion +without extensive manual input. For comprehensive disjointness enrichment, we +propose a process that takes logical relationships between disjointness and +subclass statements into account in order to maintain satisfiability and reduce +the number of calls to the LLM. This work provides a foundation for future +applications of LLMs in automated ontology enhancement and offers insights into +optimizing LLM performance through strategic prompt design. Our code is +publicly available on GitHub at https://github.com/n28div/llm-disjointness. + +
+
+ comment: Accepted at KBC-LM'24 workshop at ISWC 2024, + https://ceur-ws.org/Vol-3853/paper1.pdf +
+
+
+
+
+ + ♻ ☆ Advances in 3D Neural Stylization: A Survey + + +
+ Modern artificial intelligence offers a novel and transformative approach to +creating digital art across diverse styles and modalities like images, videos +and 3D data, unleashing the power of creativity and revolutionizing the way +that we perceive and interact with visual content. This paper reports on recent +advances in stylized 3D asset creation and manipulation with the expressive +power of neural networks. We establish a taxonomy for neural stylization, +considering crucial design choices such as scene representation, guidance data, +optimization strategies, and output styles. Building on such taxonomy, our +survey first revisits the background of neural stylization on 2D images, and +then presents in-depth discussions on recent neural stylization methods for 3D +data, accompanied by a benchmark evaluating selected mesh and neural field +stylization methods. Based on the insights gained from the survey, we highlight +the practical significance, open challenges, future research, and potential +impacts of neural stylization, which facilitates researchers and practitioners +to navigate the rapidly evolving landscape of 3D content creation using modern +artificial intelligence. + +
+
+ comment: curated list of papers: + https://github.com/chenyingshu/advances_3d_neural_stylization +
+
+
+
+
+ + ♻ ☆ MASP: Scalable GNN-based Planning for Multi-Agent Navigation + + +
+ We investigate multi-agent navigation tasks, where multiple agents need to +reach initially unassigned goals in a limited time. Classical planning-based +methods suffer from expensive computation overhead at each step and offer +limited expressiveness for complex cooperation strategies. In contrast, +reinforcement learning (RL) has recently become a popular approach for +addressing this issue. However, RL struggles with low data efficiency and +cooperation when directly exploring (nearly) optimal policies in a large +exploration space, especially with an increased number of agents(e.g., 10+ +agents) or in complex environments (e.g., 3-D simulators). In this paper, we +propose the Multi-Agent Scalable Graph-based Planner (MASP), a goal-conditioned +hierarchical planner for navigation tasks with a substantial number of agents +in the decentralized setting. MASP employs a hierarchical framework to reduce +space complexity by decomposing a large exploration space into multiple +goal-conditioned subspaces, where a high-level policy assigns agents goals, and +a low-level policy navigates agents toward designated goals. For agent +cooperation and the adaptation to varying team sizes, we model agents and goals +as graphs to better capture their relationship. The high-level policy, the Goal +Matcher, leverages a graph-based Self-Encoder and Cross-Encoder to optimize +goal assignment by updating the agent and the goal graphs. The low-level +policy, the Coordinated Action Executor, introduces the Group Information +Fusion to facilitate group division and extract agent relationships across +groups, enhancing training efficiency for agent cooperation. The results +demonstrate that MASP outperforms RL and planning-based baselines in task +efficiency. + +
+
+ comment: Submitted to IEEE RA-L +
+
+
+
+
+ + ♻ ☆ Masked Generative Priors Improve World Models Sequence Modelling + Capabilities + + +
+ Deep Reinforcement Learning (RL) has become the leading approach for creating +artificial agents in complex environments. Model-based approaches, which are RL +methods with world models that predict environment dynamics, are among the most +promising directions for improving data efficiency, forming a critical step +toward bridging the gap between research and real-world deployment. In +particular, world models enhance sample efficiency by learning in imagination, +which involves training a generative sequence model of the environment in a +self-supervised manner. Recently, Masked Generative Modelling has emerged as a +more efficient and superior inductive bias for modelling and generating token +sequences. Building on the Efficient Stochastic Transformer-based World Models +(STORM) architecture, we replace the traditional MLP prior with a Masked +Generative Prior (e.g., MaskGIT Prior) and introduce GIT-STORM. We evaluate our +model on two downstream tasks: reinforcement learning and video prediction. +GIT-STORM demonstrates substantial performance gains in RL tasks on the Atari +100k benchmark. Moreover, we apply Transformer-based World Models to continuous +action environments for the first time, addressing a significant gap in prior +research. To achieve this, we employ a state mixer function that integrates +latent state representations with actions, enabling our model to handle +continuous control tasks. We validate this approach through qualitative and +quantitative analyses on the DeepMind Control Suite, showcasing the +effectiveness of Transformer-based World Models in this new domain. Our results +highlight the versatility and efficacy of the MaskGIT dynamics prior, paving +the way for more accurate world models and effective RL policies. + +
+
+
+
+
+ + ♻ ☆ Topology Only Pre-Training: Towards Generalised Multi-Domain Graph + Models + + +
+ The principal benefit of unsupervised representation learning is that a +pre-trained model can be fine-tuned where data or labels are scarce. Existing +approaches for graph representation learning are domain specific, maintaining +consistent node and edge features across the pre-training and target datasets. +This has precluded transfer to multiple domains. We present Topology Only +Pre-Training (ToP), a graph pre-training method based on node and edge feature +exclusion. We show positive transfer on evaluation datasets from multiple +domains, including domains not present in pre-training data, running directly +contrary to assumptions made in contemporary works. On 75% of experiments, ToP +models perform significantly $p \leq 0.01$ better than a supervised baseline. +Performance is significantly positive on 85.7% of tasks when node and edge +features are used in fine-tuning. We further show that out-of-domain topologies +can produce more useful pre-training than in-domain. Under ToP we show better +transfer from non-molecule pre-training, compared to molecule pre-training, on +79% of molecular benchmarks. Against the limited set of other generalist graph +models ToP performs strongly, including against models with many orders of +magnitude larger. These findings show that ToP opens broad areas of research in +both transfer learning on scarcely populated graph domains and in graph +foundation models. + +
+
+ comment: 28 pages, 5 figures, 5 tables. For in-development code see + https://github.com/neutralpronoun/general-gcl +
+
+
+
+
+ + ♻ ☆ Combining Induction and Transduction for Abstract Reasoning + + +
+ When learning an input-output mapping from very few examples, is it better to +first infer a latent function that explains the examples, or is it better to +directly predict new test outputs, e.g. using a neural network? We study this +question on ARC by training neural models for induction (inferring latent +functions) and transduction (directly predicting the test output for a given +test input). We train on synthetically generated variations of Python programs +that solve ARC training tasks. We find inductive and transductive models solve +different kinds of test problems, despite having the same training problems and +sharing the same neural architecture: Inductive program synthesis excels at +precise computations, and at composing multiple concepts, while transduction +succeeds on fuzzier perceptual concepts. Ensembling them approaches human-level +performance on ARC. + +
+
+
+
+
+ + ♻ ☆ Differentially Private Zeroth-Order Methods for Scalable Large Language + Model Finetuning + + +
+ Fine-tuning on task-specific datasets is a widely-embraced paradigm of
+harnessing the powerful capability of pretrained LLMs for various downstream
+tasks. Due to the popularity of LLMs fine-tuning and its accompanying privacy
+concerns, differentially private (DP) fine-tuning of pretrained LLMs has been
+widely used to safeguard the privacy of task-specific datasets. Lying at the
+design core of DP LLM fine-tuning methods is the satisfactory tradeoff among
+privacy, utility, and scalability. Most existing methods build upon the seminal
+work of DP-SGD. Despite pushing the scalability of DP-SGD to its limit,
+DP-SGD-based fine-tuning methods are unfortunately limited by the inherent
+inefficiency of SGD.
+ In this paper, we investigate the potential of DP zeroth-order methods for
+LLM pretraining, which avoids the scalability bottleneck of SGD by
+approximating the gradient with the more efficient zeroth-order gradient.
+Rather than treating the zeroth-order method as a drop-in replacement for SGD,
+this paper presents a comprehensive study both theoretically and empirically.
+First, we propose the stagewise DP zeroth-order method (DP-ZOSO) that
+dynamically schedules key hyperparameters. This design is grounded on the
+synergy between DP random perturbation and the gradient approximation error of
+the zeroth-order method, and its effect on fine-tuning trajectory.
+ We provide theoretical analysis for both proposed methods. We conduct
+extensive empirical analysis on both encoder-only masked language model and
+decoder-only autoregressive language model, achieving impressive results in
+terms of scalability and utility regardless of the class of tasks (compared
+with DPZero, DP-ZOPO improves $4.5\%$ on SST-5, $5.5\%$ on MNLI with
+RoBERTa-Large and 9.2\% on CB, 3.9\% on BoolQ with OPT-2.7b when $\epsilon=4$,
+demonstrates more significant enhancement in performance on more complicated
+tasks).
+
+
+
+
+
+ + ♻ ☆ Learning General Representation of 12-Lead Electrocardiogram with a + Joint-Embedding Predictive Architecture + + +
+ Electrocardiogram (ECG) captures the heart's electrical signals, offering +valuable information for diagnosing cardiac conditions. However, the scarcity +of labeled data makes it challenging to fully leverage supervised learning in +medical domain. Self-supervised learning (SSL) offers a promising solution, +enabling models to learn from unlabeled data and uncover meaningful patterns. +In this paper, we show that masked modeling in the latent space can be a +powerful alternative to existing self-supervised methods in the ECG domain. We +introduce ECG-JEPA, a SSL model for 12-lead ECG analysis that learns semantic +representations of ECG data by predicting in the hidden latent space, bypassing +the need to reconstruct raw signals. This approach offers several advantages in +the ECG domain: (1) it avoids producing unnecessary details, such as noise, +which is common in ECG; and (2) it addresses the limitations of na\"ive L2 loss +between raw signals. Another key contribution is the introduction of +Cross-Pattern Attention (CroPA), a specialized masked attention mechanism +tailored for 12-lead ECG data. ECG-JEPA is trained on the union of several open +ECG datasets, totaling approximately 180,000 samples, and achieves +state-of-the-art performance in various downstream tasks including ECG +classification and feature prediction. Our code is openly available at +https://github.com/sehunfromdaegu/ECG_JEPA. + +
+
+
+
+
+ + ♻ ☆ VQA$^2$: Visual Question Answering for Video Quality Assessment + + +
+ The advent and proliferation of large multi-modal models (LMMs) have +introduced new paradigms to computer vision, transforming various tasks into a +unified visual question answering framework. Video Quality Assessment (VQA), a +classic field in low-level visual perception, focused initially on quantitative +video quality scoring. However, driven by advances in LMMs, it is now +progressing toward more holistic visual quality understanding tasks. Recent +studies in the image domain have demonstrated that Visual Question Answering +(VQA) can markedly enhance low-level visual quality evaluation. Nevertheless, +related work has not been explored in the video domain, leaving substantial +room for improvement. To address this gap, we introduce the VQA2 Instruction +Dataset - the first visual question answering instruction dataset that focuses +on video quality assessment. This dataset consists of 3 subsets and covers +various video types, containing 157,755 instruction question-answer pairs. +Then, leveraging this foundation, we present the VQA2 series models. The VQA2 +series models interleave visual and motion tokens to enhance the perception of +spatial-temporal quality details in videos. We conduct extensive experiments on +video quality scoring and understanding tasks, and results demonstrate that the +VQA2series models achieve excellent performance in both tasks. Notably, our +final model, the VQA2-Assistant, exceeds the renowned GPT-4o in visual quality +understanding tasks while maintaining strong competitiveness in quality scoring +tasks. Our work provides a foundation and feasible approach for integrating +low-level video quality assessment and understanding with LMMs. + +
+
+ comment: 23 pages 12 figures +
+
+
+
+
+ + ♻ ☆ Revisiting MAE pre-training for 3D medical image segmentation + + +
+ Self-Supervised Learning (SSL) presents an exciting opportunity to unlock the +potential of vast, untapped clinical datasets, for various downstream +applications that suffer from the scarcity of labeled data. While SSL has +revolutionized fields like natural language processing and computer vision, its +adoption in 3D medical image computing has been limited by three key pitfalls: +Small pre-training dataset sizes, architectures inadequate for 3D medical image +analysis, and insufficient evaluation practices. In this paper, we address +these issues by i) leveraging a large-scale dataset of 39k 3D brain MRI volumes +and ii) using a Residual Encoder U-Net architecture within the state-of-the-art +nnU-Net framework. iii) A robust development framework, incorporating 5 +development and 8 testing brain MRI segmentation datasets, allowed +performance-driven design decisions to optimize the simple concept of Masked +Auto Encoders (MAEs) for 3D CNNs. The resulting model not only surpasses +previous SSL methods but also outperforms the strong nnU-Net baseline by an +average of approximately 3 Dice points setting a new state-of-the-art. Our code +and models are made available here. + +
+
+ comment: Arxiv Preprint. Revised and under review +
+
+
+
+
+ + ♻ ☆ Protecting Federated Learning from Extreme Model Poisoning Attacks via + Multidimensional Time Series Anomaly Detection + + +
+ Current defense mechanisms against model poisoning attacks in federated +learning (FL) systems have proven effective up to a certain threshold of +malicious clients. In this work, we introduce FLANDERS, a novel pre-aggregation +filter for FL resilient to large-scale model poisoning attacks, i.e., when +malicious clients far exceed legitimate participants. FLANDERS treats the +sequence of local models sent by clients in each FL round as a matrix-valued +time series. Then, it identifies malicious client updates as outliers in this +time series by comparing actual observations with estimates generated by a +matrix autoregressive forecasting model maintained by the server. Experiments +conducted in several non-iid FL setups show that FLANDERS significantly +improves robustness across a wide spectrum of attacks when paired with standard +and robust existing aggregation methods. + +
+
+
+
+
+ + ♻ ☆ An Architectural Approach to Enhance Deep Long-Tailed Learning + + +
+ Deep long-tailed recognition has been widely studied to address the issue of +imbalanced data distributions in real-world scenarios. However, there has been +insufficient focus on the design of neural architectures, despite empirical +evidence suggesting that architecture can significantly impact performance. In +this paper, we attempt to mitigate long-tailed issues through architectural +improvements. To simplify the design process, we utilize Differential +Architecture Search (DARTS) to achieve this goal. Unfortunately, existing DARTS +methods struggle to perform well in long-tailed scenarios. To tackle this +challenge, we introduce Long-Tailed Differential Architecture Search (LTDAS). +Specifically, we conduct extensive experiments to explore architectural +components that demonstrate better performance on long-tailed data and propose +a new search space based on our observations. This ensures that the +architecture obtained through our search process incorporates superior +components. Additionally, we propose replacing the learnable linear classifier +with an Equiangular Tight Frame (ETF) classifier to further enhance our method. +This classifier effectively alleviates the biased search process and prevents +performance collapse. Extensive experimental evaluations demonstrate that our +approach consistently improves upon existing methods from an orthogonal +perspective and achieves state-of-the-art results with simple enhancements. + +
+
+
+
+
+ + ♻ ☆ Understanding LLM Embeddings for Regression + + +
+ With the rise of large language models (LLMs) for flexibly processing +information as strings, a natural application is regression, specifically by +preprocessing string representations into LLM embeddings as downstream features +for metric prediction. In this paper, we provide one of the first comprehensive +investigations into embedding-based regression and demonstrate that LLM +embeddings as features can be better for high-dimensional regression tasks than +using traditional feature engineering. This regression performance can be +explained in part due to LLM embeddings over numeric data inherently preserving +Lipschitz continuity over the feature space. Furthermore, we quantify the +contribution of different model effects, most notably model size and language +understanding, which we find surprisingly do not always improve regression +performance. + +
+
+ comment: 16 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ Dual-Personalizing Adapter for Federated Foundation Models + + +
+ Recently, foundation models, particularly large language models (LLMs), have +demonstrated an impressive ability to adapt to various tasks by fine-tuning +diverse instruction data. Notably, federated foundation models (FedFM) emerge +as a privacy preservation method to fine-tune models collaboratively under +federated learning (FL) settings by leveraging many distributed datasets with +non-IID data. To alleviate communication and computation overhead, +parameter-efficient methods are introduced for efficiency, and some research +adapted personalization methods to FedFM for better user preferences alignment. +However, a critical gap in existing research is the neglect of test-time +distribution shifts in real-world applications, and conventional methods for +test-time distribution shifts in personalized FL are less effective for FedFM +due to their failure to adapt to complex distribution shift scenarios and the +requirement to train all parameters. To bridge this gap, we refine the setting +in FedFM, termed test-time personalization, which aims to learn personalized +federated foundation models on clients while effectively handling test-time +distribution shifts simultaneously. To address challenges in this setting, we +explore a simple yet effective solution, a Federated Dual-Personalizing Adapter +(FedDPA) architecture. By co-working with a foundation model, a global adapter +and a local adapter jointly tackle the test-time distribution shifts and +client-specific personalization. Additionally, we introduce an instance-wise +dynamic weighting mechanism that dynamically integrates the global and local +adapters for each test instance during inference, facilitating effective +test-time personalization. The effectiveness of the proposed method has been +evaluated on benchmark datasets across different NLP tasks. + +
+
+
+
+
+ + ♻ ☆ Self-Adaptive Quantum Kernel Principal Components Analysis for Compact + Readout of Chemiresistive Sensor Arrays + + +
+ The rapid growth of Internet of Things (IoT) devices necessitates efficient +data compression techniques to handle the vast amounts of data generated by +these devices. Chemiresistive sensor arrays (CSAs), a simple-to-fabricate but +crucial component in IoT systems, generate large volumes of data due to their +simultaneous multi-sensor operations. Classical principal component analysis +(cPCA) methods, a common solution to the data compression challenge, face +limitations in preserving critical information during dimensionality reduction. +In this study, we present self-adaptive quantum kernel (SAQK) PCA as a superior +alternative to enhance information retention. Our findings demonstrate that +SAQK PCA outperforms cPCA in various back-end machine-learning tasks, +especially in low-dimensional scenarios where access to quantum bits is +limited. These results highlight the potential of noisy intermediate-scale +quantum (NISQ) computers to revolutionize data processing in real-world IoT +applications by improving the efficiency and reliability of CSA data +compression and readout, despite the current constraints on qubit availability. + +
+
+ comment: Version 2 +
+
+
+
+
+ + ♻ ☆ MLLM-LLaVA-FL: Multimodal Large Language Model Assisted Federated + Learning WACV 2025 + + +
+ Previous studies on federated learning (FL) often encounter performance +degradation due to data heterogeneity among different clients. In light of the +recent advances in multimodal large language models (MLLMs), such as GPT-4v and +LLaVA, which demonstrate their exceptional proficiency in multimodal tasks, +such as image captioning and multimodal question answering. We introduce a +novel federated learning framework, named Multimodal Large Language Model +Assisted Federated Learning (MLLM-LLaVA-FL), which employs powerful MLLMs at +the server end to address the heterogeneous and long-tailed challenges. Owing +to the advanced cross-modality representation capabilities and the extensive +open-vocabulary prior knowledge of MLLMs, our framework is adept at harnessing +the extensive, yet previously underexploited, open-source data accessible from +websites and powerful server-side computational resources. Hence, the +MLLM-LLaVA-FL not only enhances the performance but also avoids increasing the +risk of privacy leakage and the computational burden on local devices, +distinguishing it from prior methodologies. Our framework has three key stages. +Initially, we conduct global visual-text pretraining of the model. This +pretraining is facilitated by utilizing the extensive open-source data +available online, with the assistance of MLLMs. Subsequently, the pretrained +model is distributed among various clients for local training. Finally, once +the locally trained models are transmitted back to the server, a global +alignment is carried out under the supervision of MLLMs to further enhance the +performance. Experimental evaluations on established benchmarks, show that our +framework delivers promising performance in the typical scenarios with data +heterogeneity and long-tail distribution across different clients in FL. + +
+
+ comment: Accepted to WACV 2025 +
+
+
+
+
+ + ♻ ☆ Strongly-polynomial time and validation analysis of policy gradient + methods + + +
+ This paper proposes a novel termination criterion, termed the advantage gap +function, for finite state and action Markov decision processes (MDP) and +reinforcement learning (RL). By incorporating this advantage gap function into +the design of step size rules and deriving a new linear rate of convergence +that is independent of the stationary state distribution of the optimal policy, +we demonstrate that policy gradient methods can solve MDPs in +strongly-polynomial time. To the best of our knowledge, this is the first time +that such strong convergence properties have been established for policy +gradient methods. Moreover, in the stochastic setting, where only stochastic +estimates of policy gradients are available, we show that the advantage gap +function provides close approximations of the optimality gap for each +individual state and exhibits a sublinear rate of convergence at every state. +The advantage gap function can be easily estimated in the stochastic case, and +when coupled with easily computable upper bounds on policy values, they provide +a convenient way to validate the solutions generated by policy gradient +methods. Therefore, our developments offer a principled and computable measure +of optimality for RL, whereas current practice tends to rely on +algorithm-to-algorithm or baselines comparisons with no certificate of +optimality. + +
+
+ comment: Add numerical experiments +
+
+
+
+
+ + ♻ ☆ Depression Detection and Analysis using Large Language Models on Textual + and Audio-Visual Modalities + + +
+ Depression has proven to be a significant public health issue, profoundly +affecting the psychological well-being of individuals. If it remains +undiagnosed, depression can lead to severe health issues, which can manifest +physically and even lead to suicide. Generally, Diagnosing depression or any +other mental disorder involves conducting semi-structured interviews alongside +supplementary questionnaires, including variants of the Patient Health +Questionnaire (PHQ) by Clinicians and mental health professionals. This +approach places significant reliance on the experience and judgment of trained +physicians, making the diagnosis susceptible to personal biases. Given that the +underlying mechanisms causing depression are still being actively researched, +physicians often face challenges in diagnosing and treating the condition, +particularly in its early stages of clinical presentation. Recently, +significant strides have been made in Artificial neural computing to solve +problems involving text, image, and speech in various domains. Our analysis has +aimed to leverage these state-of-the-art (SOTA) models in our experiments to +achieve optimal outcomes leveraging multiple modalities. The experiments were +performed on the Extended Distress Analysis Interview Corpus Wizard of Oz +dataset (E-DAIC) corpus presented in the Audio/Visual Emotion Challenge (AVEC) +2019 Challenge. The proposed solutions demonstrate better results achieved by +Proprietary and Open-source Large Language Models (LLMs), which achieved a Root +Mean Square Error (RMSE) score of 3.98 on Textual Modality, beating the AVEC +2019 challenge baseline results and current SOTA regression analysis +architectures. Additionally, the proposed solution achieved an accuracy of +71.43% in the classification task. The paper also includes a novel audio-visual +multi-modal network that predicts PHQ-8 scores with an RMSE of 6.51. + +
+
+ comment: 12 pages, 9 figures, 9 tables +
+
+
+
+
+ + ♻ ☆ CoMERA: Computing- and Memory-Efficient Training via Rank-Adaptive + Tensor Optimization + + +
+ Training large AI models such as LLMs and DLRMs costs massive GPUs and +computing time. The high training cost has become only affordable to big tech +companies, meanwhile also causing increasing concerns about the environmental +impact. This paper presents CoMERA, a Computing- and Memory-Efficient training +method via Rank-Adaptive tensor optimization. CoMERA achieves rank-adaptive +tensor-compressed (pre)-training via a multi-objective optimization formulation +and improves the training to provide both a high compression ratio and +excellent accuracy in the training process. Our optimized numerical computation +(e.g., optimized tensorized embedding and tensor-network contractions) and GPU +implementation eliminate part of the run-time overhead in the tensorized +training on GPU. This leads to, for the first time, $2-3\times$ speedup per +training epoch compared with standard training. CoMERA also outperforms the +recent GaLore in terms of both memory and computing efficiency. Specifically, +CoMERA is $2\times$ faster per training epoch and $9\times$ more +memory-efficient than GaLore on a tested six-encoder transformer with +single-batch training. Our method also shows $\sim 2\times$ speedup than +standard pre-training on a BERT-like code-generation LLM while achieving +$4.23\times$ compression ratio in pre-training. With further HPC optimization, +CoMERA may reduce the pre-training cost of many other LLMs. An implementation +of CoMERA is available at https://github.com/ziyangjoy/CoMERA. + +
+
+ comment: Accepted by Neurips 2024 +
+
+
+
+
+ + ♻ ☆ Query-Guided Self-Supervised Summarization of Nursing Notes + + +
+ Nursing notes, an important part of Electronic Health Records (EHRs), track a +patient's health during a care episode. Summarizing key information in nursing +notes can help clinicians quickly understand patients' conditions. However, +existing summarization methods in the clinical setting, especially abstractive +methods, have overlooked nursing notes and require reference summaries for +training. We introduce QGSumm, a novel query-guided self-supervised domain +adaptation approach for abstractive nursing note summarization. The method uses +patient-related clinical queries for guidance, and hence does not need +reference summaries for training. Through automatic experiments and manual +evaluation by an expert clinician, we study our approach and other +state-of-the-art Large Language Models (LLMs) for nursing note summarization. +Our experiments show: 1) GPT-4 is competitive in maintaining information in the +original nursing notes, 2) QGSumm can generate high-quality summaries with a +good balance between recall of the original content and hallucination rate +lower than other top methods. Ultimately, our work offers a new perspective on +conditional text summarization, tailored to clinical applications. + +
+
+
+
+
+ + ♻ ☆ Enhancing the automatic segmentation and analysis of 3D liver + vasculature models + + +
+ Surgical assessment of liver cancer patients requires identification of the
+vessel trees from medical images. Specifically, the venous trees - the portal
+(perfusing) and the hepatic (draining) trees are important for understanding
+the liver anatomy and disease state, and perform surgery planning. This
+research aims to improve the 3D segmentation, skeletonization, and subsequent
+analysis of vessel trees, by creating an automatic pipeline based on deep
+learning and image processing techniques.
+ The first part of this work explores the impact of differentiable
+skeletonization methods such as ClDice and morphological skeletonization loss,
+on the overall liver vessel segmentation performance. To this aim, it studies
+how to improve vessel tree connectivity.
+ The second part of this study converts a single class vessel segmentation
+into multi-class ones, separating the two venous trees. It builds on the
+previous two-class vessel segmentation model, whose vessel tree outputs might
+be entangled, and on connected components and skeleton analyses of the trees.
+ After providing sub-labeling of the specific anatomical branches of each
+venous tree, these algorithms also enable a morphometric analysis of the vessel
+trees by extracting various geometrical markers.
+ In conclusion, we propose a method that successfully improves current
+skeletonization methods, for extensive vascular trees that contain vessels of
+different calibers. The separation algorithm creates a clean multi-class
+segmentation of the vessels, validated by surgeons to provide low error. A new,
+publicly shared high-quality liver vessel dataset of 77 cases is thus created.
+Finally a method to annotate vessel trees according to anatomy is provided,
+enabling a unique liver vessel morphometry analysis.
+
+
+ comment: Internship at Simbiotx +
+
+
+
+
+ + ♻ ☆ MVBoost: Boost 3D Reconstruction with Multi-View Refinement + + +
+ Recent advancements in 3D object reconstruction have been remarkable, yet +most current 3D models rely heavily on existing 3D datasets. The scarcity of +diverse 3D datasets results in limited generalization capabilities of 3D +reconstruction models. In this paper, we propose a novel framework for boosting +3D reconstruction with multi-view refinement (MVBoost) by generating pseudo-GT +data. The key of MVBoost is combining the advantages of the high accuracy of +the multi-view generation model and the consistency of the 3D reconstruction +model to create a reliable data source. Specifically, given a single-view input +image, we employ a multi-view diffusion model to generate multiple views, +followed by a large 3D reconstruction model to produce consistent 3D data. +MVBoost then adaptively refines these multi-view images, rendered from the +consistent 3D data, to build a large-scale multi-view dataset for training a +feed-forward 3D reconstruction model. Additionally, the input view optimization +is designed to optimize the corresponding viewpoints based on the user's input +image, ensuring that the most important viewpoint is accurately tailored to the +user's needs. Extensive evaluations demonstrate that our method achieves +superior reconstruction results and robust generalization compared to prior +works. + +
+
+
+
+
+ + ♻ ☆ LUMIA: Linear probing for Unimodal and MultiModal Membership Inference + Attacks leveraging internal LLM states + + +
+ Large Language Models (LLMs) are increasingly used in a variety of +applications, but concerns around membership inference have grown in parallel. +Previous efforts focus on black-to-grey-box models, thus neglecting the +potential benefit from internal LLM information. To address this, we propose +the use of Linear Probes (LPs) as a method to detect Membership Inference +Attacks (MIAs) by examining internal activations of LLMs. Our approach, dubbed +LUMIA, applies LPs layer-by-layer to get fine-grained data on the model inner +workings. We test this method across several model architectures, sizes and +datasets, including unimodal and multimodal tasks. In unimodal MIA, LUMIA +achieves an average gain of 15.71 % in Area Under the Curve (AUC) over previous +techniques. Remarkably, LUMIA reaches AUC>60% in 65.33% of cases -- an +increment of 46.80% against the state of the art. Furthermore, our approach +reveals key insights, such as the model layers where MIAs are most detectable. +In multimodal models, LPs indicate that visual inputs can significantly +contribute to detect MIAs -- AUC>60% is reached in 85.90% of experiments. + +
+
+
+
+
+ + ♻ ☆ Recent Advances in Attack and Defense Approaches of Large Language + Models + + +
+ Large Language Models (LLMs) have revolutionized artificial intelligence and +machine learning through their advanced text processing and generating +capabilities. However, their widespread deployment has raised significant +safety and reliability concerns. Established vulnerabilities in deep neural +networks, coupled with emerging threat models, may compromise security +evaluations and create a false sense of security. Given the extensive research +in the field of LLM security, we believe that summarizing the current state of +affairs will help the research community better understand the present +landscape and inform future developments. This paper reviews current research +on LLM vulnerabilities and threats, and evaluates the effectiveness of +contemporary defense mechanisms. We analyze recent studies on attack vectors +and model weaknesses, providing insights into attack mechanisms and the +evolving threat landscape. We also examine current defense strategies, +highlighting their strengths and limitations. By contrasting advancements in +attack and defense methodologies, we identify research gaps and propose future +directions to enhance LLM security. Our goal is to advance the understanding of +LLM safety challenges and guide the development of more robust security +measures. + +
+
+
+
+
+ + ♻ ☆ PAR: Prompt-Aware Token Reduction Method for Efficient Large Multimodal + Models + + +
+ Multimodal large language models (MLLMs) demonstrate strong performance
+across visual tasks, but their efficiency is hindered by significant
+computational and memory demands from processing long contexts in multimodal
+inputs. To address this, we introduce PAR (Prompt-Aware Token Reduction), a
+novel and plug-and-play approach that reduces visual tokens efficiently without
+compromising model performance. Unlike previous methods that rely heavily on
+attention mechanisms and overlook cross-modal interactions, we use a
+prompt-aware strategy to adaptively identify and cluster essential visual tokens.
+PAR categorizes visual context redundancy into two types: external and
+internal. External redundancy is minimized through semantic retrieval, while
+internal redundancy is addressed using a token routing mechanism. This method
+substantially reduces computational load without requiring additional training
+or complex architectural modifications. \textbf{Experimental results
+demonstrate that across various visual question answering tasks, PAR reduces
+FLOPs by 83\% with a compression ratio of 89\%, while retaining 97\% of
+baseline accuracy.} The adaptive design of PAR achieves a 2x token reduction
+ratio compared to prior approaches, enabling a better balance between
+performance and efficiency.
+
+
+ comment: 10 pages, 5 figures,3 tables +
+
+
+
+
+ + ♻ ☆ Knowledge Entropy Decay during Language Model Pretraining Hinders New + Knowledge Acquisition + + +
+ In this work, we investigate how a model's tendency to broadly integrate its +parametric knowledge evolves throughout pretraining, and how this behavior +affects overall performance, particularly in terms of knowledge acquisition and +forgetting. We introduce the concept of knowledge entropy, which quantifies the +range of memory sources the model engages with; high knowledge entropy +indicates that the model utilizes a wide range of memory sources, while low +knowledge entropy suggests reliance on specific sources with greater certainty. +Our analysis reveals a consistent decline in knowledge entropy as pretraining +advances. We also find that the decline is closely associated with a reduction +in the model's ability to acquire and retain knowledge, leading us to conclude +that diminishing knowledge entropy (smaller number of active memory sources) +impairs the model's knowledge acquisition and retention capabilities. We find +further support for this by demonstrating that increasing the activity of +inactive memory sources enhances the model's capacity for knowledge acquisition +and retention. + +
+
+
+
+
+ + ♻ ☆ Visual Cue Enhancement and Dual Low-Rank Adaptation for Efficient Visual + Instruction Fine-Tuning + + +
+ Parameter-efficient fine-tuning multimodal large language models (MLLMs) +presents significant challenges, including reliance on high-level visual +features that limit fine-grained detail comprehension, and data conflicts that +arise from task complexity. To address these issues, we propose an efficient +fine-tuning framework with two novel approaches: Vision Cue Enhancement (VCE) +and Dual Low-Rank Adaptation (Dual-LoRA). VCE enhances the vision projector by +integrating multi-level visual cues, improving the model's ability to capture +fine-grained visual features. Dual-LoRA introduces a dual low-rank structure +for instruction tuning, decoupling learning into skill and task spaces to +enable precise control and efficient adaptation across diverse tasks. Our +method simplifies implementation, enhances visual comprehension, and improves +adaptability. Experiments on both downstream tasks and general benchmarks +demonstrate the effectiveness of our proposed approach. + +
+
+
+
+
+ + ♻ ☆ From Pixels to Insights: A Survey on Automatic Chart Understanding in + the Era of Large Foundation Models + + +
+ Data visualization in the form of charts plays a pivotal role in data +analysis, offering critical insights and aiding in informed decision-making. +Automatic chart understanding has witnessed significant advancements with the +rise of large foundation models in recent years. Foundation models, such as +large language models, have revolutionized various natural language processing +tasks and are increasingly being applied to chart understanding tasks. This +survey paper provides a comprehensive overview of the recent developments, +challenges, and future directions in chart understanding within the context of +these foundation models. We review fundamental building blocks crucial for +studying chart understanding tasks. Additionally, we explore various tasks and +their evaluation metrics and sources of both charts and textual inputs. Various +modeling strategies are then examined, encompassing both classification-based +and generation-based approaches, along with tool augmentation techniques that +enhance chart understanding performance. Furthermore, we discuss the +state-of-the-art performance of each task and discuss how we can improve the +performance. Challenges and future directions are addressed, highlighting the +importance of several topics, such as domain-specific charts, lack of efforts +in developing evaluation metrics, and agent-oriented settings. This survey +paper serves as a comprehensive resource for researchers and practitioners in +the fields of natural language processing, computer vision, and data analysis, +providing valuable insights and directions for future research in chart +understanding leveraging large foundation models. The studies mentioned in this +paper, along with emerging new research, will be continually updated at: +https://github.com/khuangaf/Awesome-Chart-Understanding. + +
+
+ comment: IEEE Transactions on Knowledge and Data Engineering (TKDE) +
+
+
+
+
+ + ♻ ☆ Stock Movement Prediction with Multimodal Stable Fusion via Gated + Cross-Attention Mechanism + + +
+ The accurate prediction of stock movements is crucial for investment +strategies. Stock prices are subject to the influence of various forms of +information, including financial indicators, sentiment analysis, news +documents, and relational structures. Predominant analytical approaches, +however, tend to address only unimodal or bimodal sources, neglecting the +complexity of multimodal data. Further complicating the landscape are the +issues of data sparsity and semantic conflicts between these modalities, which +are frequently overlooked by current models, leading to unstable performance +and limiting practical applicability. To address these shortcomings, this study +introduces a novel architecture, named Multimodal Stable Fusion with Gated +Cross-Attention (MSGCA), designed to robustly integrate multimodal input for +stock movement prediction. The MSGCA framework consists of three integral +components: (1) a trimodal encoding module, responsible for processing +indicator sequences, dynamic documents, and a relational graph, and +standardizing their feature representations; (2) a cross-feature fusion module, +where primary and consistent features guide the multimodal fusion of the three +modalities via a pair of gated cross-attention networks; and (3) a prediction +module, which refines the fused features through temporal and dimensional +reduction to execute precise movement forecasting. Empirical evaluations +demonstrate that the MSGCA framework exceeds current leading methods, achieving +performance gains of 8.1%, 6.1%, 21.7% and 31.6% on four multimodal datasets, +respectively, attributed to its enhanced multimodal fusion stability. + +
+
+ comment: 14 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Unveiling and Mitigating Bias in Large Language Model Recommendations: A + Path to Fairness + + +
+ Large Language Model (LLM)-based recommendation systems excel in delivering
+comprehensive suggestions by deeply analyzing content and
+user behavior. However, they often inherit biases from skewed training data,
+favoring mainstream content while underrepresenting diverse or non-traditional
+options. This study explores the interplay between bias and LLM-based
+recommendation systems, focusing on music, song, and book recommendations
+across diverse demographic and cultural groups. This paper analyzes bias in
+LLM-based recommendation systems across multiple models (GPT, LLaMA, and
+Gemini), revealing its deep and pervasive impact on outcomes. Intersecting
+identities and contextual factors, like socioeconomic status, further amplify
+biases, complicating fair recommendations across diverse groups. Our findings
+reveal that bias in these systems is deeply ingrained, yet even simple
+interventions like prompt engineering can significantly reduce it. We further
+propose a retrieval-augmented generation strategy to mitigate bias more
+effectively. Numerical experiments validate these strategies, demonstrating
+both the pervasive nature of bias and the impact of the proposed solutions.
+
+</p>
+
+
+
+
+ + ♻ ☆ CantorNet: A Sandbox for Testing Geometrical and Topological Complexity + Measures NeurIPS + + +
+ Many natural phenomena are characterized by self-similarity, for example the
+symmetry of human faces, or a repetitive motif of a song. Studying such
+symmetries will allow us to gain deeper insights into the underlying mechanisms
+of complex systems. Recognizing the importance of understanding these patterns,
+we propose a geometrically inspired framework to study such phenomena in
+artificial neural networks. To this end, we introduce \emph{CantorNet},
+inspired by the triadic construction of the Cantor set, which was introduced by
+Georg Cantor in the $19^\text{th}$ century. In mathematics, the Cantor set is a
+set of points lying on a single line that is self-similar and has a counter
+intuitive property of being an uncountably infinite null set. Similarly, we
+introduce CantorNet as a sandbox for studying self-similarity by means of novel
+topological and geometrical complexity measures. CantorNet constitutes a family
+of ReLU neural networks that spans the whole spectrum of possible Kolmogorov
+complexities, including the two opposite descriptions (linear and exponential
+as measured by the description length). CantorNet's decision boundaries can be
+arbitrarily ragged, yet are analytically known. Besides serving as a testing
+ground for complexity measures, our work may serve to illustrate potential
+pitfalls in geometry-ignorant data augmentation techniques and adversarial
+attacks.
+
+</p>
+
+ comment: Accepted at the NeurIPS Workshop on Symmetry and Geometry in Neural + Representations, 2024 +
+
+
+
+
+ + ♻ ☆ Critical Tokens Matter: Token-Level Contrastive Estimation Enhances + LLM's Reasoning Capability + + +
+ Large Language Models (LLMs) have exhibited remarkable performance on
+reasoning tasks. They utilize autoregressive token generation to construct
+reasoning trajectories, enabling the development of a coherent chain of
+thought. In this work, we explore the impact of individual tokens on the final
+outcomes of reasoning tasks. We identify the existence of ``critical tokens''
+that lead to incorrect reasoning trajectories in LLMs. Specifically, we find
+that LLMs tend to produce positive outcomes when forced to decode other tokens
+instead of critical tokens. Motivated by this observation, we propose a novel
+approach - cDPO - designed to automatically recognize and conduct token-level
+rewards for the critical tokens during the alignment process. Specifically, we
+develop a contrastive estimation approach to automatically identify critical
+tokens. It is achieved by comparing the generation likelihood of positive and
+negative models. To achieve this, we separately fine-tune the positive and
+negative models on various reasoning trajectories, consequently, they are
+capable of identifying critical tokens within incorrect trajectories
+that contribute to erroneous outcomes. Moreover, to further align the model
+with the critical token information during the alignment process, we extend the
+conventional DPO algorithms to token-level DPO and utilize the differential
+likelihood from the aforementioned positive and negative model as important
+weight for token-level DPO learning. Experimental results on GSM8K and MATH500
+benchmarks with two widely used models Llama-3 (8B and 70B) and deepseek-math
+(7B) demonstrate the effectiveness of the proposed approach cDPO.
+
+</p>
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ NoisyNN: Exploring the Impact of Information Entropy Change in Learning + Systems + + +
+ We investigate the impact of entropy change in deep learning systems by noise +injection at different levels, including the embedding space and the image. The +series of models that employ our methodology are collectively known as Noisy +Neural Networks (NoisyNN), with examples such as NoisyViT and NoisyCNN. Noise +is conventionally viewed as a harmful perturbation in various deep learning +architectures, such as convolutional neural networks (CNNs) and vision +transformers (ViTs), as well as different learning tasks like image +classification and transfer learning. However, this work shows noise can be an +effective way to change the entropy of the learning system. We demonstrate that +specific noise can boost the performance of various deep models under certain +conditions. We theoretically prove the enhancement gained from positive noise +by reducing the task complexity defined by information entropy and +experimentally show the significant performance gain in large image datasets, +such as the ImageNet. Herein, we use the information entropy to define the +complexity of the task. We categorize the noise into two types, positive noise +(PN) and harmful noise (HN), based on whether the noise can help reduce the +task complexity. Extensive experiments of CNNs and ViTs have shown performance +improvements by proactively injecting positive noise, where we achieved an +unprecedented top 1 accuracy of 95$\%$ on ImageNet. Both theoretical analysis +and empirical evidence have confirmed that the presence of positive noise, can +benefit the learning process, while the traditionally perceived harmful noise +indeed impairs deep learning models. The different roles of noise offer new +explanations for deep models on specific tasks and provide a new paradigm for +improving model performance. Moreover, it reminds us that we can influence the +performance of learning systems via information entropy change. + +
+
+ comment: Task Entropy, NoisyViT, NoisyCNN +
+
+
+
+
+ + ♻ ☆ Towards Understanding Domain Adapted Sentence Embeddings for Document + Retrieval + + +
+ A plethora of sentence embedding models makes it challenging to choose one, +especially for technical domains rich with specialized vocabulary. In this +work, we domain adapt embeddings using telecom, health and science datasets for +question answering. We evaluate embeddings obtained from publicly available +models and their domain-adapted variants, on both point retrieval accuracies, +as well as their (95\%) confidence intervals. We establish a systematic method +to obtain thresholds for similarity scores for different embeddings. As +expected, we observe that fine-tuning improves mean bootstrapped accuracies. We +also observe that it results in tighter confidence intervals, which further +improve when pre-training is preceded by fine-tuning. We introduce metrics +which measure the distributional overlaps of top-$K$, correct and random +document similarities with the question. Further, we show that these metrics +are correlated with retrieval accuracy and similarity thresholds. Recent +literature shows conflicting effects of isotropy on retrieval accuracies. Our +experiments establish that the isotropy of embeddings (as measured by two +independent state-of-the-art isotropy metric definitions) is poorly correlated +with retrieval performance. We show that embeddings for domain-specific +sentences have little overlap with those for domain-agnostic ones, and +fine-tuning moves them further apart. Based on our results, we provide +recommendations for use of our methodology and metrics by researchers and +practitioners. + +
+
+
+
+
+ + ♻ GameGen-X: Interactive Open-world Game Video Generation + + +
+ We introduce GameGen-X, the first diffusion transformer model specifically +designed for both generating and interactively controlling open-world game +videos. This model facilitates high-quality, open-domain generation by +simulating an extensive array of game engine features, such as innovative +characters, dynamic environments, complex actions, and diverse events. +Additionally, it provides interactive controllability, predicting and altering +future content based on the current clip, thus allowing for gameplay +simulation. To realize this vision, we first collected and built an Open-World +Video Game Dataset from scratch. It is the first and largest dataset for +open-world game video generation and control, which comprises over a million +diverse gameplay video clips sampling from over 150 games with informative +captions from GPT-4o. GameGen-X undergoes a two-stage training process, +consisting of foundation model pre-training and instruction tuning. Firstly, +the model was pre-trained via text-to-video generation and video continuation, +endowing it with the capability for long-sequence, high-quality open-domain +game video generation. Further, to achieve interactive controllability, we +designed InstructNet to incorporate game-related multi-modal control signal +experts. This allows the model to adjust latent representations based on user +inputs, unifying character interaction and scene content control for the first +time in video generation. During instruction tuning, only the InstructNet is +updated while the pre-trained foundation model is frozen, enabling the +integration of interactive controllability without loss of diversity and +quality of generated video content. + +
+
+ comment: Homepage: https://gamegen-x.github.io/ Github: + https://github.com/GameGen-X/GameGen-X +
+
+
+
+
+ + ♻ ☆ Atari-GPT: Benchmarking Multimodal Large Language Models as Low-Level + Policies in Atari Games + + +
+ Recent advancements in large language models (LLMs) have expanded their +capabilities beyond traditional text-based tasks to multimodal domains, +integrating visual, auditory, and textual data. While multimodal LLMs have been +extensively explored for high-level planning in domains like robotics and +games, their potential as low-level controllers remains largely untapped. In +this paper, we introduce a novel benchmark aimed at testing the emergent +capabilities of multimodal LLMs as low-level policies in Atari games. Unlike +traditional reinforcement learning (RL) methods that require training for each +new environment and reward function specification, these LLMs utilize +pre-existing multimodal knowledge to directly engage with game environments. +Our study assesses the performances of multiple multimodal LLMs against +traditional RL agents, human players, and random agents, focusing on their +ability to understand and interact with complex visual scenes and formulate +strategic responses. Our results show that these multimodal LLMs are not yet +capable of being zero-shot low-level policies. Furthermore, we see that this +is, in part, due to their visual and spatial reasoning. Additional results and +videos are available on our project webpage: +https://dev1nw.github.io/atari-gpt/. + +
+
+ comment: Currently under review +
+
+
+
+
+ + ♻ ☆ DisCoRD: Discrete Tokens to Continuous Motion via Rectified Flow + Decoding + + +
+ Human motion, inherently continuous and dynamic, presents significant +challenges for generative models. Despite their dominance, discrete +quantization methods, such as VQ-VAEs, suffer from inherent limitations, +including restricted expressiveness and frame-wise noise artifacts. Continuous +approaches, while producing smoother and more natural motions, often falter due +to high-dimensional complexity and limited training data. To resolve this +"discord" between discrete and continuous representations, we introduce +DisCoRD: Discrete Tokens to Continuous Motion via Rectified Flow Decoding, a +novel method that decodes discrete motion tokens into continuous motion through +rectified flow. By employing an iterative refinement process in the continuous +space, DisCoRD captures fine-grained dynamics and ensures smoother and more +natural motions. Compatible with any discrete-based framework, our method +enhances naturalness without compromising faithfulness to the conditioning +signals. Extensive evaluations demonstrate that DisCoRD achieves +state-of-the-art performance, with FID of 0.032 on HumanML3D and 0.169 on +KIT-ML. These results solidify DisCoRD as a robust solution for bridging the +divide between discrete efficiency and continuous realism. Our project page is +available at: https://whwjdqls.github.io/discord.github.io/. + +
+
+ comment: 20 pages 18 figures +
+
+
+
+
+ + ♻ ☆ Predicting and Enhancing the Fairness of DNNs with the Curvature of + Perceptual Manifolds CVPR 2023 + + +
+ To address the challenges of long-tailed classification, researchers have
+proposed several approaches to reduce model bias, most of which assume that
+classes with few samples are weak classes. However, recent studies have shown
+that tail classes are not always hard to learn, and model bias has been
+observed on sample-balanced datasets, suggesting the existence of other factors
+that affect model bias. In this work, we first establish a geometric
+perspective for analyzing model fairness and then systematically propose a
+series of geometric measurements for perceptual manifolds in deep neural
+networks. Subsequently, we comprehensively explore the effect of the geometric
+characteristics of perceptual manifolds on classification difficulty and how
+learning shapes the geometric characteristics of perceptual manifolds. An
+unanticipated finding is that the correlation between the class accuracy and
+the separation degree of perceptual manifolds gradually decreases during
+training, while the negative correlation with the curvature gradually
+increases, implying that curvature imbalance leads to model bias. Building upon
+these observations, we propose curvature regularization to facilitate the model
+to learn curvature-balanced and flatter perceptual manifolds. Evaluations on
+multiple long-tailed and non-long-tailed datasets show the excellent
+performance and exciting generality of our approach, especially in achieving
+significant performance improvements based on current state-of-the-art
+techniques. Our work opens up a geometric analysis perspective on model bias
+and reminds researchers to pay attention to model bias on non-long-tailed and
+even sample-balanced datasets.
+
+</p>
+
+ comment: 17pages, Accepted by CVPR 2023, Submitted to TPAMI +
+
+
+
+
+ + ♻ ☆ Language Models Benefit from Preparation with Elicited Knowledge + + +
+ The zero-shot chain of thought (CoT) approach is often used in question +answering (QA) by language models (LMs) for tasks that require multiple +reasoning steps. However, some QA tasks hinge more on accessing relevant +knowledge than on chaining reasoning steps. We introduce a simple prompting +technique, called PREP, that involves using two instances of LMs: the first +(LM1) generates relevant information, and the second (LM2) receives the +information from the user and answers the question. This design is intended to +make better use of the LM's instruction-following capability. PREP is +applicable across various QA tasks without domain-specific prompt engineering. +PREP is developed on a dataset of 100 QA questions, derived from an extensive +schematic dataset specifying artifact parts and material composition. These +questions ask which of two artifacts is less likely to share materials with +another artifact. Such questions probe the LM's knowledge of shared materials +in the part structure of different artifacts. We test our method on our +parts-and-materials dataset and three published commonsense reasoning datasets. +The average accuracy of our method is consistently higher than that of all the +other tested methods across all the tested datasets. + +
+
+
+
+
+ + ♻ ☆ Can LLMs plan paths in the real world? + + +
+ As large language models (LLMs) increasingly integrate into vehicle +navigation systems, understanding their path-planning capability is crucial. We +tested three LLMs through six real-world path-planning scenarios in various +settings and with various difficulties. Our experiments showed that all LLMs +made numerous errors in all scenarios, revealing that they are unreliable path +planners. We suggest that future work focus on implementing mechanisms for +reality checks, enhancing model transparency, and developing smaller models. + +
+
+
+
+
+ + ♻ ☆ Evaluating LLMs for Hardware Design and Test + + +
+ Large Language Models (LLMs) have demonstrated capabilities for producing +code in Hardware Description Languages (HDLs). However, most of the focus +remains on their abilities to write functional code, not test code. The +hardware design process consists of both design and test, and so eschewing +validation and verification leaves considerable potential benefit unexplored, +given that a design and test framework may allow for progress towards full +automation of the digital design pipeline. In this work, we perform one of the +first studies exploring how a LLM can both design and test hardware modules +from provided specifications. Using a suite of 8 representative benchmarks, we +examined the capabilities and limitations of the state-of-the-art +conversational LLMs when producing Verilog for functional and verification +purposes. We taped out the benchmarks on a Skywater 130nm shuttle and received +the functional chip. + +
+
+
+
+
+ + ♻ ☆ Deep Dynamics: Vehicle Dynamics Modeling with a Physics-Constrained + Neural Network for Autonomous Racing + + +
+ Autonomous racing is a critical research area for autonomous driving, +presenting significant challenges in vehicle dynamics modeling, such as +balancing model precision and computational efficiency at high speeds +(>280km/h), where minor errors in modeling have severe consequences. Existing +physics-based models for vehicle dynamics require elaborate testing setups and +tuning, which are hard to implement, time-intensive, and cost-prohibitive. +Conversely, purely data-driven approaches do not generalize well and cannot +adequately ensure physical constraints on predictions. This paper introduces +Deep Dynamics, a physics-constrained neural network (PCNN) for vehicle dynamics +modeling of an autonomous racecar. It combines physics coefficient estimation +and dynamical equations to accurately predict vehicle states at high speeds and +includes a unique Physics Guard layer to ensure internal coefficient estimates +remain within their nominal physical ranges. Open-loop and closed-loop +performance assessments, using a physics-based simulator and full-scale +autonomous Indy racecar data, highlight Deep Dynamics as a promising approach +for modeling racecar vehicle dynamics. + +
+
+ comment: Published in the IEEE Robotics and Automation Letters and presented + at the IEEE International Conference on Intelligent Robots and Systems +
+
+
+
+
+ + ♻ ☆ An Information Theoretic Approach to Machine Unlearning + + +
+ To comply with AI and data regulations, the need to forget private or +copyrighted information from trained machine learning models is increasingly +important. The key challenge in unlearning is forgetting the necessary data in +a timely manner, while preserving model performance. In this work, we address +the zero-shot unlearning scenario, whereby an unlearning algorithm must be able +to remove data given only a trained model and the data to be forgotten. We +explore unlearning from an information theoretic perspective, connecting the +influence of a sample to the information gain a model receives by observing it. +From this, we derive a simple but principled zero-shot unlearning method based +on the geometry of the model. Our approach takes the form of minimising the +gradient of a learned function with respect to a small neighbourhood around a +target forget point. This induces a smoothing effect, causing forgetting by +moving the boundary of the classifier. We explore the intuition behind why this +approach can jointly unlearn forget samples while preserving general model +performance through a series of low-dimensional experiments. We perform +extensive empirical evaluation of our method over a range of contemporary +benchmarks, verifying that our method is competitive with state-of-the-art +performance under the strict constraints of zero-shot unlearning. Code for the +project can be found at +https://github.com/jwf40/Information-Theoretic-Unlearning + +
+
+ comment: Updated, new low-dimensional experiments and updated perspective on + unlearning from an information theoretic view +
+
+
+
+
+
+
+
+ + Machine Learning 97 + +
+
+
+ + ♻ ☆ Compute-Constrained Data Selection + + +
+ Data selection can reduce the amount of training data needed to finetune +LLMs; however, the efficacy of data selection scales directly with its compute. +Motivated by the practical challenge of compute-constrained finetuning, we +consider the setting in which both the cost of selecting data and training are +budgeted for. We first formalize the problem of data selection with a +cost-aware utility function, and model the data selection problem as trading +off initial-selection cost for training gain. We run a comprehensive sweep of +experiments across multiple tasks, varying compute budget by scaling finetuning +tokens, model sizes, and data selection compute. Interestingly we find that +many powerful data selection methods are almost never compute-optimal, and that +cheaper data selection alternatives dominate both from a theoretical and +empirical perspective. For compute-optimal training, we find that perplexity +and gradient data selection require training-to-selection model size ratios of +5x and 10x, respectively. + +
+
+
+
+
+
+ ♻ ☆ A Note on Doubly Robust Estimator in Regression Discontinuity
+ Designs
+
+
+</h2>
+ This note introduces a doubly robust (DR) estimator for regression +discontinuity (RD) designs. RD designs provide a quasi-experimental framework +for estimating treatment effects, where treatment assignment depends on whether +a running variable surpasses a predefined cutoff. A common approach in RD +estimation is the use of nonparametric regression methods, such as local linear +regression. However, the validity of these methods still relies on the +consistency of the nonparametric estimators. In this study, we propose the +DR-RD estimator, which combines two distinct estimators for the conditional +expected outcomes. The primary advantage of the DR-RD estimator lies in its +ability to ensure the consistency of the treatment effect estimation as long as +at least one of the two estimators is consistent. Consequently, our DR-RD +estimator enhances robustness of treatment effect estimators in RD designs. + +
+
+ comment: There is a critical error in the previous submission. We have revised + the original claim and present a weakened result +
+
+
+
+
+ + ♻ ☆ Inference Scaling fLaws: The Limits of LLM Resampling with Imperfect + Verifiers + + +
+ Recent research has generated hope that inference scaling could allow weaker +language models to match or exceed the accuracy of stronger models, such as by +repeatedly sampling solutions to a coding problem until it passes unit tests. +The central thesis of this paper is that there is no free lunch for inference +scaling: indefinite accuracy improvement through resampling can only be +realized if the "verifier" (in this case, a set of unit tests) is perfect. When +the verifier is imperfect, as it almost always is in domains such as reasoning +or coding (for example, unit tests have imperfect coverage), there is a nonzero +probability of false positives: incorrect solutions that pass the verifier. +Resampling cannot decrease this probability, so it imposes an upper bound to +the accuracy of resampling-based inference scaling even with an infinite +compute budget. We find that there is a very strong correlation between the +model's single-sample accuracy (i.e. accuracy without unit tests) and its false +positive rate on coding benchmarks HumanEval and MBPP, whose unit tests have +limited coverage. Therefore, no amount of inference scaling of weaker models +can enable them to match the single-sample accuracy of a sufficiently strong +model (Fig. 1a). When we consider that false positives have a negative utility +compared to abstaining from producing a solution, it bends the inference +scaling curve further downward. Empirically, we find that the optimal number of +samples can be less than 10 under realistic assumptions (Fig. 1b). Finally, we +show that beyond accuracy, false positives may have other undesirable +qualities, such as poor adherence to coding style conventions. + +
+
+
+
+
+ + ♻ ☆ Topology-Based Reconstruction Prevention for Decentralised Learning + + +
+ Decentralised learning has recently gained traction as an alternative to +federated learning in which both data and coordination are distributed. To +preserve the confidentiality of users' data, decentralised learning relies on +differential privacy, multi-party computation, or both. However, running +multiple privacy-preserving summations in sequence may allow adversaries to +perform reconstruction attacks. Current reconstruction countermeasures either +cannot trivially be adapted to the distributed setting, or add excessive +amounts of noise. + In this work, we first show that passive honest-but-curious adversaries can +infer other users' private data after several privacy-preserving summations. +For example, in subgraphs with 18 users, we show that only three passive +honest-but-curious adversaries succeed at reconstructing private data 11.0% of +the time, requiring an average of 8.8 summations per adversary. The success +rate depends only on the adversaries' direct neighbourhood, and is independent +of the size of the full network. We consider weak adversaries that do not +control the graph topology, cannot exploit the summation's inner workings, and +do not have auxiliary knowledge; and show that these adversaries can still +infer private data. + We analyse how reconstruction relates to topology and propose the first +topology-based decentralised defence against reconstruction attacks. We show +that reconstruction requires a number of adversaries linear in the length of +the network's shortest cycle. Consequently, exact attacks over +privacy-preserving summations are impossible in acyclic networks. + Our work is a stepping stone for a formal theory of topology-based +decentralised reconstruction defences. Such a theory would generalise our +countermeasure beyond summation, define confidentiality in terms of entropy, +and describe the interactions with (topology-aware) differential privacy. + +
+
+ comment: 14 pages, 19 figures, for associated experiment source code see + doi:10.4121/21572601.v2 +
+
+
+
+
+ + ♻ ☆ Dynamic Estimation of Learning Rates Using a Non-Linear Autoregressive + Model + + +
+ We introduce a new class of adaptive non-linear autoregressive (Nlar) models +incorporating the concept of momentum, which dynamically estimate both the +learning rates and momentum as the number of iterations increases. In our +method, the growth of the gradients is controlled using a scaling (clipping) +function, leading to stable convergence. Within this framework, we propose +three distinct estimators for learning rates and provide theoretical proof of +their convergence. We further demonstrate how these estimators underpin the +development of effective Nlar optimizers. The performance of the proposed +estimators and optimizers is rigorously evaluated through extensive experiments +across several datasets and a reinforcement learning environment. The results +highlight two key features of the Nlar optimizers: robust convergence despite +variations in underlying parameters, including large initial learning rates, +and strong adaptability with rapid convergence during the initial epochs. + +
+
+ comment: Typos corrected +
+
+
+
+
+ + ♻ ☆ CREW: Facilitating Human-AI Teaming Research + + +
+ With the increasing deployment of artificial intelligence (AI) technologies, +the potential of humans working with AI agents has been growing at a great +speed. Human-AI teaming is an important paradigm for studying various aspects +when humans and AI agents work together. The unique aspect of Human-AI teaming +research is the need to jointly study humans and AI agents, demanding +multidisciplinary research efforts from machine learning to human-computer +interaction, robotics, cognitive science, neuroscience, psychology, social +science, and complex systems. However, existing platforms for Human-AI teaming +research are limited, often supporting oversimplified scenarios and a single +task, or specifically focusing on either human-teaming research or multi-agent +AI algorithms. We introduce CREW, a platform to facilitate Human-AI teaming +research in real-time decision-making scenarios and engage collaborations from +multiple scientific disciplines, with a strong emphasis on human involvement. +It includes pre-built tasks for cognitive studies and Human-AI teaming with +expandable potentials from our modular design. Following conventional cognitive +neuroscience research, CREW also supports multimodal human physiological signal +recording for behavior analysis. Moreover, CREW benchmarks real-time +human-guided reinforcement learning agents using state-of-the-art algorithms +and well-tuned baselines. With CREW, we were able to conduct 50 human subject +studies within a week to verify the effectiveness of our benchmark. + +
+
+ comment: Our project website is at: http://generalroboticslab.com/CREW +
+
+
+
+
+ + ♻ ☆ Two Tales of Single-Phase Contrastive Hebbian Learning ICML 2024 + + +
+ The search for ``biologically plausible'' learning algorithms has converged +on the idea of representing gradients as activity differences. However, most +approaches require a high degree of synchronization (distinct phases during +learning) and introduce substantial computational overhead, which raises doubts +regarding their biological plausibility as well as their potential utility for +neuromorphic computing. Furthermore, they commonly rely on applying +infinitesimal perturbations (nudges) to output units, which is impractical in +noisy environments. Recently it has been shown that by modelling artificial +neurons as dyads with two oppositely nudged compartments, it is possible for a +fully local learning algorithm named ``dual propagation'' to bridge the +performance gap to backpropagation, without requiring separate learning phases +or infinitesimal nudging. However, the algorithm has the drawback that its +numerical stability relies on symmetric nudging, which may be restrictive in +biological and analog implementations. In this work we first provide a solid +foundation for the objective underlying the dual propagation method, which also +reveals a surprising connection with adversarial robustness. Second, we +demonstrate how dual propagation is related to a particular adjoint state +method, which is stable regardless of asymmetric nudging. + +
+
+ comment: ICML 2024; 21 pages +
+
+
+
+
+ + ♻ ☆ Inducing Group Fairness in Prompt-Based Language Model Decisions + + +
+ Classifiers are used throughout industry to enforce policies, ranging from
+the detection of toxic content to age-appropriate content filtering. While
+these classifiers serve important functions, it is also essential that they are
+built in ways that minimize unfair biases for users.
+ One such fairness consideration is called group fairness, which desires that
+different sub-populations of users receive equal treatment. This is a
+well-studied problem in the context of 'classical' classifiers. However, the
+emergence of prompt-based language model (LM) decision making has created new
+opportunities to solve text-based classification tasks, and the fairness
+properties of these new classifiers are not yet well understood. Further, the
+`remediation toolkit' is incomplete for LM-based decision makers and little is
+understood about how to improve decision maker group fairness while maintaining
+classifier performance.
+ This work sets out to add more tools to that toolbox. We introduce
+adaptations of existing effective approaches from the classical classifier
+fairness to the prompt-based classifier space. We also devise simple methods
+that take advantage of the new structure of prompt-based decision makers and
+operate at the prompt level. We compare these approaches empirically on real
+data. Our results suggest that adaptations of approaches that are effective for
+classical classifiers remain effective in the LM-based classifier environment.
+However, there is room for further exploration of prompt-based remediation
+methods (and other remediation methods that take advantage of LM structure).
+
+
+
+
+
+ + ♻ ☆ Regression Trees Know Calculus + + +
+ Regression trees have emerged as a preeminent tool for solving real-world +regression problems due to their ability to deal with nonlinearities, +interaction effects and sharp discontinuities. In this article, we rather study +regression trees applied to well-behaved, differentiable functions, and +determine the relationship between node parameters and the local gradient of +the function being approximated. We find a simple estimate of the gradient +which can be efficiently computed using quantities exposed by popular tree +learning libraries. This allows the tools developed in the context of +differentiable algorithms, like neural nets and Gaussian processes, to be +deployed to tree-based models. To demonstrate this, we study measures of model +sensitivity defined in terms of integrals of gradients and demonstrate how to +compute them for regression trees using the proposed gradient estimates. +Quantitative and qualitative numerical experiments reveal the capability of +gradients estimated by regression trees to improve predictive analysis, solve +tasks in uncertainty quantification, and provide interpretation of model +behavior. + +
+
+ comment: Comments very welcome! +
+
+
+
+
+ + ♻ ☆ Asynchronous Message-Passing and Zeroth-Order Optimization Based + Distributed Learning with a Use-Case in Resource Allocation in Communication + Networks + + +
+ Distributed learning and adaptation have received significant interest and +found wide-ranging applications in machine learning and signal processing. +While various approaches, such as shared-memory optimization, multi-task +learning, and consensus-based learning (e.g., federated learning and learning +over graphs), focus on optimizing either local costs or a global cost, there +remains a need for further exploration of their interconnections. This paper +specifically focuses on a scenario where agents collaborate towards a common +task (i.e., optimizing a global cost equal to aggregated local costs) while +effectively having distinct individual tasks (i.e., optimizing individual local +parameters in a local cost). Each agent's actions can potentially impact other +agents' performance through interactions. Notably, each agent has access to +only its local zeroth-order oracle (i.e., cost function value) and shares +scalar values, rather than gradient vectors, with other agents, leading to +communication bandwidth efficiency and agent privacy. Agents employ +zeroth-order optimization to update their parameters, and the asynchronous +message-passing between them is subject to bounded but possibly random +communication delays. This paper presents theoretical convergence analyses and +establishes a convergence rate for nonconvex problems. Furthermore, it +addresses the relevant use-case of deep learning-based resource allocation in +communication networks and conducts numerical experiments in which agents, +acting as transmitters, collaboratively train their individual policies to +maximize a global reward, e.g., a sum of data rates. + +
+
+
+
+
+ + ♻ ☆ Understanding Generalizability of Diffusion Models Requires Rethinking + the Hidden Gaussian Structure + + +
+ In this work, we study the generalizability of diffusion models by looking +into the hidden properties of the learned score functions, which are +essentially a series of deep denoisers trained on various noise levels. We +observe that as diffusion models transition from memorization to +generalization, their corresponding nonlinear diffusion denoisers exhibit +increasing linearity. This discovery leads us to investigate the linear +counterparts of the nonlinear diffusion models, which are a series of linear +models trained to match the function mappings of the nonlinear diffusion +denoisers. Surprisingly, these linear denoisers are approximately the optimal +denoisers for a multivariate Gaussian distribution characterized by the +empirical mean and covariance of the training dataset. This finding implies +that diffusion models have the inductive bias towards capturing and utilizing +the Gaussian structure (covariance information) of the training dataset for +data generation. We empirically demonstrate that this inductive bias is a +unique property of diffusion models in the generalization regime, which becomes +increasingly evident when the model's capacity is relatively small compared to +the training dataset size. In the case that the model is highly +overparameterized, this inductive bias emerges during the initial training +phases before the model fully memorizes its training data. Our study provides +crucial insights into understanding the notable strong generalization +phenomenon recently observed in real-world diffusion models. + +
+
+
+
+
+ + ♻ ☆ OminiControl: Minimal and Universal Control for Diffusion Transformer + + +
+ In this paper, we introduce OminiControl, a highly versatile and +parameter-efficient framework that integrates image conditions into pre-trained +Diffusion Transformer (DiT) models. At its core, OminiControl leverages a +parameter reuse mechanism, enabling the DiT to encode image conditions using +itself as a powerful backbone and process them with its flexible multi-modal +attention processors. Unlike existing methods, which rely heavily on additional +encoder modules with complex architectures, OminiControl (1) effectively and +efficiently incorporates injected image conditions with only ~0.1% additional +parameters, and (2) addresses a wide range of image conditioning tasks in a +unified manner, including subject-driven generation and spatially-aligned +conditions such as edges, depth, and more. Remarkably, these capabilities are +achieved by training on images generated by the DiT itself, which is +particularly beneficial for subject-driven generation. Extensive evaluations +demonstrate that OminiControl outperforms existing UNet-based and DiT-adapted +models in both subject-driven and spatially-aligned conditional generation. +Additionally, we release our training dataset, Subjects200K, a diverse +collection of over 200,000 identity-consistent images, along with an efficient +data synthesis pipeline to advance research in subject-consistent generation. + +
+
+
+
+
+ + ♻ ☆ What Differentiates Educational Literature? A Multimodal Fusion Approach + of Transformers and Computational Linguistics + + +
+ The integration of new literature into the English curriculum remains a +challenge since educators often lack scalable tools to rapidly evaluate +readability and adapt texts for diverse classroom needs. This study proposes to +address this gap through a multimodal approach that combines transformer-based +text classification with linguistic feature analysis to align texts with UK Key +Stages. Eight state-of-the-art Transformers were fine-tuned on segmented text +data, with BERT achieving the highest unimodal F1 score of 0.75. In parallel, +500 deep neural network topologies were searched for the classification of +linguistic characteristics, achieving an F1 score of 0.392. The fusion of these +modalities shows a significant improvement, with every multimodal approach +outperforming all unimodal models. In particular, the ELECTRA Transformer fused +with the neural network achieved an F1 score of 0.996. Unimodal and multimodal +approaches are shown to have statistically significant differences in all +validation metrics (accuracy, precision, recall, F1 score) except for inference +time. The proposed approach is finally encapsulated in a stakeholder-facing web +application, providing non-technical stakeholder access to real-time insights +on text complexity, reading difficulty, curriculum alignment, and +recommendations for learning age range. The application empowers data-driven +decision making and reduces manual workload by integrating AI-based +recommendations into lesson planning for English literature. + +
+
+
+
+
+ + ♻ ☆ Discovering group dynamics in coordinated time series via hierarchical + recurrent switching-state models + + +
+ We seek a computationally efficient model for a collection of time series +arising from multiple interacting entities (a.k.a. "agents"). Recent models of +spatiotemporal patterns across individuals fail to incorporate explicit +system-level collective behavior that can influence the trajectories of +individual entities. To address this gap in the literature, we present a new +hierarchical switching-state model that can be trained in an unsupervised +fashion to simultaneously learn both system-level and individual-level +dynamics. We employ a latent system-level discrete state Markov chain that +provides top-down influence on latent entity-level chains which in turn govern +the emission of each observed time series. Recurrent feedback from the +observations to the latent chains at both entity and system levels allows +recent situational context to inform how dynamics unfold at all levels in +bottom-up fashion. We hypothesize that including both top-down and bottom-up +influences on group dynamics will improve interpretability of the learned +dynamics and reduce error when forecasting. Our hierarchical switching +recurrent dynamical model can be learned via closed-form variational coordinate +ascent updates to all latent chains that scale linearly in the number of +entities. This is asymptotically no more costly than fitting a separate model +for each entity. Analysis of both synthetic data and real basketball team +movements suggests our lean parametric model can achieve competitive forecasts +compared to larger neural network models that require far more computational +resources. Further experiments on soldier data as well as a synthetic task with +64 cooperating entities show how our approach can yield interpretable insights +about team dynamics over time. + +
+
+
+
+
+ + ♻ ☆ A Conditional Independence Test in the Presence of Discretization + + +
+ Testing conditional independence has many applications, such as in Bayesian +network learning and causal discovery. Different test methods have been +proposed. However, existing methods generally can not work when only +discretized observations are available. Specifically, consider $X_1$, +$\tilde{X}_2$ and $X_3$ are observed variables, where $\tilde{X}_2$ is a +discretization of latent variables $X_2$. Applying existing test methods to the +observations of $X_1$, $\tilde{X}_2$ and $X_3$ can lead to a false conclusion +about the underlying conditional independence of variables $X_1$, $X_2$ and +$X_3$. Motivated by this, we propose a conditional independence test +specifically designed to accommodate the presence of such discretization. To +achieve this, we design the bridge equations to recover the parameter +reflecting the statistical information of the underlying latent continuous +variables. An appropriate test statistic and its asymptotic distribution under +the null hypothesis of conditional independence have also been derived. Both +theoretical results and empirical validation have been provided, demonstrating +the effectiveness of our test methods. + +
+
+
+
+
+ + ♻ ☆ ConvMixFormer- A Resource-efficient Convolution Mixer for + Transformer-based Dynamic Hand Gesture Recognition + + +
+ Transformer models have demonstrated remarkable success in many domains such +as natural language processing (NLP) and computer vision. With the growing +interest in transformer-based architectures, they are now utilized for gesture +recognition. So, we also explore and devise a novel ConvMixFormer architecture +for dynamic hand gestures. The transformers use quadratic scaling of the +attention features with the sequential data, due to which these models are +computationally complex and heavy. We have considered this drawback of the +transformer and designed a resource-efficient model that replaces the +self-attention in the transformer with the simple convolutional layer-based +token mixer. The computational cost and the parameters used for the +convolution-based mixer are comparatively less than the quadratic +self-attention. Convolution-mixer helps the model capture the local spatial +features that self-attention struggles to capture due to their sequential +processing nature. Further, an efficient gate mechanism is employed instead of +a conventional feed-forward network in the transformer to help the model +control the flow of features within different stages of the proposed model. +This design uses fewer learnable parameters which is nearly half the vanilla +transformer that helps in fast and efficient training. The proposed method is +evaluated on NVidia Dynamic Hand Gesture and Briareo datasets and our model has +achieved state-of-the-art results on single and multimodal inputs. We have also +shown the parameter efficiency of the proposed ConvMixFormer model compared to +other methods. The source code is available at +https://github.com/mallikagarg/ConvMixFormer. + +
+
+
+
+
+ + ♻ ☆ DGNN-YOLO: Dynamic Graph Neural Networks with YOLO11 for Small Object + Detection and Tracking in Traffic Surveillance + + +
+ Accurate detection and tracking of small objects such as pedestrians, +cyclists, and motorbikes are critical for traffic surveillance systems, which +are crucial in improving road safety and decision-making in intelligent +transportation systems. However, traditional methods struggle with challenges +such as occlusion, low resolution, and dynamic traffic conditions, +necessitating innovative approaches to address these limitations. This paper +introduces DGNN-YOLO, a novel framework integrating dynamic graph neural +networks (DGNN) with YOLO11 to enhance small object detection and tracking in +traffic surveillance systems. The framework leverages YOLO11's advanced spatial +feature extraction capabilities for precise object detection and incorporates +DGNN to model spatial-temporal relationships for robust real-time tracking +dynamically. By constructing and updating graph structures, DGNN-YOLO +effectively represents objects as nodes and their interactions as edges, +ensuring adaptive and accurate tracking in complex and dynamic environments. +Extensive experiments demonstrate that DGNN-YOLO consistently outperforms +state-of-the-art methods in detecting and tracking small objects under diverse +traffic conditions, achieving the highest precision (0.8382), recall (0.6875), +and mAP@0.5:0.95 (0.6476), showcasing its robustness and scalability, +particularly in challenging scenarios involving small and occluded objects. +This work provides a scalable, real-time traffic surveillance and analysis +solution, significantly contributing to intelligent transportation systems. + +
+
+
+
+
+ + ♻ ☆ Probabilistic Graph Rewiring via Virtual Nodes NeurIPS 2024 + + +
+ Message-passing graph neural networks (MPNNs) have emerged as a powerful +paradigm for graph-based machine learning. Despite their effectiveness, MPNNs +face challenges such as under-reaching and over-squashing, where limited +receptive fields and structural bottlenecks hinder information flow in the +graph. While graph transformers hold promise in addressing these issues, their +scalability is limited due to quadratic complexity regarding the number of +nodes, rendering them impractical for larger graphs. Here, we propose +implicitly rewired message-passing neural networks (IPR-MPNNs), a novel +approach that integrates implicit probabilistic graph rewiring into MPNNs. By +introducing a small number of virtual nodes, i.e., adding additional nodes to a +given graph and connecting them to existing nodes, in a differentiable, +end-to-end manner, IPR-MPNNs enable long-distance message propagation, +circumventing quadratic complexity. Theoretically, we demonstrate that +IPR-MPNNs surpass the expressiveness of traditional MPNNs. Empirically, we +validate our approach by showcasing its ability to mitigate under-reaching and +over-squashing effects, achieving state-of-the-art performance across multiple +graph datasets. Notably, IPR-MPNNs outperform graph transformers while +maintaining significantly faster computational efficiency. + +
+
+ comment: Accepted at 38th Conference on Neural Information Processing Systems + (NeurIPS 2024), Vancouver, Canada +
+
+
+
+
+ + ♻ ☆ ForecastBench: A Dynamic Benchmark of AI Forecasting Capabilities + + +
+ Forecasts of future events are essential inputs into informed +decision-making. Machine learning (ML) systems have the potential to deliver +forecasts at scale, but there is no framework for evaluating the accuracy of ML +systems on a standardized set of forecasting questions. To address this gap, we +introduce ForecastBench: a dynamic benchmark that evaluates the accuracy of ML +systems on an automatically generated and regularly updated set of 1,000 +forecasting questions. To avoid any possibility of data leakage, ForecastBench +is comprised solely of questions about future events that have no known answer +at the time of submission. We quantify the capabilities of current ML systems +by collecting forecasts from expert (human) forecasters, the general public, +and LLMs on a random subset of questions from the benchmark ($N=200$). While +LLMs have achieved super-human performance on many benchmarks, they perform +less well here: expert forecasters outperform the top-performing LLM (p-value +$<0.01$). We display system and human scores in a public leaderboard at +www.forecastbench.org. + +
+
+
+
+
+ + ♻ ☆ Physics-Informed Real NVP for Satellite Power System Fault Detection + + +
+ The unique challenges posed by the space environment, characterized by +extreme conditions and limited accessibility, raise the need for robust and +reliable techniques to identify and prevent satellite faults. Fault detection +methods in the space sector are required to ensure mission success and to +protect valuable assets. In this context, this paper proposes an Artificial +Intelligence (AI) based fault detection methodology and evaluates its +performance on ADAPT (Advanced Diagnostics and Prognostics Testbed), an +Electrical Power System (EPS) dataset, crafted in laboratory by NASA. Our study +focuses on the application of a physics-informed (PI) real-valued non-volume +preserving (Real NVP) model for fault detection in space systems. The efficacy +of this method is systematically compared against other AI approaches such as +Gated Recurrent Unit (GRU) and Autoencoder-based techniques. Results show that +our physics-informed approach outperforms existing methods of fault detection, +demonstrating its suitability for addressing the unique challenges of satellite +EPS sub-system faults. Furthermore, we unveil the competitive advantage of +physics-informed loss in AI models to address specific space needs, namely +robustness, reliability, and power constraints, crucial for space exploration +and satellite missions. + +
+
+ comment: C. Cena, U. Albertin, M. Martini, S. Bucci and M. Chiaberge, + "Physics-Informed Real NVP for Satellite Power System Fault Detection," 2024 + IEEE International Conference on Advanced Intelligent Mechatronics (AIM), + Boston, MA, USA, 2024, pp. 679-684, doi: 10.1109/AIM55361.2024.10636990 +
+
+
+
+
+ + ♻ ☆ A Self-Supervised Task for Fault Detection in Satellite Multivariate + Time Series SP + + +
+ In the space sector, due to environmental conditions and restricted +accessibility, robust fault detection methods are imperative for ensuring +mission success and safeguarding valuable assets. This work proposes a novel +approach leveraging Physics-Informed Real NVP neural networks, renowned for +their ability to model complex and high-dimensional distributions, augmented +with a self-supervised task based on sensors' data permutation. It focuses on +enhancing fault detection within the satellite multivariate time series. The +experiments involve various configurations, including pre-training with +self-supervision, multi-task learning, and standalone self-supervised training. +Results indicate significant performance improvements across all settings. In +particular, employing only the self-supervised loss yields the best overall +results, suggesting its efficacy in guiding the network to extract relevant +features for fault detection. This study presents a promising direction for +improving fault detection in space systems and warrants further exploration in +other datasets and applications. + +
+
+ comment: SPAICE: AI in and for Space, 2024 +
+
+
+
+
+ + ♻ ☆ Brain Tumour Removing and Missing Modality Generation using 3D WDM + + +
+ This paper presents the second-placed solution for task 8 and the
+participation solution for task 7 of BraTS 2024. The adoption of automated
+brain analysis algorithms to support clinical practice is increasing. However,
+many of these algorithms struggle with the presence of brain lesions or the
+absence of certain MRI modalities. The alterations in the brain's morphology
+lead to high variability and thus poor performance of predictive models that
+were trained only on healthy brains. The lack of information that is usually
+provided by some of the missing MRI modalities also reduces the reliability of
+the prediction models trained with all modalities. In order to improve the
+performance of these models, we propose the use of conditional 3D wavelet
+diffusion models. The wavelet transform enabled full-resolution image training
+and prediction on a GPU with 48 GB VRAM, without patching or downsampling,
+preserving all information for prediction. The code for these tasks is
+available at https://github.com/ShadowTwin41/BraTS_2023_2024_solutions.
+
+
+
+
+
+ + ♻ ☆ Limits to Predicting Online Speech Using Large Language Models + + +
+ We study the predictability of online speech on social media, and whether +predictability improves with information outside a user's own posts. Recent +theoretical results suggest that posts from a user's social circle are as +predictive of the user's future posts as that of the user's past posts. +Motivated by the success of large language models, we empirically test this +hypothesis. We define predictability as a measure of the model's uncertainty, +i.e., its negative log-likelihood on future tokens given context. As the basis +of our study, we collect 10M tweets for ``tweet-tuning'' base models and a +further 6.25M posts from more than five thousand X (previously Twitter) users +and their peers. Across four large language models ranging in size from 1.5 +billion to 70 billion parameters, we find that predicting a user's posts from +their peers' posts performs poorly. Moreover, the value of the user's own posts +for prediction is consistently higher than that of their peers'. We extend our +investigation with a detailed analysis on what's learned in-context and the +robustness of our findings. From context, base models learn to correctly +predict @-mentions and hashtags. Moreover, our results replicate if instead of +prompting the model with additional context, we finetune on it. Across the +board, we find that predicting the posts of individual users remains hard. + +
+
+
+
+
+ + ♻ ☆ From Text to Insight: Large Language Models for Materials Science Data + Extraction + + +
+ The vast majority of materials science knowledge exists in unstructured +natural language, yet structured data is crucial for innovative and systematic +materials design. Traditionally, the field has relied on manual curation and +partial automation for data extraction for specific use cases. The advent of +large language models (LLMs) represents a significant shift, potentially +enabling efficient extraction of structured, actionable data from unstructured +text by non-experts. While applying LLMs to materials science data extraction +presents unique challenges, domain knowledge offers opportunities to guide and +validate LLM outputs. This review provides a comprehensive overview of +LLM-based structured data extraction in materials science, synthesizing current +knowledge and outlining future directions. We address the lack of standardized +guidelines and present frameworks for leveraging the synergy between LLMs and +materials science expertise. This work serves as a foundational resource for +researchers aiming to harness LLMs for data-driven materials research. The +insights presented here could significantly enhance how researchers across +disciplines access and utilize scientific information, potentially accelerating +the development of novel materials for critical societal needs. + +
+
+
+
+
+ + ♻ ☆ On Meta-Prompting + + +
+ Modern generative language models are capable of interpreting input strings +as instructions, or prompts, and carry out tasks based on them. Many approaches +to prompting and pre-training these models involve the automated generation of +these prompts: meta-prompting, or prompting to obtain prompts. We propose a +theoretical framework based on category theory to generalize and describe them. +This framework is flexible enough to account for stochasticity, and allows us +to obtain formal results around task agnosticity and equivalence of various +meta-prompting approaches. Experimentally, we test our framework in two active +areas of model research: creativity and ideation. We find that user preference +strongly favors (p < 0.01) the prompts generated under meta-prompting, as well +as their corresponding outputs, over a series of hardcoded baseline prompts +that include the original task definition. Using our framework, we argue that +meta-prompting is more effective than basic prompting at generating desirable +outputs. + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ Ranking by Lifts: A Cost-Benefit Approach to Large-Scale A/B Tests + + +
+ A/B testers that conduct large-scale tests often prioritize lifts as the main +outcome metric and want to be able to control costs resulting from false +rejections of the null. This work develops a decision-theoretic framework for +maximizing profits subject to false discovery rate (FDR) control. We build an +empirical Bayes solution for the problem via a greedy knapsack approach. We +derive an oracle rule based on ranking the ratio of expected lifts and the cost +of wrong rejections using the local false discovery rate (lfdr) statistic. Our +oracle decision rule is valid and optimal for large-scale tests. Further, we +establish asymptotic validity for the data-driven procedure and demonstrate +finite-sample validity in experimental studies. We also demonstrate the merit +of the proposed method over other FDR control methods. Finally, we discuss an +application to data collected by experiments on the Optimizely platform. + +
+
+ comment: Updated +
+
+
+
+
+ + ♻ ☆ Deep Learning 2.0: Artificial Neurons That Matter -- Reject Correlation, + Embrace Orthogonality + + +
+ We introduce a yat-product-powered neural network, the Neural Matter Network
+(NMN), a breakthrough in deep learning that achieves non-linear pattern
+recognition without activation functions. Our key innovation relies on the
+yat-product, which naturally induces non-linearity by
+projecting inputs into a pseudo-metric space, eliminating the need for
+traditional activation functions while maintaining only a softmax layer for
+final class probability distribution. This approach simplifies network
+architecture and provides unprecedented transparency into the network's
+decision-making process. Our comprehensive empirical evaluation across
+different datasets demonstrates that NMN consistently outperforms traditional
+MLPs. The results challenge the assumption that separate activation functions
+are necessary for effective deep-learning models. The implications of this work
+extend beyond immediate architectural benefits: by eliminating intermediate
+activation functions while preserving non-linear capabilities, yat-MLP
+establishes a new paradigm for neural network design that combines simplicity
+with effectiveness. Most importantly, our approach provides unprecedented
+insights into the traditionally opaque "black-box" nature of neural networks,
+offering a clearer understanding of how these models process and classify
+information.
+
+
+ comment: fixed proof, added softermax +
+
+
+
+
+ + ♻ ☆ Latent Diffusion for Neural Spiking Data NeurIPS + 2024 + + +
+ Modern datasets in neuroscience enable unprecedented inquiries into the +relationship between complex behaviors and the activity of many simultaneously +recorded neurons. While latent variable models can successfully extract +low-dimensional embeddings from such recordings, using them to generate +realistic spiking data, especially in a behavior-dependent manner, still poses +a challenge. Here, we present Latent Diffusion for Neural Spiking data (LDNS), +a diffusion-based generative model with a low-dimensional latent space: LDNS +employs an autoencoder with structured state-space (S4) layers to project +discrete high-dimensional spiking data into continuous time-aligned latents. On +these inferred latents, we train expressive (conditional) diffusion models, +enabling us to sample neural activity with realistic single-neuron and +population spiking statistics. We validate LDNS on synthetic data, accurately +recovering latent structure, firing rates, and spiking statistics. Next, we +demonstrate its flexibility by generating variable-length data that mimics +human cortical activity during attempted speech. We show how to equip LDNS with +an expressive observation model that accounts for single-neuron dynamics not +mediated by the latent state, further increasing the realism of generated +samples. Finally, conditional LDNS trained on motor cortical activity during +diverse reaching behaviors can generate realistic spiking data given reach +direction or unseen reach trajectories. In summary, LDNS simultaneously enables +inference of low-dimensional latents and realistic conditional generation of +neural spiking datasets, opening up further possibilities for simulating +experimentally testable hypotheses. + +
+
+ comment: 38th Conference on Neural Information Processing Systems (NeurIPS + 2024) +
+
+
+
+
+ + ♻ ☆ Continual Learning in the Presence of Repetition CVPR + + +
+ Continual learning (CL) provides a framework for training models in +ever-evolving environments. Although re-occurrence of previously seen objects +or tasks is common in real-world problems, the concept of repetition in the +data stream is not often considered in standard benchmarks for CL. Unlike with +the rehearsal mechanism in buffer-based strategies, where sample repetition is +controlled by the strategy, repetition in the data stream naturally stems from +the environment. This report provides a summary of the CLVision challenge at +CVPR 2023, which focused on the topic of repetition in class-incremental +learning. The report initially outlines the challenge objective and then +describes three solutions proposed by finalist teams that aim to effectively +exploit the repetition in the stream to learn continually. The experimental +results from the challenge highlight the effectiveness of ensemble-based +solutions that employ multiple versions of similar modules, each trained on +different but overlapping subsets of classes. This report underscores the +transformative potential of taking a different perspective in CL by employing +repetition in the data stream to foster innovative strategy design. + +
+
+ comment: Accepted version, to appear in Neural Networks; Challenge Report of + the 4th Workshop on Continual Learning in Computer Vision at CVPR +
+
+
+
+
+ + ♻ ☆ Fair Generalized Linear Mixed Models + + +
+ When using machine learning for automated prediction, it is important to
+account for fairness in the prediction. Fairness in machine learning aims to
+ensure that biases in the data and model inaccuracies do not lead to
+discriminatory decisions. E.g., predictions from fair machine learning models
+should not discriminate against sensitive variables such as sexual orientation
+and ethnicity. The training data is often obtained from social surveys. In
+social surveys, oftentimes the data collection process is a strata sampling,
+e.g. due to cost restrictions. In strata samples, the assumption of
+independence between the observations is not fulfilled. Hence, if the machine
+learning models do not account for the strata correlations, the results may be
+biased. Especially high is the bias in cases where the strata assignment is
+correlated to the variable of interest. We present in this paper an algorithm
+that can handle both problems simultaneously, and we demonstrate the impact of
+stratified sampling on the quality of fair machine learning predictions in a
+reproducible simulation study.
+
+
+ comment: 25 pages, 12 figures. arXiv admin note: text overlap with + arXiv:2405.06433 +
+
+
+
+
+ + ♻ ☆ Fair Mixed Effects Support Vector Machine + + +
+ To ensure unbiased and ethical automated predictions, fairness must be a core
+principle in machine learning applications. Fairness in machine learning aims
+to mitigate biases present in the training data and model imperfections that
+could lead to discriminatory outcomes. This is achieved by preventing the model
+from making decisions based on sensitive characteristics like ethnicity or
+sexual orientation. A fundamental assumption in machine learning is the
+independence of observations. However, this assumption often does not hold true
+for data describing social phenomena, where data points are often
+clustered. Hence, if the machine learning models do not account for the cluster
+correlations, the results may be biased. Especially high is the bias in cases
+where the cluster assignment is correlated to the variable of interest. We
+present a fair mixed effects support vector machine algorithm that can handle
+both problems simultaneously. With a reproducible simulation study we
+demonstrate the impact of clustered data on the quality of fair machine
+learning predictions.
+
+
+ comment: 17 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ ProtFAD: Introducing function-aware domains as implicit modality towards + protein function prediction + + +
+ Protein function prediction is currently achieved by encoding its sequence or +structure, where the sequence-to-function transcendence and high-quality +structural data scarcity lead to obvious performance bottlenecks. Protein +domains are "building blocks" of proteins that are functionally independent, +and their combinations determine the diverse biological functions. However, +most existing studies have yet to thoroughly explore the intricate functional +information contained in the protein domains. To fill this gap, we propose a +synergistic integration approach for a function-aware domain representation, +and a domain-joint contrastive learning strategy to distinguish different +protein functions while aligning the modalities. Specifically, we align the +domain semantics with GO terms and text description to pre-train domain +embeddings. Furthermore, we partition proteins into multiple sub-views based on +continuous joint domains for contrastive training under the supervision of a +novel triplet InfoNCE loss. Our approach significantly and comprehensively +outperforms the state-of-the-art methods on various benchmarks, and clearly +differentiates proteins carrying distinct functions compared to the competitor. +Our implementation is available at +https://github.com/AI-HPC-Research-Team/ProtFAD. + +
+
+ comment: 17 pages, 7 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Anomaly Detection in Medical Imaging -- A Mini Review SC2021 + + +
+ The increasing digitization of medical imaging enables machine learning based
+improvements in detecting, visualizing and segmenting lesions, easing the
+workload for medical experts. However, supervised machine learning requires
+reliable labelled data, which is often difficult or impossible to collect or
+at least time consuming and thereby costly. Therefore methods requiring only
+partly labeled data (semi-supervised) or no labeling at all (unsupervised
+methods) have been applied more regularly. Anomaly detection is one possible
+methodology that is able to leverage semi-supervised and unsupervised methods
+to handle medical imaging tasks like classification and segmentation. This
+paper uses a semi-exhaustive literature review of relevant anomaly detection
+papers in medical imaging to cluster into applications, highlight important
+results, establish lessons learned and give further advice on how to approach
+anomaly detection in medical imaging. The qualitative analysis is based on
+google scholar and 4 different search terms, resulting in 120 different
+analysed papers. The main results showed that the current research is mostly
+motivated by reducing the need for labelled data. Also, the successful and
+substantial amount of research in the brain MRI domain shows the potential for
+applications in further domains like OCT and chest X-ray.
+
+
+ comment: Accepted and presented at iDSC2021 edit: During work on this + publication Maximilian Ernst Tschuchnig was affiliated with Salzburg + University of Applied Sciences and University of Salzburg +
+
+
+
+
+ + ♻ ☆ Moral Alignment for LLM Agents + + +
+ Decision-making agents based on pre-trained Large Language Models (LLMs) are +increasingly being deployed across various domains of human activity. While +their applications are currently rather specialized, several research efforts +are under way to develop more generalist agents. As LLM-based systems become +more agentic, their influence on human activity will grow and the transparency +of this will decrease. Consequently, developing effective methods for aligning +them to human values is vital. + The prevailing practice in alignment often relies on human preference data +(e.g., in RLHF or DPO), in which values are implicit and are essentially +deduced from relative preferences over different model outputs. In this work, +instead of relying on human feedback, we introduce the design of reward +functions that explicitly encode core human values for Reinforcement +Learning-based fine-tuning of foundation agent models. Specifically, we use +intrinsic rewards for the moral alignment of LLM agents. + We evaluate our approach using the traditional philosophical frameworks of +Deontological Ethics and Utilitarianism, quantifying moral rewards for agents +in terms of actions and consequences on the Iterated Prisoner's Dilemma (IPD) +environment. We also show how moral fine-tuning can be deployed to enable an +agent to unlearn a previously developed selfish strategy. Finally, we find that +certain moral strategies learned on the IPD game generalize to several other +matrix game environments. In summary, we demonstrate that fine-tuning with +intrinsic rewards is a promising general solution for aligning LLM agents to +human values, and it might represent a more transparent and cost-effective +alternative to currently predominant alignment techniques. + +
+
+
+
+
+ + ♻ ☆ Constraining Generative Models for Engineering Design with Negative Data + + +
+ Generative models have recently achieved remarkable success and widespread +adoption in society, yet they often struggle to generate realistic and accurate +outputs. This challenge extends beyond language and vision into fields like +engineering design, where safety-critical engineering standards and +non-negotiable physical laws tightly constrain what outputs are considered +acceptable. In this work, we introduce a novel training method to guide a +generative model toward constraint-satisfying outputs using `negative data' -- +examples of what to avoid. Our negative-data generative model (NDGM) +formulation easily outperforms classic models, generating 1/6 as many +constraint-violating samples using 1/8 as much data in certain problems. It +also consistently outperforms other baselines, achieving a balance between +constraint satisfaction and distributional similarity that is unsurpassed by +any other model in 12 of the 14 problems tested. This widespread superiority is +rigorously demonstrated across numerous synthetic tests and real engineering +problems, such as ship hull synthesis with hydrodynamic constraints and vehicle +design with impact safety constraints. Our benchmarks showcase both the +best-in-class performance of our new NDGM formulation and the overall dominance +of NDGMs versus classic generative models. We publicly release the code and +benchmarks at https://github.com/Lyleregenwetter/NDGMs. + +
+
+
+
+
+ + ♻ ☆ Autobidders with Budget and ROI Constraints: Efficiency, Regret, and + Pacing Dynamics COLT 2024 + + +
+ We study a game between autobidding algorithms that compete in an online +advertising platform. Each autobidder is tasked with maximizing its +advertiser's total value over multiple rounds of a repeated auction, subject to +budget and return-on-investment constraints. We propose a gradient-based +learning algorithm that is guaranteed to satisfy all constraints and achieves +vanishing individual regret. Our algorithm uses only bandit feedback and can be +used with the first- or second-price auction, as well as with any +"intermediate" auction format. Our main result is that when these autobidders +play against each other, the resulting expected liquid welfare over all rounds +is at least half of the expected optimal liquid welfare achieved by any +allocation. This holds whether or not the bidding dynamics converges to an +equilibrium. + +
+
+ comment: Appeared at COLT 2024. Numerical experiments added since Jun'24 + version +
+
+
+
+
+ + ♻ ☆ Evaluation of Multi-Scale Multiple Instance Learning to Improve Thyroid + Cancer Classification + + +
+ Thyroid cancer is currently the fifth most common malignancy diagnosed in +women. Since differentiation of cancer sub-types is important for treatment and +current, manual methods are time consuming and subjective, automatic +computer-aided differentiation of cancer types is crucial. Manual +differentiation of thyroid cancer is based on tissue sections, analysed by +pathologists using histological features. Due to the enormous size of gigapixel +whole slide images, holistic classification using deep learning methods is not +feasible. Patch based multiple instance learning approaches, combined with +aggregations such as bag-of-words, is a common approach. This work's +contribution is to extend a patch based state-of-the-art method by generating +and combining feature vectors of three different patch resolutions and +analysing three distinct ways of combining them. The results showed +improvements in one of the three multi-scale approaches, while the others led +to decreased scores. This provides motivation for analysis and discussion of +the individual approaches. + +
+
+ comment: Accepted and presented at IPTA 2022 (Best Paper) edit: During work on + this publication Maximilian Ernst Tschuchnig was affiliated with Salzburg + University of Applied Sciences and University of Salzburg +
+
+
+
+
+ + ♻ ☆ Sample Complexity Bounds for Linear System Identification from a Finite + Set + + +
+ This paper considers a finite sample perspective on the problem of +identifying an LTI system from a finite set of possible systems using +trajectory data. To this end, we use the maximum likelihood estimator to +identify the true system and provide an upper bound for its sample complexity. +Crucially, the derived bound does not rely on a potentially restrictive +stability assumption. Additionally, we leverage tools from information theory +to provide a lower bound to the sample complexity that holds independently of +the used estimator. The derived sample complexity bounds are analyzed +analytically and numerically. + +
+
+
+
+
+ + ♻ ☆ Cross-Refine: Improving Natural Language Explanation Generation by + Learning in Tandem COLING 2025 + + +
+ Natural language explanations (NLEs) are vital for elucidating the reasoning +behind large language model (LLM) decisions. Many techniques have been +developed to generate NLEs using LLMs. However, like humans, LLMs might not +always produce optimal NLEs on first attempt. Inspired by human learning +processes, we introduce Cross-Refine, which employs role modeling by deploying +two LLMs as generator and critic, respectively. The generator outputs a first +NLE and then refines this initial explanation using feedback and suggestions +provided by the critic. Cross-Refine does not require any supervised training +data or additional training. We validate Cross-Refine across three NLP tasks +using three state-of-the-art open-source LLMs through automatic and human +evaluation. We select Self-Refine (Madaan et al., 2023) as the baseline, which +only utilizes self-feedback to refine the explanations. Our findings from +automatic evaluation and a user study indicate that Cross-Refine outperforms +Self-Refine. Meanwhile, Cross-Refine can perform effectively with less powerful +LLMs, whereas Self-Refine only yields strong results with ChatGPT. +Additionally, we conduct an ablation study to assess the importance of feedback +and suggestions. Both of them play an important role in refining explanations. +We further evaluate Cross-Refine on a bilingual dataset in English and German. + +
+
+ comment: Accepted at COLING 2025; long paper +
+
+
+
+
+ + ♻ ☆ BK-SDM: A Lightweight, Fast, and Cheap Version of Stable Diffusion ECCV 2024 + + +
+ Text-to-image (T2I) generation with Stable Diffusion models (SDMs) involves +high computing demands due to billion-scale parameters. To enhance efficiency, +recent studies have reduced sampling steps and applied network quantization +while retaining the original architectures. The lack of architectural reduction +attempts may stem from worries over expensive retraining for such massive +models. In this work, we uncover the surprising potential of block pruning and +feature distillation for low-cost general-purpose T2I. By removing several +residual and attention blocks from the U-Net of SDMs, we achieve 30%~50% +reduction in model size, MACs, and latency. We show that distillation +retraining is effective even under limited resources: using only 13 A100 days +and a tiny dataset, our compact models can imitate the original SDMs (v1.4 and +v2.1-base with over 6,000 A100 days). Benefiting from the transferred +knowledge, our BK-SDMs deliver competitive results on zero-shot MS-COCO against +larger multi-billion parameter models. We further demonstrate the applicability +of our lightweight backbones in personalized generation and image-to-image +translation. Deployment of our models on edge devices attains 4-second +inference. Code and models can be found at: +https://github.com/Nota-NetsPresso/BK-SDM + +
+
+ comment: ECCV 2024 Camera-Ready Version +
+
+
+
+
+ + ♻ ☆ MASP: Scalable GNN-based Planning for Multi-Agent Navigation + + +
+ We investigate multi-agent navigation tasks, where multiple agents need to
+reach initially unassigned goals in a limited time. Classical planning-based
+methods suffer from expensive computation overhead at each step and offer
+limited expressiveness for complex cooperation strategies. In contrast,
+reinforcement learning (RL) has recently become a popular approach for
+addressing this issue. However, RL struggles with low data efficiency and
+cooperation when directly exploring (nearly) optimal policies in a large
+exploration space, especially with an increased number of agents (e.g., 10+
+agents) or in complex environments (e.g., 3-D simulators). In this paper, we
+propose the Multi-Agent Scalable Graph-based Planner (MASP), a goal-conditioned
+hierarchical planner for navigation tasks with a substantial number of agents
+in the decentralized setting. MASP employs a hierarchical framework to reduce
+space complexity by decomposing a large exploration space into multiple
+goal-conditioned subspaces, where a high-level policy assigns agents goals, and
+a low-level policy navigates agents toward designated goals. For agent
+cooperation and the adaptation to varying team sizes, we model agents and goals
+as graphs to better capture their relationship. The high-level policy, the Goal
+Matcher, leverages a graph-based Self-Encoder and Cross-Encoder to optimize
+goal assignment by updating the agent and the goal graphs. The low-level
+policy, the Coordinated Action Executor, introduces the Group Information
+Fusion to facilitate group division and extract agent relationships across
+groups, enhancing training efficiency for agent cooperation. The results
+demonstrate that MASP outperforms RL and planning-based baselines in task
+efficiency.
+
+
+ comment: Submitted to IEEE RA-L +
+
+
+
+
+ + ♻ ☆ Masked Generative Priors Improve World Models Sequence Modelling + Capabilities + + +
+ Deep Reinforcement Learning (RL) has become the leading approach for creating +artificial agents in complex environments. Model-based approaches, which are RL +methods with world models that predict environment dynamics, are among the most +promising directions for improving data efficiency, forming a critical step +toward bridging the gap between research and real-world deployment. In +particular, world models enhance sample efficiency by learning in imagination, +which involves training a generative sequence model of the environment in a +self-supervised manner. Recently, Masked Generative Modelling has emerged as a +more efficient and superior inductive bias for modelling and generating token +sequences. Building on the Efficient Stochastic Transformer-based World Models +(STORM) architecture, we replace the traditional MLP prior with a Masked +Generative Prior (e.g., MaskGIT Prior) and introduce GIT-STORM. We evaluate our +model on two downstream tasks: reinforcement learning and video prediction. +GIT-STORM demonstrates substantial performance gains in RL tasks on the Atari +100k benchmark. Moreover, we apply Transformer-based World Models to continuous +action environments for the first time, addressing a significant gap in prior +research. To achieve this, we employ a state mixer function that integrates +latent state representations with actions, enabling our model to handle +continuous control tasks. We validate this approach through qualitative and +quantitative analyses on the DeepMind Control Suite, showcasing the +effectiveness of Transformer-based World Models in this new domain. Our results +highlight the versatility and efficacy of the MaskGIT dynamics prior, paving +the way for more accurate world models and effective RL policies. + +
+
+
+
+
+ + ♻ ☆ Topology Only Pre-Training: Towards Generalised Multi-Domain Graph + Models + + +
+ The principal benefit of unsupervised representation learning is that a +pre-trained model can be fine-tuned where data or labels are scarce. Existing +approaches for graph representation learning are domain specific, maintaining +consistent node and edge features across the pre-training and target datasets. +This has precluded transfer to multiple domains. We present Topology Only +Pre-Training (ToP), a graph pre-training method based on node and edge feature +exclusion. We show positive transfer on evaluation datasets from multiple +domains, including domains not present in pre-training data, running directly +contrary to assumptions made in contemporary works. On 75% of experiments, ToP +models perform significantly $p \leq 0.01$ better than a supervised baseline. +Performance is significantly positive on 85.7% of tasks when node and edge +features are used in fine-tuning. We further show that out-of-domain topologies +can produce more useful pre-training than in-domain. Under ToP we show better +transfer from non-molecule pre-training, compared to molecule pre-training, on +79% of molecular benchmarks. Against the limited set of other generalist graph +models ToP performs strongly, including against models with many orders of +magnitude larger. These findings show that ToP opens broad areas of research in +both transfer learning on scarcely populated graph domains and in graph +foundation models. + +
+
+ comment: 28 pages, 5 figures, 5 tables. For in-development code see + https://github.com/neutralpronoun/general-gcl +
+
+
+
+
+ + ♻ ☆ Multi-turn Reinforcement Learning from Preference Human Feedback + + +
+ Reinforcement Learning from Human Feedback (RLHF) has become the standard +approach for aligning Large Language Models (LLMs) with human preferences, +allowing LLMs to demonstrate remarkable abilities in various tasks. Existing +methods work by emulating the preferences at the single decision (turn) level, +limiting their capabilities in settings that require planning or multi-turn +interactions to achieve a long-term goal. In this paper, we address this issue +by developing novel methods for Reinforcement Learning (RL) from preference +feedback between two full multi-turn conversations. In the tabular setting, we +present a novel mirror-descent-based policy optimization algorithm for the +general multi-turn preference-based RL problem, and prove its convergence to +Nash equilibrium. To evaluate performance, we create a new environment, +Education Dialogue, where a teacher agent guides a student in learning a random +topic, and show that a deep RL variant of our algorithm outperforms RLHF +baselines. Finally, we show that in an environment with explicit rewards, our +algorithm recovers the same performance as a reward-based RL baseline, despite +relying solely on a weaker preference signal. + +
+
+
+
+
+ + ♻ ☆ Methods for generating and evaluating synthetic longitudinal patient + data: a systematic review + + +
+ The rapid growth in data availability has facilitated research and +development, yet not all industries have benefited equally due to legal and +privacy constraints. The healthcare sector faces significant challenges in +utilizing patient data because of concerns about data security and +confidentiality. To address this, various privacy-preserving methods, including +synthetic data generation, have been proposed. Synthetic data replicate +existing data as closely as possible, acting as a proxy for sensitive +information. While patient data are often longitudinal, this aspect remains +underrepresented in existing reviews of synthetic data generation in +healthcare. This paper maps and describes methods for generating and evaluating +synthetic longitudinal patient data in real-life settings through a systematic +literature review, conducted following the PRISMA guidelines and incorporating +data from five databases up to May 2024. Thirty-nine methods were identified, +with four addressing all challenges of longitudinal data generation, though +none included privacy-preserving mechanisms. Resemblance was evaluated in most +studies, utility in the majority, and privacy in just over half. Only a small +fraction of studies assessed all three aspects. Our findings highlight the need +for further research in this area. + +
+
+
+
+
+ + ♻ ☆ Combining Induction and Transduction for Abstract Reasoning + + +
+ When learning an input-output mapping from very few examples, is it better to +first infer a latent function that explains the examples, or is it better to +directly predict new test outputs, e.g. using a neural network? We study this +question on ARC by training neural models for induction (inferring latent +functions) and transduction (directly predicting the test output for a given +test input). We train on synthetically generated variations of Python programs +that solve ARC training tasks. We find inductive and transductive models solve +different kinds of test problems, despite having the same training problems and +sharing the same neural architecture: Inductive program synthesis excels at +precise computations, and at composing multiple concepts, while transduction +succeeds on fuzzier perceptual concepts. Ensembling them approaches human-level +performance on ARC. + +
+
+
+
+
+ + ♻ ☆ Differentially Private Zeroth-Order Methods for Scalable Large Language + Model Finetuning + + +
+ Fine-tuning on task-specific datasets is a widely-embraced paradigm of
+harnessing the powerful capability of pretrained LLMs for various downstream
+tasks. Due to the popularity of LLMs fine-tuning and its accompanying privacy
+concerns, differentially private (DP) fine-tuning of pretrained LLMs has been
+widely used to safeguard the privacy of task-specific datasets. Lying at the
+design core of DP LLM fine-tuning methods is the satisfactory tradeoff among
+privacy, utility, and scalability. Most existing methods build upon the seminal
+work of DP-SGD. Despite pushing the scalability of DP-SGD to its limit,
+DP-SGD-based fine-tuning methods are unfortunately limited by the inherent
+inefficiency of SGD.
+ In this paper, we investigate the potential of DP zeroth-order methods for
+LLM pretraining, which avoids the scalability bottleneck of SGD by
+approximating the gradient with the more efficient zeroth-order gradient.
+Rather than treating the zeroth-order method as a drop-in replacement for SGD,
+this paper presents a comprehensive study both theoretically and empirically.
+First, we propose the stagewise DP zeroth-order method (DP-ZOSO) that
+dynamically schedules key hyperparameters. This design is grounded on the
+synergy between DP random perturbation and the gradient approximation error of
+the zeroth-order method, and its effect on fine-tuning trajectory.
+ We provide theoretical analysis for both proposed methods. We conduct
+extensive empirical analysis on both encoder-only masked language model and
+decoder-only autoregressive language model, achieving impressive results in
+terms of scalability and utility regardless of the class of tasks (compared
+with DPZero, DP-ZOPO improves $4.5\%$ on SST-5, $5.5\%$ on MNLI with
+RoBERTa-Large and 9.2\% on CB, 3.9\% on BoolQ with OPT-2.7b when $\epsilon=4$,
+demonstrates more significant enhancement in performance on more complicated
+tasks).
+
+
+
+
+
+ + ♻ ☆ Learning General Representation of 12-Lead Electrocardiogram with a + Joint-Embedding Predictive Architecture + + +
+ Electrocardiogram (ECG) captures the heart's electrical signals, offering +valuable information for diagnosing cardiac conditions. However, the scarcity +of labeled data makes it challenging to fully leverage supervised learning in +medical domain. Self-supervised learning (SSL) offers a promising solution, +enabling models to learn from unlabeled data and uncover meaningful patterns. +In this paper, we show that masked modeling in the latent space can be a +powerful alternative to existing self-supervised methods in the ECG domain. We +introduce ECG-JEPA, a SSL model for 12-lead ECG analysis that learns semantic +representations of ECG data by predicting in the hidden latent space, bypassing +the need to reconstruct raw signals. This approach offers several advantages in +the ECG domain: (1) it avoids producing unnecessary details, such as noise, +which is common in ECG; and (2) it addresses the limitations of na\"ive L2 loss +between raw signals. Another key contribution is the introduction of +Cross-Pattern Attention (CroPA), a specialized masked attention mechanism +tailored for 12-lead ECG data. ECG-JEPA is trained on the union of several open +ECG datasets, totaling approximately 180,000 samples, and achieves +state-of-the-art performance in various downstream tasks including ECG +classification and feature prediction. Our code is openly available at +https://github.com/sehunfromdaegu/ECG_JEPA. + +
+
+
+
+
+ + ♻ ☆ Simulation-based inference with scattering representations: scattering + is all you need NeurIPS + + +
+ We demonstrate the successful use of scattering representations without +further compression for simulation-based inference (SBI) with images (i.e. +field-level), illustrated with a cosmological case study. Scattering +representations provide a highly effective representational space for +subsequent learning tasks, although the higher dimensional compressed space +introduces challenges. We overcome these through spatial averaging, coupled +with more expressive density estimators. Compared to alternative methods, such +an approach does not require additional simulations for either training or +computing derivatives, is interpretable, and resilient to covariate shift. As +expected, we show that a scattering only approach extracts more information +than traditional second order summary statistics. + +
+
+ comment: 9 pages, 2 figures, accepted by NeurIPS workshop on Machine Learning + and the Physical Sciences +
+
+
+
+
+ + ♻ ☆ Revisiting MAE pre-training for 3D medical image segmentation + + +
+ Self-Supervised Learning (SSL) presents an exciting opportunity to unlock the +potential of vast, untapped clinical datasets, for various downstream +applications that suffer from the scarcity of labeled data. While SSL has +revolutionized fields like natural language processing and computer vision, its +adoption in 3D medical image computing has been limited by three key pitfalls: +Small pre-training dataset sizes, architectures inadequate for 3D medical image +analysis, and insufficient evaluation practices. In this paper, we address +these issues by i) leveraging a large-scale dataset of 39k 3D brain MRI volumes +and ii) using a Residual Encoder U-Net architecture within the state-of-the-art +nnU-Net framework. iii) A robust development framework, incorporating 5 +development and 8 testing brain MRI segmentation datasets, allowed +performance-driven design decisions to optimize the simple concept of Masked +Auto Encoders (MAEs) for 3D CNNs. The resulting model not only surpasses +previous SSL methods but also outperforms the strong nnU-Net baseline by an +average of approximately 3 Dice points setting a new state-of-the-art. Our code +and models are made available here. + +
+
+ comment: Arxiv Preprint. Revised and under review +
+
+
+
+
+ + ♻ ☆ Protecting Federated Learning from Extreme Model Poisoning Attacks via + Multidimensional Time Series Anomaly Detection + + +
+ Current defense mechanisms against model poisoning attacks in federated +learning (FL) systems have proven effective up to a certain threshold of +malicious clients. In this work, we introduce FLANDERS, a novel pre-aggregation +filter for FL resilient to large-scale model poisoning attacks, i.e., when +malicious clients far exceed legitimate participants. FLANDERS treats the +sequence of local models sent by clients in each FL round as a matrix-valued +time series. Then, it identifies malicious client updates as outliers in this +time series by comparing actual observations with estimates generated by a +matrix autoregressive forecasting model maintained by the server. Experiments +conducted in several non-iid FL setups show that FLANDERS significantly +improves robustness across a wide spectrum of attacks when paired with standard +and robust existing aggregation methods. + +
+
+
+
+
+ + ♻ ☆ Reliable Generation of Privacy-preserving Synthetic Electronic Health + Record Time Series via Diffusion Models + + +
+ Electronic Health Records (EHRs) are rich sources of patient-level data, +offering valuable resources for medical data analysis. However, privacy +concerns often restrict access to EHRs, hindering downstream analysis. Current +EHR de-identification methods are flawed and can lead to potential privacy +leakage. Additionally, existing publicly available EHR databases are limited, +preventing the advancement of medical research using EHR. This study aims to +overcome these challenges by generating realistic and privacy-preserving +synthetic electronic health records (EHRs) time series efficiently. We +introduce a new method for generating diverse and realistic synthetic EHR time +series data using Denoising Diffusion Probabilistic Models (DDPM). We conducted +experiments on six databases: Medical Information Mart for Intensive Care III +and IV (MIMIC-III/IV), the eICU Collaborative Research Database (eICU), and +non-EHR datasets on Stocks and Energy. We compared our proposed method with +eight existing methods. Our results demonstrate that our approach significantly +outperforms all existing methods in terms of data fidelity while requiring less +training effort. Additionally, data generated by our method yields a lower +discriminative accuracy compared to other baseline methods, indicating the +proposed method can generate data with less privacy risk. The proposed +diffusion-model-based method can reliably and efficiently generate synthetic +EHR time series, which facilitates the downstream medical data analysis. Our +numerical results show the superiority of the proposed method over all other +existing methods. + +
+
+
+
+
+ + ♻ ☆ Nonequilibrium physics of generative diffusion models + + +
+ Generative diffusion models apply the concept of Langevin dynamics in physics +to machine learning, attracting a lot of interest from engineering, statistics +and physics, but a complete picture about inherent mechanisms is still lacking. +In this paper, we provide a transparent physics analysis of diffusion models, +formulating the fluctuation theorem, entropy production, equilibrium measure, +and Franz-Parisi potential to understand the dynamic process and intrinsic +phase transitions. Our analysis is rooted in a path integral representation of +both forward and backward dynamics, and in treating the reverse diffusion +generative process as a statistical inference, where the time-dependent state +variables serve as quenched disorder akin to that in spin glass theory. Our +study thus links stochastic thermodynamics, statistical inference and geometry +based analysis together to yield a coherent picture about how the generative +diffusion models work. + +
+
+ comment: 26 pages, 11 figures, 31 refs +
+
+
+
+
+ + ♻ ☆ Understanding LLM Embeddings for Regression + + +
+ With the rise of large language models (LLMs) for flexibly processing +information as strings, a natural application is regression, specifically by +preprocessing string representations into LLM embeddings as downstream features +for metric prediction. In this paper, we provide one of the first comprehensive +investigations into embedding-based regression and demonstrate that LLM +embeddings as features can be better for high-dimensional regression tasks than +using traditional feature engineering. This regression performance can be +explained in part due to LLM embeddings over numeric data inherently preserving +Lipschitz continuity over the feature space. Furthermore, we quantify the +contribution of different model effects, most notably model size and language +understanding, which we find surprisingly do not always improve regression +performance. + +
+
+ comment: 16 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ Uncertainty quantification for fast reconstruction methods using + augmented equivariant bootstrap: Application to radio interferometry NeurIPS 2024 + + +
+ The advent of next-generation radio interferometers like the Square Kilometer +Array promises to revolutionise our radio astronomy observational capabilities. +The unprecedented volume of data these devices generate requires fast and +accurate image reconstruction algorithms to solve the ill-posed radio +interferometric imaging problem. Most state-of-the-art reconstruction methods +lack trustworthy and scalable uncertainty quantification, which is critical for +the rigorous scientific interpretation of radio observations. We propose an +unsupervised technique based on a conformalized version of a radio-augmented +equivariant bootstrapping method, which allows us to quantify uncertainties for +fast reconstruction methods. Noticeably, we rely on reconstructions from +ultra-fast unrolled algorithms. The proposed method brings more reliable +uncertainty estimations to our problem than existing alternatives. + +
+
+ comment: 14 pages, 7 figures. Accepted at the Machine Learning and the + Physical Sciences Workshop, NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Improved Multi-Task Brain Tumour Segmentation with Synthetic Data + Augmentation + + +
+ This paper presents the winning solution of task 1 and the third-placed +solution of task 3 of the BraTS challenge. The use of automated tools in +clinical practice has increased due to the development of more and more +sophisticated and reliable algorithms. However, achieving clinical standards +and developing tools for real-life scenarios is a major challenge. To this end, +BraTS has organised tasks to find the most advanced solutions for specific +purposes. In this paper, we propose the use of synthetic data to train +state-of-the-art frameworks in order to improve the segmentation of adult +gliomas in a post-treatment scenario, and the segmentation of meningioma for +radiotherapy planning. Our results suggest that the use of synthetic data leads +to more robust algorithms, although the synthetic data generation pipeline is +not directly suited to the meningioma task. In task 1, we achieved a DSC of +0.7900, 0.8076, 0.7760, 0.8926, 0.7874, 0.8938 and a HD95 of 35.63, 30.35, +44.58, 16.87, 38.19, 17.95 for ET, NETC, RC, SNFH, TC and WT, respectively and, +in task 3, we achieved a DSC of 0.801 and HD95 of 38.26, in the testing phase. +The code for these tasks is available at +https://github.com/ShadowTwin41/BraTS_2023_2024_solutions. + +
+
+
+
+
+ + ♻ ☆ Dual-Personalizing Adapter for Federated Foundation Models + + +
+ Recently, foundation models, particularly large language models (LLMs), have +demonstrated an impressive ability to adapt to various tasks by fine-tuning +diverse instruction data. Notably, federated foundation models (FedFM) emerge +as a privacy preservation method to fine-tune models collaboratively under +federated learning (FL) settings by leveraging many distributed datasets with +non-IID data. To alleviate communication and computation overhead, +parameter-efficient methods are introduced for efficiency, and some research +adapted personalization methods to FedFM for better user preferences alignment. +However, a critical gap in existing research is the neglect of test-time +distribution shifts in real-world applications, and conventional methods for +test-time distribution shifts in personalized FL are less effective for FedFM +due to their failure to adapt to complex distribution shift scenarios and the +requirement to train all parameters. To bridge this gap, we refine the setting +in FedFM, termed test-time personalization, which aims to learn personalized +federated foundation models on clients while effectively handling test-time +distribution shifts simultaneously. To address challenges in this setting, we +explore a simple yet effective solution, a Federated Dual-Personalizing Adapter +(FedDPA) architecture. By co-working with a foundation model, a global adapter +and a local adapter jointly tackle the test-time distribution shifts and +client-specific personalization. Additionally, we introduce an instance-wise +dynamic weighting mechanism that dynamically integrates the global and local +adapters for each test instance during inference, facilitating effective +test-time personalization. The effectiveness of the proposed method has been +evaluated on benchmark datasets across different NLP tasks. + +
+
+
+
+
+ + ♻ ☆ QFNN-FFD: Quantum Federated Neural Network for Financial Fraud Detection + + +
+ This study introduces the Quantum Federated Neural Network for Financial +Fraud Detection (QFNN-FFD), a cutting-edge framework merging Quantum Machine +Learning (QML) and quantum computing with Federated Learning (FL) for financial +fraud detection. Using quantum technologies' computational power and the robust +data privacy protections offered by FL, QFNN-FFD emerges as a secure and +efficient method for identifying fraudulent transactions within the financial +sector. Implementing a dual-phase training model across distributed clients +enhances data integrity and enables superior performance metrics, achieving +precision rates consistently above 95%. Additionally, QFNN-FFD demonstrates +exceptional resilience by maintaining an impressive 80% accuracy, highlighting +its robustness and readiness for real-world applications. This combination of +high performance, security, and robustness against noise positions QFNN-FFD as +a transformative advancement in financial technology solutions and establishes +it as a new benchmark for privacy-focused fraud detection systems. This +framework facilitates the broader adoption of secure, quantum-enhanced +financial services and inspires future innovations that could use QML to tackle +complex challenges in other areas requiring high confidentiality and accuracy. + +
+
+
+
+
+ + ♻ ☆ Image Statistics Predict the Sensitivity of Perceptual Quality Metrics + + +
+ Previously, Barlow and Attneave hypothesised a link between biological vision +and information maximisation. Following Shannon, information was defined using +the probability of natural images. Several physiological and psychophysical +phenomena have been derived from principles like info-max, efficient coding, or +optimal denoising. However, it remains unclear how this link is expressed in +mathematical terms from image probability. Classical derivations were subjected +to strong assumptions on the probability models and on the behaviour of the +sensors. Moreover, the direct evaluation of the hypothesis was limited by the +inability of classical image models to deliver accurate estimates of the +probability. Here, we directly evaluate image probabilities using a generative +model for natural images, and analyse how probability-related factors can be +combined to predict the sensitivity of state-of-the-art subjective image +quality metrics, a proxy for human perception. We use information theory and +regression analysis to find a simple model that when combining just two +probability-related factors achieves 0.77 correlation with subjective metrics. +This probability-based model is validated in two ways: through direct +comparison with the opinion of real observers in a subjective quality +experiment, and by reproducing basic trends of classical psychophysical facts +such as the Contrast Sensitivity Function, the Weber-law, and contrast masking. + +
+
+
+
+
+ + ♻ ☆ Self-Adaptive Quantum Kernel Principal Components Analysis for Compact + Readout of Chemiresistive Sensor Arrays + + +
+ The rapid growth of Internet of Things (IoT) devices necessitates efficient +data compression techniques to handle the vast amounts of data generated by +these devices. Chemiresistive sensor arrays (CSAs), a simple-to-fabricate but +crucial component in IoT systems, generate large volumes of data due to their +simultaneous multi-sensor operations. Classical principal component analysis +(cPCA) methods, a common solution to the data compression challenge, face +limitations in preserving critical information during dimensionality reduction. +In this study, we present self-adaptive quantum kernel (SAQK) PCA as a superior +alternative to enhance information retention. Our findings demonstrate that +SAQK PCA outperforms cPCA in various back-end machine-learning tasks, +especially in low-dimensional scenarios where access to quantum bits is +limited. These results highlight the potential of noisy intermediate-scale +quantum (NISQ) computers to revolutionize data processing in real-world IoT +applications by improving the efficiency and reliability of CSA data +compression and readout, despite the current constraints on qubit availability. + +
+
+ comment: Version 2 +
+
+
+
+
+ + ♻ ☆ Correction to "Wasserstein distance estimates for the distributions of + numerical approximations to ergodic stochastic differential equations" + + +
+ A method for analyzing non-asymptotic guarantees of numerical discretizations +of ergodic SDEs in Wasserstein-2 distance is presented by Sanz-Serna and +Zygalakis in ``Wasserstein distance estimates for the distributions of +numerical approximations to ergodic stochastic differential equations". They +analyze the UBU integrator which is strong order two and only requires one +gradient evaluation per step, resulting in desirable non-asymptotic guarantees, +in particular $\mathcal{O}(d^{1/4}\epsilon^{-1/2})$ steps to reach a distance +of $\epsilon > 0$ in Wasserstein-2 distance away from the target distribution. +However, there is a mistake in the local error estimates in Sanz-Serna and +Zygalakis (2021), in particular, a stronger assumption is needed to achieve +these complexity estimates. This note reconciles the theory with the dimension +dependence observed in practice in many applications of interest. + +
+
+ comment: 9 pages +
+
+
+
+
+ + ♻ ☆ MLLM-LLaVA-FL: Multimodal Large Language Model Assisted Federated + Learning WACV 2025 + + +
+ Previous studies on federated learning (FL) often encounter performance +degradation due to data heterogeneity among different clients. In light of the +recent advances in multimodal large language models (MLLMs), such as GPT-4v and +LLaVA, which demonstrate their exceptional proficiency in multimodal tasks, +such as image captioning and multimodal question answering. We introduce a +novel federated learning framework, named Multimodal Large Language Model +Assisted Federated Learning (MLLM-LLaVA-FL), which employs powerful MLLMs at +the server end to address the heterogeneous and long-tailed challenges. Owing +to the advanced cross-modality representation capabilities and the extensive +open-vocabulary prior knowledge of MLLMs, our framework is adept at harnessing +the extensive, yet previously underexploited, open-source data accessible from +websites and powerful server-side computational resources. Hence, the +MLLM-LLaVA-FL not only enhances the performance but also avoids increasing the +risk of privacy leakage and the computational burden on local devices, +distinguishing it from prior methodologies. Our framework has three key stages. +Initially, we conduct global visual-text pretraining of the model. This +pretraining is facilitated by utilizing the extensive open-source data +available online, with the assistance of MLLMs. Subsequently, the pretrained +model is distributed among various clients for local training. Finally, once +the locally trained models are transmitted back to the server, a global +alignment is carried out under the supervision of MLLMs to further enhance the +performance. Experimental evaluations on established benchmarks, show that our +framework delivers promising performance in the typical scenarios with data +heterogeneity and long-tail distribution across different clients in FL. + +
+
+ comment: Accepted to WACV 2025 +
+
+
+
+
+ + ♻ ☆ Strongly-polynomial time and validation analysis of policy gradient + methods + + +
+ This paper proposes a novel termination criterion, termed the advantage gap +function, for finite state and action Markov decision processes (MDP) and +reinforcement learning (RL). By incorporating this advantage gap function into +the design of step size rules and deriving a new linear rate of convergence +that is independent of the stationary state distribution of the optimal policy, +we demonstrate that policy gradient methods can solve MDPs in +strongly-polynomial time. To the best of our knowledge, this is the first time +that such strong convergence properties have been established for policy +gradient methods. Moreover, in the stochastic setting, where only stochastic +estimates of policy gradients are available, we show that the advantage gap +function provides close approximations of the optimality gap for each +individual state and exhibits a sublinear rate of convergence at every state. +The advantage gap function can be easily estimated in the stochastic case, and +when coupled with easily computable upper bounds on policy values, they provide +a convenient way to validate the solutions generated by policy gradient +methods. Therefore, our developments offer a principled and computable measure +of optimality for RL, whereas current practice tends to rely on +algorithm-to-algorithm or baselines comparisons with no certificate of +optimality. + +
+
+ comment: Add numerical experiments +
+
+
+
+
+ + ♻ ☆ BricksRL: A Platform for Democratizing Robotics and Reinforcement + Learning Research and Education with LEGO + + +
+ We present BricksRL, a platform designed to democratize access to robotics +for reinforcement learning research and education. BricksRL facilitates the +creation, design, and training of custom LEGO robots in the real world by +interfacing them with the TorchRL library for reinforcement learning agents. +The integration of TorchRL with the LEGO hubs, via Bluetooth bidirectional +communication, enables state-of-the-art reinforcement learning training on GPUs +for a wide variety of LEGO builds. This offers a flexible and cost-efficient +approach for scaling and also provides a robust infrastructure for +robot-environment-algorithm communication. We present various experiments +across tasks and robot configurations, providing built plans and training +results. Furthermore, we demonstrate that inexpensive LEGO robots can be +trained end-to-end in the real world to achieve simple tasks, with training +times typically under 120 minutes on a normal laptop. Moreover, we show how +users can extend the capabilities, exemplified by the successful integration of +non-LEGO sensors. By enhancing accessibility to both robotics and reinforcement +learning, BricksRL establishes a strong foundation for democratized robotic +learning in research and educational settings. + +
+
+
+
+
+ + ♻ ☆ CoMERA: Computing- and Memory-Efficient Training via Rank-Adaptive + Tensor Optimization + + +
+ Training large AI models such as LLMs and DLRMs costs massive GPUs and +computing time. The high training cost has become only affordable to big tech +companies, meanwhile also causing increasing concerns about the environmental +impact. This paper presents CoMERA, a Computing- and Memory-Efficient training +method via Rank-Adaptive tensor optimization. CoMERA achieves rank-adaptive +tensor-compressed (pre)-training via a multi-objective optimization formulation +and improves the training to provide both a high compression ratio and +excellent accuracy in the training process. Our optimized numerical computation +(e.g., optimized tensorized embedding and tensor-network contractions) and GPU +implementation eliminate part of the run-time overhead in the tensorized +training on GPU. This leads to, for the first time, $2-3\times$ speedup per +training epoch compared with standard training. CoMERA also outperforms the +recent GaLore in terms of both memory and computing efficiency. Specifically, +CoMERA is $2\times$ faster per training epoch and $9\times$ more +memory-efficient than GaLore on a tested six-encoder transformer with +single-batch training. Our method also shows $\sim 2\times$ speedup than +standard pre-training on a BERT-like code-generation LLM while achieving +$4.23\times$ compression ratio in pre-training. With further HPC optimization, +CoMERA may reduce the pre-training cost of many other LLMs. An implementation +of CoMERA is available at https://github.com/ziyangjoy/CoMERA. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Video-Driven Graph Network-Based Simulators + + +
+ Lifelike visualizations in design, cinematography, and gaming rely on precise +physics simulations, typically requiring extensive computational resources and +detailed physical input. This paper presents a method that can infer a system's +physical properties from a short video, eliminating the need for explicit +parameter input, provided it is close to the training condition. The learned +representation is then used within a Graph Network-based Simulator to emulate +the trajectories of physical systems. We demonstrate that the video-derived +encodings effectively capture the physical properties of the system and +showcase a linear dependence between some of the encodings and the system's +motion. + +
+
+
+
+
+ + ♻ ☆ Query-Guided Self-Supervised Summarization of Nursing Notes + + +
+ Nursing notes, an important part of Electronic Health Records (EHRs), track a +patient's health during a care episode. Summarizing key information in nursing +notes can help clinicians quickly understand patients' conditions. However, +existing summarization methods in the clinical setting, especially abstractive +methods, have overlooked nursing notes and require reference summaries for +training. We introduce QGSumm, a novel query-guided self-supervised domain +adaptation approach for abstractive nursing note summarization. The method uses +patient-related clinical queries for guidance, and hence does not need +reference summaries for training. Through automatic experiments and manual +evaluation by an expert clinician, we study our approach and other +state-of-the-art Large Language Models (LLMs) for nursing note summarization. +Our experiments show: 1) GPT-4 is competitive in maintaining information in the +original nursing notes, 2) QGSumm can generate high-quality summaries with a +good balance between recall of the original content and hallucination rate +lower than other top methods. Ultimately, our work offers a new perspective on +conditional text summarization, tailored to clinical applications. + +
+
+
+
+
+ + ♻ ☆ Artificial Intelligence Mangrove Monitoring System Based on Deep + Learning and Sentinel-2 Satellite Data in the UAE (2017-2024) + + +
+ Mangroves play a crucial role in maintaining coastal ecosystem health and +protecting biodiversity. Therefore, continuous mapping of mangroves is +essential for understanding their dynamics. Earth observation imagery typically +provides a cost-effective way to monitor mangrove dynamics. However, there is a +lack of regional studies on mangrove areas in the UAE. This study utilizes the +UNet++ deep learning model combined with Sentinel-2 multispectral data and +manually annotated labels to monitor the spatiotemporal dynamics of densely +distributed mangroves (coverage greater than 70%) in the UAE from 2017 to 2024, +achieving an mIoU of 87.8% on the validation set. Results show that the total +mangrove area in the UAE in 2024 was approximately 9,142.21 hectares, an +increase of 2,061.33 hectares compared to 2017, with carbon sequestration +increasing by approximately 194,383.42 tons, equivalent to fixing about +713,367.36 tons of carbon dioxide. Abu Dhabi has the largest mangrove area and +plays a dominant role in the UAE's mangrove growth, increasing by 1,855.6 +hectares between 2017-2024, while other emirates have also contributed to +mangrove expansion through stable and sustainable growth in mangrove areas. +This comprehensive growth pattern reflects the collective efforts of all +emirates in mangrove restoration. + +
+
+ comment: 17 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Dynamic Deep Learning Based Super-Resolution For The Shallow Water + Equations + + +
+ Using the nonlinear shallow water equations as benchmark, we demonstrate that +a simulation with the ICON-O ocean model with a 20km resolution that is +frequently corrected by a U-net-type neural network can achieve discretization +errors of a simulation with 10km resolution. The network, originally developed +for image-based super-resolution in post-processing, is trained to compute the +difference between solutions on both meshes and is used to correct the coarse +mesh every 12h. Our setup is the Galewsky test case, modeling transition of a +barotropic instability into turbulent flow. We show that the ML-corrected +coarse resolution run correctly maintains a balanced flow and captures the +transition to turbulence in line with the higher resolution simulation. After 8 +days of simulation, the $L_2$-error of the corrected run is similar to a +simulation run on the finer mesh. While mass is conserved in the corrected +runs, we observe some spurious generation of kinetic energy. + +
+
+ comment: 17 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ Enhancing Compositional Text-to-Image Generation with Reliable Random + Seeds + + +
+ Text-to-image diffusion models have demonstrated remarkable capability in +generating realistic images from arbitrary text prompts. However, they often +produce inconsistent results for compositional prompts such as "two dogs" or "a +penguin on the right of a bowl". Understanding these inconsistencies is crucial +for reliable image generation. In this paper, we highlight the significant role +of initial noise in these inconsistencies, where certain noise patterns are +more reliable for compositional prompts than others. Our analyses reveal that +different initial random seeds tend to guide the model to place objects in +distinct image areas, potentially adhering to specific patterns of camera +angles and image composition associated with the seed. To improve the model's +compositional ability, we propose a method for mining these reliable cases, +resulting in a curated training set of generated images without requiring any +manual annotation. By fine-tuning text-to-image models on these generated +images, we significantly enhance their compositional capabilities. For +numerical composition, we observe relative increases of 29.3% and 19.5% for +Stable Diffusion and PixArt-{\alpha}, respectively. Spatial composition sees +even larger gains, with 60.7% for Stable Diffusion and 21.1% for +PixArt-{\alpha}. + +
+
+
+
+
+ + ♻ ☆ Ensemble data assimilation to diagnose AI-based weather prediction + model: A case with ClimaX version 0.3.1 + + +
+ Artificial intelligence (AI)-based weather prediction research is growing +rapidly and has shown to be competitive with the advanced dynamic numerical +weather prediction models. However, research combining AI-based weather +prediction models with data assimilation remains limited partially because +long-term sequential data assimilation cycles are required to evaluate data +assimilation systems. This study proposes using ensemble data assimilation for +diagnosing AI-based weather prediction models, and marked the first successful +implementation of ensemble Kalman filter with AI-based weather prediction +models. Our experiments with an AI-based model ClimaX demonstrated that the +ensemble data assimilation cycled stably for the AI-based weather prediction +model using covariance inflation and localization techniques within the +ensemble Kalman filter. While ClimaX showed some limitations in capturing +flow-dependent error covariance compared to dynamical models, the AI-based +ensemble forecasts provided reasonable and beneficial error covariance in +sparsely observed regions. In addition, ensemble data assimilation revealed +that error growth based on ensemble ClimaX predictions was weaker than that of +dynamical NWP models, leading to higher inflation factors. A series of +experiments demonstrated that ensemble data assimilation can be used to +diagnose properties of AI weather prediction models such as physical +consistency and accurate error growth representation. + +
+
+
+
+
+ + ♻ ☆ Preserving Data Privacy for ML-driven Applications in Open Radio Access + Networks + + +
+ Deep learning offers a promising solution to improve spectrum access +techniques by utilizing data-driven approaches to manage and share limited +spectrum resources for emerging applications. For several of these +applications, the sensitive wireless data (such as spectrograms) are stored in +a shared database or multistakeholder cloud environment and are therefore prone +to privacy leaks. This paper aims to address such privacy concerns by examining +the representative case study of shared database scenarios in 5G Open Radio +Access Network (O-RAN) networks where we have a shared database within the +near-real-time (near-RT) RAN intelligent controller. We focus on securing the +data that can be used by machine learning (ML) models for spectrum sharing and +interference mitigation applications without compromising the model and network +performances. The underlying idea is to leverage a (i) Shuffling-based +learnable encryption technique to encrypt the data, following which, (ii) +employ a custom Vision transformer (ViT) as the trained ML model that is +capable of performing accurate inferences on such encrypted data. The paper +offers a thorough analysis and comparisons with analogous convolutional neural +networks (CNN) as well as deeper architectures (such as ResNet-50) as +baselines. Our experiments showcase that the proposed approach significantly +outperforms the baseline CNN with an improvement of 24.5% and 23.9% for the +percent accuracy and F1-Score respectively when operated on encrypted data. +Though deeper ResNet-50 architecture is obtained as a slightly more accurate +model, with an increase of 4.4%, the proposed approach boasts a reduction of +parameters by 99.32%, and thus, offers a much-improved prediction time by +nearly 60%. + +
+
+
+
+
+ + ♻ ☆ GLaPE: Gold Label-agnostic Prompt Evaluation and Optimization for Large + Language Model EMNLP 2024 + + +
+ Despite the rapid progress of large language models (LLMs), their task +performance remains sensitive to prompt design. Recent studies have explored +leveraging the LLM itself as an optimizer to identify optimal prompts that +maximize task accuracy. However, when evaluating prompts, such approaches +heavily rely on elusive manually annotated gold labels to calculate task +accuracy for each candidate prompt, which hinders the widespread implementation +and generality. To overcome the limitation, this work proposes a gold +label-agnostic prompt evaluation (GLaPE) to alleviate dependence on gold +labels. Motivated by the observed correlation between self-consistency and the +accuracy of the answer, we adopt self-consistency as the initial evaluation +score. Subsequently, we refine the scores of prompts producing identical +answers to be mutually consistent. Experimental results show that GLaPE +provides reliable evaluations uniform with accuracy, even in the absence of +gold labels. Moreover, on six popular reasoning tasks, our GLaPE-based prompt +optimization yields effective prompts comparable to accuracy-based ones. The +code is publicly available at https://github.com/thunderous77/GLaPE. + +
+
+ comment: EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ Learning Counterfactual Distributions via Kernel Nearest Neighbors + + +
+ Consider a setting with multiple units (e.g., individuals, cohorts, +geographic locations) and outcomes (e.g., treatments, times, items), where the +goal is to learn a multivariate distribution for each unit-outcome entry, such +as the distribution of a user's weekly spend and engagement under a specific +mobile app version. A common challenge is the prevalence of missing not at +random data, where observations are available only for certain unit-outcome +combinations and the observation availability can be correlated with the +properties of distributions themselves, i.e., there is unobserved confounding. +An additional challenge is that for any observed unit-outcome entry, we only +have a finite number of samples from the underlying distribution. We tackle +these two challenges by casting the problem into a novel distributional matrix +completion framework and introduce a kernel based distributional generalization +of nearest neighbors to estimate the underlying distributions. By leveraging +maximum mean discrepancies and a suitable factor model on the kernel mean +embeddings of the underlying distributions, we establish consistent recovery of +the underlying distributions even when data is missing not at random and +positivity constraints are violated. Furthermore, we demonstrate that our +nearest neighbors approach is robust to heteroscedastic noise, provided we have +access to two or more measurements for the observed unit-outcome entries, a +robustness not present in prior works on nearest neighbors with single +measurements. + +
+
+ comment: 39 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Recurrences reveal shared causal drivers of complex time series + + +
+ Unmeasured causal forces influence diverse experimental time series, such as +the transcription factors that regulate genes, or the descending neurons that +steer motor circuits. Combining the theory of skew-product dynamical systems +with topological data analysis, we show that simultaneous recurrence events +across multiple time series reveal the structure of their shared unobserved +driving signal. We introduce a physics-based unsupervised learning algorithm +that reconstructs causal drivers by iteratively building a recurrence graph +with glass-like structure. As the amount of data increases, a percolation +transition on this graph leads to weak ergodicity breaking for random walks -- +revealing the shared driver's dynamics, even from strongly-corrupted +measurements. We relate reconstruction accuracy to the rate of information +transfer from a chaotic driver to the response systems, and we find that +effective reconstruction proceeds through gradual approximation of the driver's +dynamical attractor. Through extensive benchmarks against classical signal +processing and machine learning techniques, we demonstrate our method's ability +to extract causal drivers from diverse experimental datasets spanning ecology, +genomics, fluid dynamics, and physiology. + +
+
+ comment: Physical Review X (to appear). Code available online at + https://github.com/williamgilpin/shrec +
+
+
+
+
+ + ♻ ☆ Stock Movement Prediction with Multimodal Stable Fusion via Gated + Cross-Attention Mechanism + + +
+ The accurate prediction of stock movements is crucial for investment +strategies. Stock prices are subject to the influence of various forms of +information, including financial indicators, sentiment analysis, news +documents, and relational structures. Predominant analytical approaches, +however, tend to address only unimodal or bimodal sources, neglecting the +complexity of multimodal data. Further complicating the landscape are the +issues of data sparsity and semantic conflicts between these modalities, which +are frequently overlooked by current models, leading to unstable performance +and limiting practical applicability. To address these shortcomings, this study +introduces a novel architecture, named Multimodal Stable Fusion with Gated +Cross-Attention (MSGCA), designed to robustly integrate multimodal input for +stock movement prediction. The MSGCA framework consists of three integral +components: (1) a trimodal encoding module, responsible for processing +indicator sequences, dynamic documents, and a relational graph, and +standardizing their feature representations; (2) a cross-feature fusion module, +where primary and consistent features guide the multimodal fusion of the three +modalities via a pair of gated cross-attention networks; and (3) a prediction +module, which refines the fused features through temporal and dimensional +reduction to execute precise movement forecasting. Empirical evaluations +demonstrate that the MSGCA framework exceeds current leading methods, achieving +performance gains of 8.1%, 6.1%, 21.7% and 31.6% on four multimodal datasets, +respectively, attributed to its enhanced multimodal fusion stability. + +
+
+ comment: 14 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Unveiling and Mitigating Bias in Large Language Model Recommendations: A + Path to Fairness + + +
+ Large language model (LLM)-based recommendation systems excel in delivering
+comprehensive suggestions by deeply analyzing content and
+user behavior. However, they often inherit biases from skewed training data,
+favoring mainstream content while underrepresenting diverse or non-traditional
+options. This study explores the interplay between bias and LLM-based
+recommendation systems, focusing on music, song, and book recommendations
+across diverse demographic and cultural groups. This paper analyzes bias in
+LLM-based recommendation systems across multiple models (GPT, LLaMA, and
+Gemini), revealing its deep and pervasive impact on outcomes. Intersecting
+identities and contextual factors, like socioeconomic status, further amplify
+biases, complicating fair recommendations across diverse groups. Our findings
+reveal that bias in these systems is deeply ingrained, yet even simple
+interventions like prompt engineering can significantly reduce it. We further
+propose a retrieval-augmented generation strategy to mitigate bias more
+effectively. Numerical experiments validate these strategies, demonstrating
+both the pervasive nature of bias and the impact of the proposed solutions.
+
</p>
</li>
+
+
+
+
+ + ♻ ☆ Variational autoencoders with latent high-dimensional steady geometric + flows for dynamics + + +
+ We develop Riemannian approaches to variational autoencoders (VAEs) for +PDE-type ambient data with regularizing geometric latent dynamics, which we +refer to as VAE-DLM, or VAEs with dynamical latent manifolds. We redevelop the +VAE framework such that manifold geometries, subject to our geometric flow, +embedded in Euclidean space are learned in the intermediary latent space +developed by encoders and decoders. By tailoring the geometric flow in which +the latent space evolves, we induce latent geometric properties of our +choosing, which are reflected in empirical performance. We reformulate the +traditional evidence lower bound (ELBO) loss with a considerate choice of +prior. We develop a linear geometric flow with a steady-state regularizing +term. This flow requires only automatic differentiation of one time derivative, +and can be solved in moderately high dimensions in a physics-informed approach, +allowing more expressive latent representations. We discuss how this flow can +be formulated as a gradient flow, and maintains entropy away from metric +singularity. This, along with an eigenvalue penalization condition, helps +ensure the manifold is sufficiently large in measure, nondegenerate, and a +canonical geometry, which contribute to a robust representation. Our methods +focus on the modified multi-layer perceptron architecture with tanh activations +for the manifold encoder-decoder. We demonstrate, on our datasets of interest, +our methods perform at least as well as the traditional VAE, and oftentimes +better. Our methods can outperform this and a VAE endowed with our proposed +architecture by up to 25% reduction in out-of-distribution (OOD) error and +potentially greater. We highlight our method on ambient PDEs whose solutions +maintain minimal variation in late times. We provide empirical justification +towards how we can improve robust learning for external dynamics with VAEs. + +
+
+ comment: Minor fixes; added details to proofs in the appendix +
+
+
+
+
+ + ♻ ☆ T2Vid: Translating Long Text into Multi-Image is the Catalyst for + Video-LLMs + + +
+ The success of Multimodal Large Language Models (MLLMs) in the image domain +has garnered wide attention from the research community. Drawing on previous +successful experiences, researchers have recently explored extending the +success to the video understanding realms. Apart from training from scratch, an +efficient way is to utilize the pre-trained image-LLMs, leading to two +mainstream approaches, i.e. zero-shot inference and further fine-tuning with +video data. In this work, our study of these approaches harvests an effective +data augmentation method. We first make a deeper inspection of the zero-shot +inference way and identify two limitations, i.e. limited generalization and +lack of temporal understanding capabilities. Thus, we further investigate the +fine-tuning approach and find a low learning efficiency when simply using all +the video data samples, which can be attributed to a lack of instruction +diversity. Aiming at this issue, we develop a method called T2Vid to synthesize +video-like samples to enrich the instruction diversity in the training corpus. +Integrating these data enables a simple and efficient training scheme, which +achieves performance comparable to or even superior to using full video +datasets by training with just 15% the sample size. Meanwhile, we find that the +proposed scheme can boost the performance of long video understanding without +training with long video samples. We hope our study will spark more thinking +about using MLLMs for video understanding and curation of high-quality data. +The code is released at https://github.com/xjtupanda/T2Vid. + +
+
+ comment: Project page: https://github.com/xjtupanda/T2Vid +
+
+
+
+
+ + ♻ ☆ LoRA Soups: Merging LoRAs for Practical Skill Composition Tasks COLING 2025 + + +
+ Low-Rank Adaptation (LoRA) is a popular technique for parameter-efficient +fine-tuning of Large Language Models (LLMs). We study how different LoRA +modules can be merged to achieve skill composition -- testing the performance +of the merged model on a target task that involves combining multiple skills, +each skill coming from a single LoRA. This setup is favorable when it is +difficult to obtain training data for the target task and when it can be +decomposed into multiple skills. First, we identify practically occurring +use-cases that can be studied under the realm of skill composition, e.g. +solving hard math-word problems with code, creating a bot to answer questions +on proprietary manuals or about domain-specialized corpora. Our main +contribution is to show that concatenation of LoRAs (CAT), which optimally +weights LoRAs that were individually trained on different skills, outperforms +existing model- and data- merging techniques; for instance on math-word +problems, CAT beats these methods by an average of 43% and 12% respectively. +Thus, this paper advocates model merging as an efficient way to solve +compositional tasks and underscores CAT as a simple, compute-friendly and +effective procedure. To our knowledge, this is the first work demonstrating the +superiority of model merging over data mixing for binary skill composition +tasks. Code and data are available at https://github.com/aksh555/LoRA-Soups + +
+
+ comment: COLING 2025 Industry track; 9 pages plus references and appendices +
+
+
+
+
+ + ♻ ☆ Universal on-chip polarization handling with deep photonic networks + + +
+ We propose a novel design paradigm for arbitrarily capable deep photonic +networks of cascaded Mach-Zehnder Interferometers (MZIs) for on-chip universal +polarization handling. Using a device architecture made of cascaded +Mach-Zehnder interferometers, we modify and train the phase difference between +interferometer arms for both polarizations through wide operation bandwidths. +Three proof-of-concept polarization handling devices are illustrated using a +software-defined, physics-informed neural framework, to achieve user-specified +target device responses as functions of polarization and wavelength. These +devices include a polarization splitter, a polarization-independent power +splitter, and an arbitrary polarization-dependent splitter to illustrate the +capabilities of the design framework. The performance for all three devices is +optimized using transfer matrix calculations; and their final responses are +verified through 3D-FDTD simulations. All devices demonstrate state-of-the-art +performance metrics with over 20 dB extinction, and flat-top transmission bands +through bandwidths of 120 nm. In addition to the functional diversity enabled, +the optimization for each device is completed in under a minute, highlighting +the computational efficiency of the design paradigm presented. These results +demonstrate the versatility of the deep photonic network design ecosystem in +polarization management, unveiling promising prospects for advanced on-chip +applications in optical communications, sensing, and computing. + +
+
+
+
+
+ + ♻ ☆ Critical Tokens Matter: Token-Level Contrastive Estimation Enhances + LLM's Reasoning Capability + + +
+ Large Language Models (LLMs) have exhibited remarkable performance on
+reasoning tasks. They utilize autoregressive token generation to construct
+reasoning trajectories, enabling the development of a coherent chain of
+thought. In this work, we explore the impact of individual tokens on the final
+outcomes of reasoning tasks. We identify the existence of ``critical tokens''
+that lead to incorrect reasoning trajectories in LLMs. Specifically, we find
+that LLMs tend to produce positive outcomes when forced to decode other tokens
+instead of critical tokens. Motivated by this observation, we propose a novel
+approach - cDPO - designed to automatically recognize and conduct token-level
+rewards for the critical tokens during the alignment process. Specifically, we
+develop a contrastive estimation approach to automatically identify critical
+tokens. It is achieved by comparing the generation likelihood of positive and
+negative models. To achieve this, we separately fine-tune the positive and
+negative models on various reasoning trajectories; consequently, they are
+capable of identifying critical tokens within incorrect trajectories
+that contribute to erroneous outcomes. Moreover, to further align the model
+with the critical token information during the alignment process, we extend the
+conventional DPO algorithms to token-level DPO and utilize the differential
+likelihood from the aforementioned positive and negative model as important
+weight for token-level DPO learning. Experimental results on GSM8K and MATH500
+benchmarks with two widely used models Llama-3 (8B and 70B) and deepseek-math
+(7B) demonstrate the effectiveness of the proposed approach cDPO.
+
</p>
</li>
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ Fighting Bias with Bias: A Machine Learning Approach to Assess Human + Bias + + +
+ Biased human decisions have consequential impacts across various domains, +yielding unfair treatment of individuals and resulting in suboptimal outcomes +for organizations and society. In recognition of this fact, organizations +regularly design and deploy interventions aimed at mitigating these biases. +However, measuring human decision biases remains an important but elusive task. +Organizations are frequently concerned with mistaken decisions +disproportionately affecting one group. In practice, however, this is typically +not possible to assess due to the scarcity of a gold standard: a label that +indicates what the correct decision would have been. In this work, we propose a +machine learning-based framework to assess bias in human-generated decisions +when gold standard labels are scarce. We provide theoretical guarantees and +empirical evidence demonstrating the superiority of our method over existing +alternatives. This proposed methodology establishes a foundation for +transparency in human decision-making, carrying substantial implications for +managerial duties, and offering potential for alleviating algorithmic biases +when human decisions are used as labels to train algorithms. + +
+
+
+
+
+ + ♻ ☆ GPU-Accelerated Counterfactual Regret Minimization + + +
+ Counterfactual regret minimization is a family of algorithms of no-regret +learning dynamics capable of solving large-scale imperfect information games. +We propose implementing this algorithm as a series of dense and sparse matrix +and vector operations, thereby making it highly parallelizable for a graphical +processing unit, at a cost of higher memory usage. Our experiments show that +our implementation performs up to about 401.2 times faster than OpenSpiel's +Python implementation and, on an expanded set of games, up to about 203.6 times +faster than OpenSpiel's C++ implementation and the speedup becomes more +pronounced as the size of the game being solved grows. + +
+
+
+
+
+ + ♻ ☆ Realizable Continuous-Space Shields for Safe Reinforcement Learning + + +
+ While Deep Reinforcement Learning (DRL) has achieved remarkable success +across various domains, it remains vulnerable to occasional catastrophic +failures without additional safeguards. An effective solution to prevent these +failures is to use a shield that validates and adjusts the agent's actions to +ensure compliance with a provided set of safety specifications. For real-world +robotic domains, it is essential to define safety specifications over +continuous state and action spaces to accurately account for system dynamics +and compute new actions that minimally deviate from the agent's original +decision. In this paper, we present the first shielding approach specifically +designed to ensure the satisfaction of safety requirements in continuous state +and action spaces, making it suitable for practical robotic applications. Our +method builds upon realizability, an essential property that confirms the +shield will always be able to generate a safe action for any state in the +environment. We formally prove that realizability can be verified for stateful +shields, enabling the incorporation of non-Markovian safety requirements, such +as loop avoidance. Finally, we demonstrate the effectiveness of our approach in +ensuring safety without compromising the policy's success rate by applying it +to a navigation problem and a multi-agent particle environment. + +
+
+ comment: Kim, Corsi, and Rodriguez contributed equally +
+
+
+
+
+ + ♻ ☆ Towards Understanding Domain Adapted Sentence Embeddings for Document + Retrieval + + +
+ A plethora of sentence embedding models makes it challenging to choose one, +especially for technical domains rich with specialized vocabulary. In this +work, we domain adapt embeddings using telecom, health and science datasets for +question answering. We evaluate embeddings obtained from publicly available +models and their domain-adapted variants, on both point retrieval accuracies, +as well as their (95\%) confidence intervals. We establish a systematic method +to obtain thresholds for similarity scores for different embeddings. As +expected, we observe that fine-tuning improves mean bootstrapped accuracies. We +also observe that it results in tighter confidence intervals, which further +improve when pre-training is preceded by fine-tuning. We introduce metrics +which measure the distributional overlaps of top-$K$, correct and random +document similarities with the question. Further, we show that these metrics +are correlated with retrieval accuracy and similarity thresholds. Recent +literature shows conflicting effects of isotropy on retrieval accuracies. Our +experiments establish that the isotropy of embeddings (as measured by two +independent state-of-the-art isotropy metric definitions) is poorly correlated +with retrieval performance. We show that embeddings for domain-specific +sentences have little overlap with those for domain-agnostic ones, and +fine-tuning moves them further apart. Based on our results, we provide +recommendations for use of our methodology and metrics by researchers and +practitioners. + +
+
+
+
+
+ + ♻ ☆ TTSDS -- Text-to-Speech Distribution Score + + +
+ Many recently published Text-to-Speech (TTS) systems produce audio close to +real speech. However, TTS evaluation needs to be revisited to make sense of the +results obtained with the new architectures, approaches and datasets. We +propose evaluating the quality of synthetic speech as a combination of multiple +factors such as prosody, speaker identity, and intelligibility. Our approach +assesses how well synthetic speech mirrors real speech by obtaining correlates +of each factor and measuring their distance from both real speech datasets and +noise datasets. We benchmark 35 TTS systems developed between 2008 and 2024 and +show that our score computed as an unweighted average of factors strongly +correlates with the human evaluations from each time period. + +
+
+ comment: SLT 2024 +
+
+
+
+
+ + ♻ ☆ Provable Acceleration of Nesterov's Accelerated Gradient for Rectangular + Matrix Factorization and Linear Neural Networks + + +
+ We study the convergence rate of first-order methods for rectangular matrix +factorization, which is a canonical nonconvex optimization problem. +Specifically, given a rank-$r$ matrix $\mathbf{A}\in\mathbb{R}^{m\times n}$, we +prove that gradient descent (GD) can find a pair of $\epsilon$-optimal +solutions $\mathbf{X}_T\in\mathbb{R}^{m\times d}$ and +$\mathbf{Y}_T\in\mathbb{R}^{n\times d}$, where $d\geq r$, satisfying +$\lVert\mathbf{X}_T\mathbf{Y}_T^\top-\mathbf{A}\rVert_\mathrm{F}\leq\epsilon\lVert\mathbf{A}\rVert_\mathrm{F}$ +in $T=O(\kappa^2\log\frac{1}{\epsilon})$ iterations with high probability, +where $\kappa$ denotes the condition number of $\mathbf{A}$. Furthermore, we +prove that Nesterov's accelerated gradient (NAG) attains an iteration +complexity of $O(\kappa\log\frac{1}{\epsilon})$, which is the best-known bound +of first-order methods for rectangular matrix factorization. Different from +small balanced random initialization in the existing literature, we adopt an +unbalanced initialization, where $\mathbf{X}_0$ is large and $\mathbf{Y}_0$ is +$0$. Moreover, our initialization and analysis can be further extended to +linear neural networks, where we prove that NAG can also attain an accelerated +linear convergence rate. In particular, we only require the width of the +network to be greater than or equal to the rank of the output label matrix. In +contrast, previous results achieving the same rate require excessive widths +that additionally depend on the condition number and the rank of the input data +matrix. + +
+
+ comment: 30 pages (checklist included) +
+
+
+
+
+ + ♻ ☆ DisCoRD: Discrete Tokens to Continuous Motion via Rectified Flow + Decoding + + +
+ Human motion, inherently continuous and dynamic, presents significant +challenges for generative models. Despite their dominance, discrete +quantization methods, such as VQ-VAEs, suffer from inherent limitations, +including restricted expressiveness and frame-wise noise artifacts. Continuous +approaches, while producing smoother and more natural motions, often falter due +to high-dimensional complexity and limited training data. To resolve this +"discord" between discrete and continuous representations, we introduce +DisCoRD: Discrete Tokens to Continuous Motion via Rectified Flow Decoding, a +novel method that decodes discrete motion tokens into continuous motion through +rectified flow. By employing an iterative refinement process in the continuous +space, DisCoRD captures fine-grained dynamics and ensures smoother and more +natural motions. Compatible with any discrete-based framework, our method +enhances naturalness without compromising faithfulness to the conditioning +signals. Extensive evaluations demonstrate that DisCoRD achieves +state-of-the-art performance, with FID of 0.032 on HumanML3D and 0.169 on +KIT-ML. These results solidify DisCoRD as a robust solution for bridging the +divide between discrete efficiency and continuous realism. Our project page is +available at: https://whwjdqls.github.io/discord.github.io/. + +
+
+ comment: 20 pages 18 figures +
+
+
+
+
+ + ♻ ☆ A Statistical Framework of Watermarks for Large Language Models: Pivot, + Detection Efficiency and Optimal Rules + + +
+ Since ChatGPT was introduced in November 2022, embedding (nearly) +unnoticeable statistical signals into text generated by large language models +(LLMs), also known as watermarking, has been used as a principled approach to +provable detection of LLM-generated text from its human-written counterpart. In +this paper, we introduce a general and flexible framework for reasoning about +the statistical efficiency of watermarks and designing powerful detection +rules. Inspired by the hypothesis testing formulation of watermark detection, +our framework starts by selecting a pivotal statistic of the text and a secret +key -- provided by the LLM to the verifier -- to enable controlling the false +positive rate (the error of mistakenly detecting human-written text as +LLM-generated). Next, this framework allows one to evaluate the power of +watermark detection rules by obtaining a closed-form expression of the +asymptotic false negative rate (the error of incorrectly classifying +LLM-generated text as human-written). Our framework further reduces the problem +of determining the optimal detection rule to solving a minimax optimization +program. We apply this framework to two representative watermarks -- one of +which has been internally implemented at OpenAI -- and obtain several findings +that can be instrumental in guiding the practice of implementing watermarks. In +particular, we derive optimal detection rules for these watermarks under our +framework. These theoretically derived detection rules are demonstrated to be +competitive and sometimes enjoy a higher power than existing detection +approaches through numerical experiments. + +
+
+ comment: To appear in the Annals of Statistics +
+
+
+
+
+ + ♻ ☆ Improved AdaBoost for Virtual Reality Experience Prediction Based on + Long Short-Term Memory Network + + +
+ A classification prediction algorithm based on Long Short-Term Memory Network
+(LSTM)-improved AdaBoost is used to predict virtual reality (VR) user
+experience. The dataset is randomly divided into training and test sets in the
+ratio of 7:3. During the training process, the model's loss value decreases from
+0.65 to 0.31, which shows that the model gradually reduces the discrepancy
+between the prediction results and the actual labels, and improves the accuracy
+and generalisation ability. The final loss value of 0.31 indicates that the
+model fits the training data well, and is able to make predictions and
+classifications more accurately. The confusion matrix for the training set
+shows a total of 177 correct predictions and 52 incorrect predictions, with an
+accuracy of 77%, precision of 88%, recall of 77% and f1 score of 82%. The
+confusion matrix for the test set shows a total of 167 correct and 53 incorrect
+predictions with 75% accuracy, 87% precision, 57% recall and 69% f1 score. In
+summary, the classification prediction algorithm based on LSTM with improved
+AdaBoost shows good prediction ability for virtual reality user experience.
+This study is of great significance to enhance the application of virtual
+reality technology in user experience. By combining LSTM and AdaBoost
+algorithms, significant progress has been made in user experience prediction,
+which not only improves the accuracy and generalisation ability of the model,
+but also provides useful insights for related research in the field of virtual
+reality. This approach can help developers better understand user requirements,
+optimise virtual reality product design, and enhance user satisfaction,
+promoting the wide application of virtual reality technology in various fields.
+
</p>
</li>
+
+ comment: This work has been peer-reviewed in The 2nd International Conference + on Software Engineering and Machine Learning and published in Applied and + Computational Engineering, DOI: + https://doi.org/10.54254/2755-2721/77/20240678 +
+
+
+
+
+ + ♻ ☆ Evaluating LLMs for Hardware Design and Test + + +
+ Large Language Models (LLMs) have demonstrated capabilities for producing +code in Hardware Description Languages (HDLs). However, most of the focus +remains on their abilities to write functional code, not test code. The +hardware design process consists of both design and test, and so eschewing +validation and verification leaves considerable potential benefit unexplored, +given that a design and test framework may allow for progress towards full +automation of the digital design pipeline. In this work, we perform one of the +first studies exploring how a LLM can both design and test hardware modules +from provided specifications. Using a suite of 8 representative benchmarks, we +examined the capabilities and limitations of the state-of-the-art +conversational LLMs when producing Verilog for functional and verification +purposes. We taped out the benchmarks on a Skywater 130nm shuttle and received +the functional chip. + +
+
+
+
+
+ + ♻ ☆ Deep Dynamics: Vehicle Dynamics Modeling with a Physics-Constrained + Neural Network for Autonomous Racing + + +
+ Autonomous racing is a critical research area for autonomous driving, +presenting significant challenges in vehicle dynamics modeling, such as +balancing model precision and computational efficiency at high speeds +(>280km/h), where minor errors in modeling have severe consequences. Existing +physics-based models for vehicle dynamics require elaborate testing setups and +tuning, which are hard to implement, time-intensive, and cost-prohibitive. +Conversely, purely data-driven approaches do not generalize well and cannot +adequately ensure physical constraints on predictions. This paper introduces +Deep Dynamics, a physics-constrained neural network (PCNN) for vehicle dynamics +modeling of an autonomous racecar. It combines physics coefficient estimation +and dynamical equations to accurately predict vehicle states at high speeds and +includes a unique Physics Guard layer to ensure internal coefficient estimates +remain within their nominal physical ranges. Open-loop and closed-loop +performance assessments, using a physics-based simulator and full-scale +autonomous Indy racecar data, highlight Deep Dynamics as a promising approach +for modeling racecar vehicle dynamics. + +
+
+ comment: Published in the IEEE Robotics and Automation Letters and presented + at the IEEE International Conference on Intelligent Robots and Systems +
+
+
+
+
+ + ♻ ☆ Noisy Nonnegative Tucker Decomposition with Sparse Factors and Missing + Data + + +
+ Tensor decomposition is a powerful tool for extracting physically meaningful +latent factors from multi-dimensional nonnegative data, and has been an +increasing interest in a variety of fields such as image processing, machine +learning, and computer vision. In this paper, we propose a sparse nonnegative +Tucker decomposition and completion method for the recovery of underlying +nonnegative data under noisy observations. Here the underlying nonnegative data +tensor is decomposed into a core tensor and several factor matrices with all +entries being nonnegative and the factor matrices being sparse. The loss +function is derived by the maximum likelihood estimation of the noisy +observations, and the $\ell_0$ norm is employed to enhance the sparsity of the +factor matrices. We establish the error bound of the estimator of the proposed +model under generic noise scenarios, which is then specified to the +observations with additive Gaussian noise, additive Laplace noise, and Poisson +observations, respectively. Our theoretical results are better than those by +existing tensor-based or matrix-based methods. Moreover, the minimax lower +bounds are shown to be matched with the derived upper bounds up to logarithmic +factors. Numerical examples on both synthetic and real-world data sets +demonstrate the superiority of the proposed method for nonnegative tensor data +completion. + +
+
+
+
+
+ + ♻ ☆ A Block Coordinate Descent Method for Nonsmooth Composite Optimization + under Orthogonality Constraints + + +
+ Nonsmooth composite optimization with orthogonality constraints has a wide +range of applications in statistical learning and data science. However, this +problem is challenging due to its nonsmooth objective and computationally +expensive, non-convex constraints. In this paper, we propose a new approach +called \textbf{OBCD}, which leverages Block Coordinate Descent to address these +challenges. \textbf{OBCD} is a feasible method with a small computational +footprint. In each iteration, it updates $k$ rows of the solution matrix, where +$k \geq 2$, by globally solving a small nonsmooth optimization problem under +orthogonality constraints. We prove that the limiting points of \textbf{OBCD}, +referred to as (global) block-$k$ stationary points, offer stronger optimality +than standard critical points. Furthermore, we show that \textbf{OBCD} +converges to $\epsilon$-block-$k$ stationary points with an ergodic convergence +rate of $\mathcal{O}(1/\epsilon)$. Additionally, under the Kurdyka-Lojasiewicz +(KL) inequality, we establish the non-ergodic convergence rate of +\textbf{OBCD}. We also extend \textbf{OBCD} by incorporating breakpoint +searching methods for subproblem solving and greedy strategies for working set +selection. Comprehensive experiments demonstrate the superior performance of +our approach across various tasks. + +
+
+
+
+
+ + ♻ ☆ Enabling more efficient and cost-effective AI/ML systems with Collective + Mind, virtualized MLOps, MLPerf, Collective Knowledge Playground and + reproducible optimization tournaments + + +
+ This white paper introduces my educational community initiative to learn how +to run AI, ML and other emerging workloads in the most efficient and +cost-effective way across diverse models, data sets, software and hardware. +This project leverages Collective Mind (CM), virtualized MLOps and DevOps +(CM4MLOps), MLPerf benchmarks, and the Collective Knowledge playground (CK), +which I have developed in collaboration with the community and MLCommons. + I created Collective Mind as a small and portable Python package with minimal +dependencies, a unified CLI and Python API to help researchers and engineers +automate repetitive, tedious, and time-consuming tasks. I also designed CM as a +distributed framework, continuously enhanced by the community through the CM4* +repositories, which function as the unified interface for organizing and +managing various collections of automations and artifacts. For example, +CM4MLOps repository includes many automations, also known as CM scripts, to +streamline the process of building, running, benchmarking, and optimizing AI, +ML, and other workflows across ever-evolving models, data, and systems. + I donated CK, CM and CM4MLOps to MLCommons to foster collaboration between +academia and industry to learn how to co-design more efficient and +cost-effective AI systems while capturing and encoding knowledge within +Collective Mind, protecting intellectual property, enabling portable skills, +and accelerating the transition of the state-of-the-art research into +production. My ultimate goal is to collaborate with the community to complete +my two-decade journey toward creating self-optimizing software and hardware +that can automatically learn how to run any workload in the most efficient and +cost-effective manner based on user requirements and constraints such as cost, +latency, throughput, accuracy, power consumption, size, and other critical +factors. + +
+
+
+
+
+ + ♻ ☆ An Information Theoretic Approach to Machine Unlearning + + +
+ To comply with AI and data regulations, the need to forget private or +copyrighted information from trained machine learning models is increasingly +important. The key challenge in unlearning is forgetting the necessary data in +a timely manner, while preserving model performance. In this work, we address +the zero-shot unlearning scenario, whereby an unlearning algorithm must be able +to remove data given only a trained model and the data to be forgotten. We +explore unlearning from an information theoretic perspective, connecting the +influence of a sample to the information gain a model receives by observing it. +From this, we derive a simple but principled zero-shot unlearning method based +on the geometry of the model. Our approach takes the form of minimising the +gradient of a learned function with respect to a small neighbourhood around a +target forget point. This induces a smoothing effect, causing forgetting by +moving the boundary of the classifier. We explore the intuition behind why this +approach can jointly unlearn forget samples while preserving general model +performance through a series of low-dimensional experiments. We perform +extensive empirical evaluation of our method over a range of contemporary +benchmarks, verifying that our method is competitive with state-of-the-art +performance under the strict constraints of zero-shot unlearning. Code for the +project can be found at +https://github.com/jwf40/Information-Theoretic-Unlearning + +
+
+ comment: Updated, new low-dimensional experiments and updated perspective on + unlearning from an information theoretic view +
+
+
+
+
+
+
+
+ + Multimedia 8 + +
+
+
+ + ☆ HybridMQA: Exploring Geometry-Texture Interactions for Colored Mesh + Quality Assessment + + +
+ Mesh quality assessment (MQA) models play a critical role in the design, +optimization, and evaluation of mesh operation systems in a wide variety of +applications. Current MQA models, whether model-based methods using +topology-aware features or projection-based approaches working on rendered 2D +projections, often fail to capture the intricate interactions between texture +and 3D geometry. We introduce HybridMQA, a first-of-its-kind hybrid +full-reference colored MQA framework that integrates model-based and +projection-based approaches, capturing complex interactions between textural +information and 3D structures for enriched quality representations. Our method +employs graph learning to extract detailed 3D representations, which are then +projected to 2D using a novel feature rendering process that precisely aligns +them with colored projections. This enables the exploration of geometry-texture +interactions via cross-attention, producing comprehensive mesh quality +representations. Extensive experiments demonstrate HybridMQA's superior +performance across diverse datasets, highlighting its ability to effectively +leverage geometry-texture interactions for a thorough understanding of mesh +quality. Our implementation will be made publicly available. + +
+
+
+
+
+ + ☆ X-Prompt: Towards Universal In-Context Image Generation in + Auto-Regressive Vision Language Foundation Models + + +
+ In-context generation is a key component of large language models' (LLMs) +open-task generalization capability. By leveraging a few examples as context, +LLMs can perform both in-domain and out-of-domain tasks. Recent advancements in +auto-regressive vision-language models (VLMs) built upon LLMs have showcased +impressive performance in text-to-image generation. However, the potential of +in-context learning for general image generation tasks remains largely +unexplored. To address this, we introduce X-Prompt, a purely auto-regressive +large-vision language model designed to deliver competitive performance across +a wide range of both seen and unseen image generation tasks, all within a +unified in-context learning framework. X-Prompt incorporates a specialized +design that efficiently compresses valuable features from in-context examples, +supporting longer in-context token sequences and improving its ability to +generalize to unseen tasks. A unified training task for both text and image +prediction enables X-Prompt to handle general image generation with enhanced +task awareness from in-context examples. Extensive experiments validate the +model's performance across diverse seen image generation tasks and its capacity +to generalize to previously unseen tasks. + +
+
+ comment: code: https://github.com/SunzeY/X-Prompt +
+
+
+
+
+ + ☆ Divide-and-Conquer: Confluent Triple-Flow Network for RGB-T Salient + Object Detection + + +
+ RGB-Thermal Salient Object Detection aims to pinpoint prominent objects +within aligned pairs of visible and thermal infrared images. Traditional +encoder-decoder architectures, while designed for cross-modality feature +interactions, may not have adequately considered the robustness against noise +originating from defective modalities. Inspired by hierarchical human visual +systems, we propose the ConTriNet, a robust Confluent Triple-Flow Network +employing a Divide-and-Conquer strategy. Specifically, ConTriNet comprises +three flows: two modality-specific flows explore cues from RGB and Thermal +modalities, and a third modality-complementary flow integrates cues from both +modalities. ConTriNet presents several notable advantages. It incorporates a +Modality-induced Feature Modulator in the modality-shared union encoder to +minimize inter-modality discrepancies and mitigate the impact of defective +samples. Additionally, a foundational Residual Atrous Spatial Pyramid Module in +the separated flows enlarges the receptive field, allowing for the capture of +multi-scale contextual information. Furthermore, a Modality-aware Dynamic +Aggregation Module in the modality-complementary flow dynamically aggregates +saliency-related cues from both modality-specific flows. Leveraging the +proposed parallel triple-flow framework, we further refine saliency maps +derived from different flows through a flow-cooperative fusion strategy, +yielding a high-quality, full-resolution saliency map for the final prediction. +To evaluate the robustness and stability of our approach, we collect a +comprehensive RGB-T SOD benchmark, VT-IMAG, covering various real-world +challenging scenarios. Extensive experiments on public benchmarks and our +VT-IMAG dataset demonstrate that ConTriNet consistently outperforms +state-of-the-art competitors in both common and challenging scenarios. + +
+
+ comment: Accepted by IEEE TPAMI. Project page: + https://cser-tang-hao.github.io/contrinet.html +
+
+
+
+
+ + ☆ Long Video Diffusion Generation with Segmented Cross-Attention and + Content-Rich Video Data Curation + + +
+ We introduce Presto, a novel video diffusion model designed to generate +15-second videos with long-range coherence and rich content. Extending video +generation methods to maintain scenario diversity over long durations presents +significant challenges. To address this, we propose a Segmented Cross-Attention +(SCA) strategy, which splits hidden states into segments along the temporal +dimension, allowing each segment to cross-attend to a corresponding +sub-caption. SCA requires no additional parameters, enabling seamless +incorporation into current DiT-based architectures. To facilitate high-quality +long video generation, we build the LongTake-HD dataset, consisting of 261k +content-rich videos with scenario coherence, annotated with an overall video +caption and five progressive sub-captions. Experiments show that our Presto +achieves 78.5% on the VBench Semantic Score and 100% on the Dynamic Degree, +outperforming existing state-of-the-art video generation methods. This +demonstrates that our proposed Presto significantly enhances content richness, +maintains long-range coherence, and captures intricate textual details. More +details are displayed on our project page: https://presto-video.github.io/. + +
+
+
+
+
+ + ☆ Neuron Abandoning Attention Flow: Visual Explanation of Dynamics inside + CNN Models + + +
+ In this paper, we present a Neuron Abandoning Attention Flow (NAFlow) method +to address the open problem of visually explaining the attention evolution +dynamics inside CNNs when making their classification decisions. A novel +cascading neuron abandoning back-propagation algorithm is designed to trace +neurons in all layers of a CNN that involve in making its prediction to address +the problem of significant interference from abandoned neurons. Firstly, a +Neuron Abandoning Back-Propagation (NA-BP) module is proposed to generate +Back-Propagated Feature Maps (BPFM) by using the inverse function of the +intermediate layers of CNN models, on which the neurons not used for +decision-making are abandoned. Meanwhile, the cascading NA-BP modules calculate +the tensors of importance coefficients which are linearly combined with the +tensors of BPFMs to form the NAFlow. Secondly, to be able to visualize +attention flow for similarity metric-based CNN models, a new channel +contribution weights module is proposed to calculate the importance +coefficients via Jacobian Matrix. The effectiveness of the proposed NAFlow is +validated on nine widely-used CNN models for various tasks of general image +classification, contrastive learning classification, few-shot image +classification, and image retrieval. + +
+
+
+
+
+ + ☆ OmniFlow: Any-to-Any Generation with Multi-Modal Rectified Flows + + +
+ We introduce OmniFlow, a novel generative model designed for any-to-any +generation tasks such as text-to-image, text-to-audio, and audio-to-image +synthesis. OmniFlow advances the rectified flow (RF) framework used in +text-to-image models to handle the joint distribution of multiple modalities. +It outperforms previous any-to-any models on a wide range of tasks, such as +text-to-image and text-to-audio synthesis. Our work offers three key +contributions: First, we extend RF to a multi-modal setting and introduce a +novel guidance mechanism, enabling users to flexibly control the alignment +between different modalities in the generated outputs. Second, we propose a +novel architecture that extends the text-to-image MMDiT architecture of Stable +Diffusion 3 and enables audio and text generation. The extended modules can be +efficiently pretrained individually and merged with the vanilla text-to-image +MMDiT for fine-tuning. Lastly, we conduct a comprehensive study on the design +choices of rectified flow transformers for large-scale audio and text +generation, providing valuable insights into optimizing performance across +diverse modalities. The Code will be available at +https://github.com/jacklishufan/OmniFlows. + +
+
+ comment: 12 pages, 14 figures +
+
+
+
+
+ + ☆ FLOAT: Generative Motion Latent Flow Matching for Audio-driven Talking + Portrait + + +
+ With the rapid advancement of diffusion-based generative models, portrait +image animation has achieved remarkable results. However, it still faces +challenges in temporally consistent video generation and fast sampling due to +its iterative sampling nature. This paper presents FLOAT, an audio-driven +talking portrait video generation method based on flow matching generative +model. We shift the generative modeling from the pixel-based latent space to a +learned motion latent space, enabling efficient design of temporally consistent +motion. To achieve this, we introduce a transformer-based vector field +predictor with a simple yet effective frame-wise conditioning mechanism. +Additionally, our method supports speech-driven emotion enhancement, enabling a +natural incorporation of expressive motions. Extensive experiments demonstrate +that our method outperforms state-of-the-art audio-driven talking portrait +methods in terms of visual quality, motion fidelity, and efficiency. + +
+
+ comment: Project page: https://deepbrainai-research.github.io/float/ +
+
+
+
+
+ + ♻ ☆ DAE-Talker: High Fidelity Speech-Driven Talking Face Generation with + Diffusion Autoencoder + + +
+ While recent research has made significant progress in speech-driven talking +face generation, the quality of the generated video still lags behind that of +real recordings. One reason for this is the use of handcrafted intermediate +representations like facial landmarks and 3DMM coefficients, which are designed +based on human knowledge and are insufficient to precisely describe facial +movements. Additionally, these methods require an external pretrained model for +extracting these representations, whose performance sets an upper bound on +talking face generation. To address these limitations, we propose a novel +method called DAE-Talker that leverages data-driven latent representations +obtained from a diffusion autoencoder (DAE). DAE contains an image encoder that +encodes an image into a latent vector and a DDIM image decoder that +reconstructs the image from it. We train our DAE on talking face video frames +and then extract their latent representations as the training target for a +Conformer-based speech2latent model. This allows DAE-Talker to synthesize full +video frames and produce natural head movements that align with the content of +speech, rather than relying on a predetermined head pose from a template video. +We also introduce pose modelling in speech2latent for pose controllability. +Additionally, we propose a novel method for generating continuous video frames +with the DDIM image decoder trained on individual frames, eliminating the need +for modelling the joint distribution of consecutive frames directly. Our +experiments show that DAE-Talker outperforms existing popular methods in +lip-sync, video fidelity, and pose naturalness. We also conduct ablation +studies to analyze the effectiveness of the proposed techniques and demonstrate +the pose controllability of DAE-Talker. + +
+
+ comment: Accepted to ACM Multimedia 2023 +
+
+
+
+
+
+
+
+ + Genomics 3 + +
+
+
+ + ☆ Microbial Mat Metagenomes from Waikite Valley, Aotearoa New Zealand + + +
+ The rise of complex multicellular ecosystems in Neoproterozoic time was preceded +by a microbial Proterozoic biosphere, where productivity may have been largely +restricted to microbial mats made up of bacteria including oxygenic +photosynthetic Cyanobacteria, anoxygenic phototrophs, and heterotrophs. In +modern environments, analogous microbial mats can be found in restricted +environments such as carbonate tidal flats and terrestrial hot springs. Here, +we report metagenomic sequence data from an analog in the hot springs of +Waikite Valley, Aotearoa New Zealand, where carbon-rich, slightly-alkaline +geothermal waters support diverse phototrophic microbial mats. + The Waikite Valley hot spring in the Taupo Volcanic Zone of Aotearoa New +Zealand was sampled in duplicate at 8 points along a temperature gradient +transect of the outflow, from ~62 C (near the source) to ~37 C (~100 meters +downstream). ~686 Gb of shotgun metagenomic sequence data was generated by +Illumina Novaseq. Each sample was assembled using SPAdes, followed by binning +of metagenome-assembled genomes (MAGs) by MetaBAT. These data are useful for +the genomic analysis of novel phototrophic bacteria, as well as for ecological +comparisons between thermophilic communities with varying temperatures but +otherwise similar conditions. + +
+
+ comment: 55 pages, 1 table, 3 data sets +
+
+
+
+
+ + ☆ pasta: Pattern Analysis for Spatial Omics Data + + +
+ Spatial omics assays allow for the molecular characterisation of cells in +their spatial context. Notably, the two main technological streams, +imaging-based and high-throughput sequencing-based, can give rise to very +different data modalities. The characteristics of the two data types are well +known in adjacent fields such as spatial statistics as point patterns and +lattice data, and there is a wide range of tools available. This paper +discusses the application of spatial statistics to spatially-resolved omics +data and in particular, discusses various advantages, challenges, and nuances. +This work is accompanied by a vignette, pasta, that showcases the usefulness of +spatial statistics in biology using several R packages. + +
+
+
+
+
+ + ☆ SUICA: Learning Super-high Dimensional Sparse Implicit Neural + Representations for Spatial Transcriptomics + + +
+ Spatial Transcriptomics (ST) is a method that captures spatial gene +expression profiles within histological sections. The discrete spatial +distribution and the super-high dimensional sequencing results make ST data +challenging to be modeled effectively. In this paper, we manage to model ST in +a continuous and compact manner by the proposed tool, SUICA, empowered by the +great approximation capability of Implicit Neural Representations (INRs) that +can improve both the spatial resolution and the gene expression. Concretely +within the proposed SUICA, we incorporate a graph-augmented Autoencoder to +effectively model the context information of the unstructured spots and provide +informative embeddings that are structure-aware for spatial mapping. We also +tackle the extremely skewed distribution in a regression-by-classification +fashion and enforce classification-based loss functions for the optimization of +SUICA. By extensive experiments of a wide range of common ST platforms, SUICA +outperforms both conventional INR variants and SOTA methods for ST +super-resolution regarding numerical fidelity, statistical correlation, and +bio-conservation. The prediction by SUICA also showcases amplified gene +signatures that enrich the bio-conservation of the raw data and benefit +subsequent analysis. The code is available at https://github.com/Szym29/SUICA. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 38 + +
+
+
+ + ♻ ☆ Differentiable Inverse Rendering with Interpretable Basis BRDFs + + +
+ Inverse rendering seeks to reconstruct both geometry and spatially varying +BRDFs (SVBRDFs) from captured images. To address the inherent ill-posedness of +inverse rendering, basis BRDF representations are commonly used, modeling +SVBRDFs as spatially varying blends of a set of basis BRDFs. However, existing +methods often yield basis BRDFs that lack intuitive separation and have limited +scalability to scenes of varying complexity. In this paper, we introduce a +differentiable inverse rendering method that produces interpretable basis +BRDFs. Our approach models a scene using 2D Gaussians, where the reflectance of +each Gaussian is defined by a weighted blend of basis BRDFs. We efficiently +render an image from the 2D Gaussians and basis BRDFs using differentiable +rasterization and impose a rendering loss with the input images. During this +analysis-by-synthesis optimization process of differentiable inverse rendering, +we dynamically adjust the number of basis BRDFs to fit the target scene while +encouraging sparsity in the basis weights. This ensures that the reflectance of +each Gaussian is represented by only a few basis BRDFs. This approach enables +the reconstruction of accurate geometry and interpretable basis BRDFs that are +spatially separated. Consequently, the resulting scene representation, +comprising basis BRDFs and 2D Gaussians, supports physically-based novel-view +relighting and intuitive scene editing. + +
+
+ comment: This is a different paper from my previous paper "Differentiable + Point-based Inverse Rendering". It must not be removed automatically +
+
+
+
+
+ + ♻ ☆ Comprehensive framework for evaluation of deep neural networks in + detection and quantification of lymphoma from PET/CT images: clinical + insights, pitfalls, and observer agreement analyses + + +
+ This study addresses critical gaps in automated lymphoma segmentation from +PET/CT images, focusing on issues often overlooked in existing literature. +While deep learning has been applied for lymphoma lesion segmentation, few +studies incorporate out-of-distribution testing, raising concerns about model +generalizability across diverse imaging conditions and patient populations. We +highlight the need to compare model performance with expert human annotators, +including intra- and inter-observer variability, to understand task difficulty +better. Most approaches focus on overall segmentation accuracy but overlook +lesion-specific metrics important for precise lesion detection and disease +quantification.To address these gaps, we propose a clinically-relevant +framework for evaluating deep neural networks. Using this lesion-specific +evaluation, we assess the performance of four deep segmentation networks +(ResUNet, SegResNet, DynUNet, and SwinUNETR) across 611 cases from +multi-institutional datasets, covering various lymphoma subtypes and lesion +characteristics. Beyond standard metrics like the Dice similarity coefficient +(DSC), we evaluate clinical lesion measures and their prediction errors. We +also introduce detection criteria for lesion localization and propose a new +detection Criterion 3 based on metabolic characteristics. We show that networks +perform better on large, intense lesions with higher metabolic +activity.Finally, we compare network performance to expert human observers via +intra- and inter-observer variability analyses, demonstrating that network +errors closely resemble those made by experts. Some small, faint lesions remain +challenging for both humans and networks. This study aims to improve automated +lesion segmentation's clinical relevance, supporting better treatment decisions +for lymphoma patients. The code is available at: +https://github.com/microsoft/lymphoma-segmentation-dnn + +
+
+ comment: 32 pages, 15 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Right Place, Right Time! Generalizing ObjectNav to Dynamic Environments + with Portable Targets + + +
+ ObjectNav is a popular task in Embodied AI, where an agent navigates to a +target object in an unseen environment. Prior literature makes the assumption +of a static environment with stationary objects, which lacks realism. To +address this, we present a novel formulation to generalize ObjectNav to dynamic +environments with non-stationary objects, and refer to it as Portable ObjectNav +or P-ObjectNav. In our formulation, we first address several challenging issues +with dynamizing existing topological scene graphs by developing a novel method +that introduces multiple transition behaviors to portable objects in the scene. +We use this technique to dynamize Matterport3D, a popular simulator for +evaluating embodied tasks. We then present a benchmark for P-ObjectNav using a +combination of heuristic, reinforcement learning, and Large Language Model +(LLM)-based navigation approaches on the dynamized environment, while +introducing novel evaluation metrics tailored for our task. Our work +fundamentally challenges the "static-environment" notion of prior ObjectNav +work; the code and dataset for P-ObjectNav will be made publicly available to +foster research on embodied navigation in dynamic scenes. We provide an +anonymized repository for our code and dataset: +https://anonymous.4open.science/r/PObjectNav-1C6D. + +
+
+ comment: 19 +
+
+
+
+
+ + ♻ ☆ Combining Blockchain and Biometrics: A Survey on Technical Aspects and a + First Legal Analysis + + +
+ Biometric recognition as a unique, hard-to-forge, and efficient way of +identification and verification has become an indispensable part of the current +digital world. The fast evolution of this technology has been a strong +incentive for integrating it into many applications. Meanwhile, blockchain, the +very attractive decentralized ledger technology, has been widely received both +by the research and industry in the past years and it is being increasingly +deployed nowadays in many different applications, such as money transfer, IoT, +healthcare, or logistics. Recently, researchers have started to speculate what +would be the pros and cons and what would be the best applications when these +two technologies cross paths. This paper provides a survey of technical +literature research on the combination of blockchain and biometrics and +includes a first legal analysis of this integration to shed light on challenges +and potentials. While this combination is still in its infancy and a growing +body of literature discusses specific blockchain applications and solutions in +an advanced technological set-up, this paper presents a holistic understanding +of blockchain's applicability in the biometric sector. This study demonstrates +that combining blockchain and biometrics would be beneficial for novel +applications in biometrics such as the PKI mechanism, distributed trusted +service, and identity management. However, blockchain networks at their current +stage are not efficient and economical for real-time applications. From a legal +point of view, the allocation of accountability remains a main issue, while +other difficulties remain, such as conducting a proper Data Protection Impact +Assessment. Finally, it supplies technical and legal recommendations to reap +the benefits and mitigate the risks of the combination. + +
+
+
+
+
+ + ♻ ☆ SaFL: Sybil-aware Federated Learning with Application to Face + Recognition + + +
+ Federated Learning (FL) is a machine learning paradigm to conduct +collaborative learning among clients on a joint model. The primary goal is to +share clients' local training parameters with an integrating server while +preserving their privacy. This method permits to exploit the potential of +massive mobile users' data for the benefit of machine learning models' +performance while keeping sensitive data on local devices. On the downside, FL +raises security and privacy concerns that have just started to be studied. To +address some of the key threats in FL, researchers have proposed to use secure +aggregation methods (e.g. homomorphic encryption, secure multiparty +computation, etc.). These solutions improve some security and privacy metrics, +but at the same time bring about other serious threats such as poisoning +attacks, backdoor attacks, and free running attacks. This paper proposes a new +defense method against poisoning attacks in FL called SaFL (Sybil-aware +Federated Learning) that minimizes the effect of sybils with a novel +time-variant aggregation scheme. + +
+
+
+
+
+ + ♻ ☆ Document Haystacks: Vision-Language Reasoning Over Piles of 1000+ + Documents + + +
+ Large multimodal models (LMMs) have achieved impressive progress in +vision-language understanding, yet they face limitations in real-world +applications requiring complex reasoning over a large number of images. +Existing benchmarks for multi-image question-answering are limited in scope, +each question is paired with only up to 30 images, which does not fully capture +the demands of large-scale retrieval tasks encountered in the real-world +usages. To reduce these gaps, we introduce two document haystack benchmarks, +dubbed DocHaystack and InfoHaystack, designed to evaluate LMM performance on +large-scale visual document retrieval and understanding. Additionally, we +propose V-RAG, a novel, vision-centric retrieval-augmented generation (RAG) +framework that leverages a suite of multimodal vision encoders, each optimized +for specific strengths, and a dedicated question-document relevance module. +V-RAG sets a new standard, with a 9% and 11% improvement in Recall@1 on the +challenging DocHaystack-1000 and InfoHaystack-1000 benchmarks, respectively, +compared to the previous best baseline models. Additionally, integrating V-RAG +with LMMs enables them to efficiently operate across thousands of images, +yielding significant improvements on our DocHaystack and InfoHaystack +benchmarks. Our code and datasets are available at +https://github.com/Vision-CAIR/dochaystacks + +
+
+ comment: the correct arxiv version +
+
+
+
+
+ + ♻ ☆ Improving Shift Invariance in Convolutional Neural Networks with + Translation Invariant Polyphase Sampling WACV 2025 + + +
+ Downsampling operators break the shift invariance of convolutional neural +networks (CNNs) and this affects the robustness of features learned by CNNs +when dealing with even small pixel-level shift. Through a large-scale +correlation analysis framework, we study shift invariance of CNNs by inspecting +existing downsampling operators in terms of their maximum-sampling bias (MSB), +and find that MSB is negatively correlated with shift invariance. Based on this +crucial insight, we propose a learnable pooling operator called Translation +Invariant Polyphase Sampling (TIPS) and two regularizations on the intermediate +feature maps of TIPS to reduce MSB and learn translation-invariant +representations. TIPS can be integrated into any CNN and can be trained +end-to-end with marginal computational overhead. Our experiments demonstrate +that TIPS results in consistent performance gains in terms of accuracy, shift +consistency, and shift fidelity on multiple benchmarks for image classification +and semantic segmentation compared to previous methods and also leads to +improvements in adversarial and distributional robustness. TIPS results in the +lowest MSB compared to all previous methods, thus explaining our strong +empirical results. + +
+
+ comment: Accepted to WACV 2025 +
+
+
+
+
+ + ♻ ☆ Learning Transferable Features for Implicit Neural Representations + + +
+ Implicit neural representations (INRs) have demonstrated success in a variety +of applications, including inverse problems and neural rendering. An INR is +typically trained to capture one signal of interest, resulting in learned +neural features that are highly attuned to that signal. Assumed to be less +generalizable, we explore the aspect of transferability of such learned neural +features for fitting similar signals. We introduce a new INR training +framework, STRAINER that learns transferrable features for fitting INRs to new +signals from a given distribution, faster and with better reconstruction +quality. Owing to the sequential layer-wise affine operations in an INR, we +propose to learn transferable representations by sharing initial encoder layers +across multiple INRs with independent decoder layers. At test time, the learned +encoder representations are transferred as initialization for an otherwise +randomly initialized INR. We find STRAINER to yield extremely powerful +initialization for fitting images from the same domain and allow for $\approx ++10dB$ gain in signal quality early on compared to an untrained INR itself. +STRAINER also provides a simple way to encode data-driven priors in INRs. We +evaluate STRAINER on multiple in-domain and out-of-domain signal fitting tasks +and inverse problems and further provide detailed analysis and discussion on +the transferability of STRAINER's features. Our demo can be accessed at +https://colab.research.google.com/drive/1fBZAwqE8C_lrRPAe-hQZJTWrMJuAKtG2?usp=sharing . + +
+
+ comment: Project Website: https://kushalvyas.github.io/strainer.html +
+
+
+
+
+ + ♻ ☆ DUET: A Tuning-Free Device-Cloud Collaborative Parameters Generation + Framework for Efficient Device Model Generalization WWW'23 + + +
+ Device Model Generalization (DMG) is a practical yet under-investigated
+research topic for on-device machine learning applications. It aims to improve
+the generalization ability of pre-trained models when deployed on
+resource-constrained devices, such as improving the performance of pre-trained
+cloud models on smart mobiles. While quite a lot of works have investigated the
+data distribution shift across clouds and devices, most of them focus on model
+fine-tuning on personalized data for individual devices to facilitate DMG.
+Despite their promise, these approaches require on-device re-training, which
+is practically infeasible due to the overfitting problem and high time delay
+when performing gradient calculation on real-time data. In this paper, we argue
+that the computational cost brought by fine-tuning can be rather unnecessary.
+We consequently present a novel perspective to improving DMG without increasing
+computational cost, i.e., device-specific parameter generation which directly
+maps data distribution to parameters. Specifically, we propose an efficient
+Device-cloUd collaborative parametErs generaTion framework DUET. DUET is
+deployed on a powerful cloud server that only requires the low cost of
+forwarding propagation and low time delay of data transmission between the
+device and the cloud. By doing so, DUET can rehearse the device-specific model
+weight realizations conditioned on the personalized real-time data for an
+individual device. Importantly, our DUET elegantly connects the cloud and
+device as a 'duet' collaboration, frees the DMG from fine-tuning, and enables a
+faster and more accurate DMG paradigm. We conduct an extensive experimental
+study of DUET on three public datasets, and the experimental results confirm
+our framework's effectiveness and generalisability for different DMG tasks.
+
+
+
+ comment: Published on WWW'23: Proceedings of the ACM on Web Conference 2023 + (pp. 3077 - 3085) +
+
+
+
+
+ + ♻ ☆ GATE OpenING: A Comprehensive Benchmark for Judging Open-ended + Interleaved Image-Text Generation + + +
+ Multimodal Large Language Models (MLLMs) have made significant strides in
+visual understanding and generation tasks. However, generating interleaved
+image-text content remains a challenge, which requires integrated multimodal
+understanding and generation abilities. While the progress in unified models
+offers new solutions, existing benchmarks are insufficient for evaluating these
+methods due to data size and diversity limitations. To bridge this gap, we
+introduce GATE OpenING (OpenING), a comprehensive benchmark comprising 5,400
+high-quality human-annotated instances across 56 real-world tasks. OpenING
+covers diverse daily scenarios such as travel guide, design, and brainstorming,
+offering a robust platform for challenging interleaved generation methods. In
+addition, we present IntJudge, a judge model for evaluating open-ended
+multimodal generation methods. Trained with a novel data pipeline, our IntJudge
+achieves an agreement rate of 82.42% with human judgments, outperforming
+GPT-based evaluators by 11.34%. Extensive experiments on OpenING reveal that
+current interleaved generation methods still have substantial room for
+improvement. Key findings on interleaved image-text generation are further
+presented to guide the development of next-generation models. The OpenING is
+open-sourced at https://opening-benchmark.github.io.
+
+
+
+ comment: 53 pages, 19 figures +
+
+
+
+
+ + ♻ ☆ Towards Cross-View-Consistent Self-Supervised Surround Depth Estimation IROS2024 + + +
+ Depth estimation is a cornerstone for autonomous driving, yet acquiring +per-pixel depth ground truth for supervised learning is challenging. +Self-Supervised Surround Depth Estimation (SSSDE) from consecutive images +offers an economical alternative. While previous SSSDE methods have proposed +different mechanisms to fuse information across images, few of them explicitly +consider the cross-view constraints, leading to inferior performance, +particularly in overlapping regions. This paper proposes an efficient and +consistent pose estimation design and two loss functions to enhance cross-view +consistency for SSSDE. For pose estimation, we propose to use only front-view +images to reduce training memory and sustain pose estimation consistency. The +first loss function is the dense depth consistency loss, which penalizes the +difference between predicted depths in overlapping regions. The second one is +the multi-view reconstruction consistency loss, which aims to maintain +consistency between reconstruction from spatial and spatial-temporal contexts. +Additionally, we introduce a novel flipping augmentation to improve the +performance further. Our techniques enable a simple neural model to achieve +state-of-the-art performance on the DDAD and nuScenes datasets. Last but not +least, our proposed techniques can be easily applied to other methods. The code +will be made public. + +
+
+ comment: Accepted by IROS2024 +
+
+
+
+
+ + ♻ ☆ MiKASA: Multi-Key-Anchor & Scene-Aware Transformer for 3D Visual + Grounding CVPR 2024 + + +
+ 3D visual grounding involves matching natural language descriptions with +their corresponding objects in 3D spaces. Existing methods often face +challenges with accuracy in object recognition and struggle in interpreting +complex linguistic queries, particularly with descriptions that involve +multiple anchors or are view-dependent. In response, we present the MiKASA +(Multi-Key-Anchor Scene-Aware) Transformer. Our novel end-to-end trained model +integrates a self-attention-based scene-aware object encoder and an original +multi-key-anchor technique, enhancing object recognition accuracy and the +understanding of spatial relationships. Furthermore, MiKASA improves the +explainability of decision-making, facilitating error diagnosis. Our model +achieves the highest overall accuracy in the Referit3D challenge for both the +Sr3D and Nr3D datasets, particularly excelling by a large margin in categories +that require viewpoint-dependent descriptions. + +
+
+ comment: CVPR 2024 +
+
+
+
+
+ + ♻ ☆ DPA: Dual Prototypes Alignment for Unsupervised Adaptation of + Vision-Language Models WACV 2025 + + +
+ Vision-language models (VLMs), e.g., CLIP, have shown remarkable potential in +zero-shot image classification. However, adapting these models to new domains +remains challenging, especially in unsupervised settings where labeled data is +unavailable. Recent research has proposed pseudo-labeling approaches to adapt +CLIP in an unsupervised manner using unlabeled target data. Nonetheless, these +methods struggle due to noisy pseudo-labels resulting from the misalignment +between CLIP's visual and textual representations. This study introduces DPA, +an unsupervised domain adaptation method for VLMs. DPA introduces the concept +of dual prototypes, acting as distinct classifiers, along with the convex +combination of their outputs, thereby leading to accurate pseudo-label +construction. Next, it ranks pseudo-labels to facilitate robust self-training, +particularly during early training. Finally, it addresses visual-textual +misalignment by aligning textual prototypes with image prototypes to further +improve the adaptation performance. Experiments on 13 downstream vision tasks +demonstrate that DPA significantly outperforms zero-shot CLIP and the +state-of-the-art unsupervised adaptation baselines. + +
+
+ comment: Accepted at WACV 2025 +
+
+
+
+
+ + ♻ ☆ TED-VITON: Transformer-Empowered Diffusion Models for Virtual Try-On + + +
+ Recent advancements in Virtual Try-On (VTO) have demonstrated exceptional +efficacy in generating realistic images and preserving garment details, largely +attributed to the robust generative capabilities of text-to-image (T2I) +diffusion backbones. However, the T2I models that underpin these methods have +become outdated, thereby limiting the potential for further improvement in VTO. +Additionally, current methods face notable challenges in accurately rendering +text on garments without distortion and preserving fine-grained details, such +as textures and material fidelity. The emergence of Diffusion Transformer (DiT) +based T2I models has showcased impressive performance and offers a promising +opportunity for advancing VTO. Directly applying existing VTO techniques to +transformer-based T2I models is ineffective due to substantial architectural +differences, which hinder their ability to fully leverage the models' advanced +capabilities for improved text generation. To address these challenges and +unlock the full potential of DiT-based T2I models for VTO, we propose +TED-VITON, a novel framework that integrates a Garment Semantic (GS) Adapter +for enhancing garment-specific features, a Text Preservation Loss to ensure +accurate and distortion-free text rendering, and a constraint mechanism to +generate prompts by optimizing Large Language Model (LLM). These innovations +enable state-of-the-art (SOTA) performance in visual quality and text fidelity, +establishing a new benchmark for VTO task. Project page: +\url{https://zhenchenwan.github.io/TED-VITON/} + +
+
+ comment: Project page: \href{https://github.com/ZhenchenWan/TED-VITON}{this + URL} +
+
+
+
+
+ + ♻ ☆ MeshAnything V2: Artist-Created Mesh Generation With Adjacent Mesh + Tokenization + + +
+ Meshes are the de facto 3D representation in the industry but are +labor-intensive to produce. Recently, a line of research has focused on +autoregressively generating meshes. This approach processes meshes into a +sequence composed of vertices and then generates them vertex by vertex, similar +to how a language model generates text. These methods have achieved some +success but still struggle to generate complex meshes. One primary reason for +this limitation is their inefficient tokenization methods. To address this +issue, we introduce MeshAnything V2, an advanced mesh generation model designed +to create Artist-Created Meshes that align precisely with specified shapes. A +key innovation behind MeshAnything V2 is our novel Adjacent Mesh Tokenization +(AMT) method. Unlike traditional approaches that represent each face using +three vertices, AMT optimizes this by employing a single vertex wherever +feasible, effectively reducing the token sequence length by about half on +average. This not only streamlines the tokenization process but also results in +more compact and well-structured sequences, enhancing the efficiency of mesh +generation. With these improvements, MeshAnything V2 effectively doubles the +face limit compared to previous models, delivering superior performance without +increasing computational costs. We will make our code and models publicly +available. Project Page: https://buaacyw.github.io/meshanything-v2/ + +
+
+ comment: Project Page: https://buaacyw.github.io/meshanything-v2/ Github: + https://github.com/buaacyw/MeshAnythingV2 +
+
+
+
+
+ + ♻ ☆ Weakly-Supervised Semantic Segmentation with Image-Level Labels: from + Traditional Models to Foundation Models + + +
+ The rapid development of deep learning has driven significant progress in +image semantic segmentation - a fundamental task in computer vision. Semantic +segmentation algorithms often depend on the availability of pixel-level labels +(i.e., masks of objects), which are expensive, time-consuming, and +labor-intensive. Weakly-supervised semantic segmentation (WSSS) is an effective +solution to avoid such labeling. It utilizes only partial or incomplete +annotations and provides a cost-effective alternative to fully-supervised +semantic segmentation. In this journal, our focus is on the WSSS with +image-level labels, which is the most challenging form of WSSS. Our work has +two parts. First, we conduct a comprehensive survey on traditional methods, +primarily focusing on those presented at premier research conferences. We +categorize them into four groups based on where their methods operate: +pixel-wise, image-wise, cross-image, and external data. Second, we investigate +the applicability of visual foundation models, such as the Segment Anything +Model (SAM), in the context of WSSS. We scrutinize SAM in two intriguing +scenarios: text prompting and zero-shot learning. We provide insights into the +potential and challenges of deploying visual foundational models for WSSS, +facilitating future developments in this exciting research area. + +
+
+ comment: Accepted to ACM Computing Surveys +
+
+
+
+
+ + ♻ ☆ ReVersion: Diffusion-Based Relation Inversion from Images SIGGRAPH + + +
+ Diffusion models gain increasing popularity for their generative +capabilities. Recently, there have been surging needs to generate customized +images by inverting diffusion models from exemplar images, and existing +inversion methods mainly focus on capturing object appearances (i.e., the +"look"). However, how to invert object relations, another important pillar in +the visual world, remains unexplored. In this work, we propose the Relation +Inversion task, which aims to learn a specific relation (represented as +"relation prompt") from exemplar images. Specifically, we learn a relation +prompt with a frozen pre-trained text-to-image diffusion model. The learned +relation prompt can then be applied to generate relation-specific images with +new objects, backgrounds, and styles. + To tackle the Relation Inversion task, we propose the ReVersion Framework. +Specifically, we propose a novel "relation-steering contrastive learning" +scheme to steer the relation prompt towards relation-dense regions, and +disentangle it away from object appearances. We further devise "relation-focal +importance sampling" to emphasize high-level interactions over low-level +appearances (e.g., texture, color). To comprehensively evaluate this new task, +we contribute the ReVersion Benchmark, which provides various exemplar images +with diverse relations. Extensive experiments validate the superiority of our +approach over existing methods across a wide range of visual relations. Our +proposed task and method could be good inspirations for future research in +various domains like generative inversion, few-shot learning, and visual +relation detection. + +
+
+ comment: SIGGRAPH Asia (Conference Track) 2024, Project page: + https://ziqihuangg.github.io/projects/reversion.html Code: + https://github.com/ziqihuangg/ReVersion +
+
+
+
+
+ + ♻ What Matters When Repurposing Diffusion Models for General Dense + Perception Tasks? + + +
+ Extensive pre-training with large data is indispensable for downstream +geometry and semantic visual perception tasks. Thanks to large-scale +text-to-image (T2I) pretraining, recent works show promising results by simply +fine-tuning T2I diffusion models for dense perception tasks. However, several +crucial design decisions in this process still lack comprehensive +justification, encompassing the necessity of the multi-step stochastic +diffusion mechanism, training strategy, inference ensemble strategy, and +fine-tuning data quality. In this work, we conduct a thorough investigation +into critical factors that affect transfer efficiency and performance when +using diffusion priors. Our key findings are: 1) High-quality fine-tuning data +is paramount for both semantic and geometry perception tasks. 2) The stochastic +nature of diffusion models has a slightly negative impact on deterministic +visual perception tasks. 3) Apart from fine-tuning the diffusion model with +only latent space supervision, task-specific image-level supervision is +beneficial to enhance fine-grained details. These observations culminate in the +development of GenPercept, an effective deterministic one-step fine-tuning +paradigm tailed for dense visual perception tasks. Different from the previous +multi-step methods, our paradigm has a much faster inference speed, and can be +seamlessly integrated with customized perception decoders and loss functions +for image-level supervision, which is critical to improving the fine-grained +details of predictions. Comprehensive experiments on diverse dense visual +perceptual tasks, including monocular depth estimation, surface normal +estimation, image segmentation, and matting, are performed to demonstrate the +remarkable adaptability and effectiveness of our proposed method. + +
+
+ comment: Code is at: https://github.com/aim-uofa/GenPercept +
+
+
+
+
+ + ♻ ☆ Denoising-Contrastive Alignment for Continuous Sign Language Recognition + + +
+ Continuous sign language recognition (CSLR) aims to recognize signs in +untrimmed sign language videos to textual glosses. A key challenge of CSLR is +achieving effective cross-modality alignment between video and gloss sequences +to enhance video representation. However, current cross-modality alignment +paradigms often neglect the role of textual grammar to guide the video +representation in learning global temporal context, which adversely affects +recognition performance. To tackle this limitation, we propose a +Denoising-Contrastive Alignment (DCA) paradigm. DCA creatively leverages +textual grammar to enhance video representations through two complementary +approaches: modeling the instance correspondence between signs and glosses from +a discrimination perspective and aligning their global context from a +generative perspective. Specifically, DCA accomplishes flexible instance-level +correspondence between signs and glosses using a contrastive loss. Building on +this, DCA models global context alignment between the video and gloss sequences +by denoising the gloss representation from noise, guided by video +representation. Additionally, DCA introduces gradient modulation to optimize +the alignment and recognition gradients, ensuring a more effective learning +process. By integrating gloss-wise and global context knowledge, DCA +significantly enhances video representations for CSLR tasks. Experimental +results across public benchmarks validate the effectiveness of DCA and confirm +its video representation enhancement feasibility. + +
+
+ comment: 11 pages +
+
+
+
+
+ + ♻ ☆ Multi-Class Abnormality Classification in Video Capsule Endoscopy Using + Deep Learning + + +
+ This report outlines Team Seq2Cure's deep learning approach for the Capsule +Vision 2024 Challenge, leveraging an ensemble of convolutional neural networks +(CNNs) and transformer-based architectures for multi-class abnormality +classification in video capsule endoscopy frames. The dataset comprised over +50,000 frames from three public sources and one private dataset, labeled across +10 abnormality classes. To overcome the limitations of traditional CNNs in +capturing global context, we integrated CNN and transformer models within a +multi-model ensemble. Our approach achieved a balanced accuracy of 86.34 +percent and a mean AUC-ROC score of 0.9908 on the validation set, earning our +submission 5th place in the challenge. Code is available at +http://github.com/arnavs04/capsule-vision-2024 . + +
+
+
+
+
+ + ♻ ☆ Deep learning-driven pulmonary artery and vein segmentation reveals + demography-associated vasculature anatomical differences + + +
+ Pulmonary artery-vein segmentation is crucial for disease diagnosis and +surgical planning and is traditionally achieved by Computed Tomography +Pulmonary Angiography (CTPA). However, concerns regarding adverse health +effects from contrast agents used in CTPA have constrained its clinical +utility. In contrast, identifying arteries and veins using non-contrast CT, a +conventional and low-cost clinical examination routine, has long been +considered impossible. Here we propose a High-abundant Pulmonary Artery-vein +Segmentation (HiPaS) framework achieving accurate artery-vein segmentation on +both non-contrast CT and CTPA across various spatial resolutions. HiPaS first +performs spatial normalization on raw CT volumes via a super-resolution module, +and then iteratively achieves segmentation results at different branch levels +by utilizing the lower-level vessel segmentation as a prior for higher-level +vessel segmentation. We trained and validated HiPaS on our established +multi-centric dataset comprising 1,073 CT volumes with meticulous manual +annotations. Both quantitative experiments and clinical evaluation demonstrated +the superior performance of HiPaS, achieving an average dice score of 91.8% and +a sensitivity of 98.0%. Further experiments showed the non-inferiority of HiPaS +segmentation on non-contrast CT compared to segmentation on CTPA. Employing +HiPaS, we have conducted an anatomical study of pulmonary vasculature on 11,784 +participants in China (six sites), discovering a new association of pulmonary +vessel anatomy with sex, age, and disease states: vessel abundance suggests a +significantly higher association with females than males with slightly +decreasing with age, and is also influenced by certain diseases, under the +controlling of lung volumes. + +
+
+
+
+
+ + ♻ ☆ CoMM: A Coherent Interleaved Image-Text Dataset for Multimodal + Understanding and Generation + + +
+ Interleaved image-text generation has emerged as a crucial multimodal task, +aiming at creating sequences of interleaved visual and textual content given a +query. Despite notable advancements in recent multimodal large language models +(MLLMs), generating integrated image-text sequences that exhibit narrative +coherence and entity and style consistency remains challenging due to poor +training data quality. To address this gap, we introduce CoMM, a high-quality +Coherent interleaved image-text MultiModal dataset designed to enhance the +coherence, consistency, and alignment of generated multimodal content. +Initially, CoMM harnesses raw data from diverse sources, focusing on +instructional content and visual storytelling, establishing a foundation for +coherent and consistent content. To further refine the data quality, we devise +a multi-perspective filter strategy that leverages advanced pre-trained models +to ensure the development of sentences, consistency of inserted images, and +semantic alignment between them. Various quality evaluation metrics are +designed to prove the high quality of the filtered dataset. Meanwhile, +extensive few-shot experiments on various downstream tasks demonstrate CoMM's +effectiveness in significantly enhancing the in-context learning capabilities +of MLLMs. Moreover, we propose four new tasks to evaluate MLLMs' interleaved +generation abilities, supported by a comprehensive evaluation framework. We +believe CoMM opens a new avenue for advanced MLLMs with superior multimodal +in-context learning and understanding ability. + +
+
+ comment: 22 pages +
+
+
+
+
+ + ♻ ☆ PACA: Perspective-Aware Cross-Attention Representation for Zero-Shot + Scene Rearrangement WACV2025 + + +
+ Scene rearrangement, like table tidying, is a challenging task in robotic +manipulation due to the complexity of predicting diverse object arrangements. +Web-scale trained generative models such as Stable Diffusion can aid by +generating natural scenes as goals. To facilitate robot execution, object-level +representations must be extracted to match the real scenes with the generated +goals and to calculate object pose transformations. Current methods typically +use a multi-step design that involves separate models for generation, +segmentation, and feature encoding, which can lead to a low success rate due to +error accumulation. Furthermore, they lack control over the viewing +perspectives of the generated goals, restricting the tasks to 3-DoF settings. +In this paper, we propose PACA, a zero-shot pipeline for scene rearrangement +that leverages perspective-aware cross-attention representation derived from +Stable Diffusion. Specifically, we develop a representation that integrates +generation, segmentation, and feature encoding into a single step to produce +object-level representations. Additionally, we introduce perspective control, +thus enabling the matching of 6-DoF camera views and extending past approaches +that were limited to 3-DoF top-down views. The efficacy of our method is +demonstrated through its zero-shot performance in real robot experiments across +various scenes, achieving an average matching accuracy and execution success +rate of 87% and 67%, respectively. + +
+
+ comment: Accepted by WACV2025 +
+
+
+
+
+ + ♻ ☆ SITransformer: Shared Information-Guided Transformer for Extreme + Multimodal Summarization + + +
+ Extreme Multimodal Summarization with Multimodal Output (XMSMO) becomes an +attractive summarization approach by integrating various types of information +to create extremely concise yet informative summaries for individual +modalities. Existing methods overlook the issue that multimodal data often +contains more topic irrelevant information, which can mislead the model into +producing inaccurate summaries especially for extremely short ones. In this +paper, we propose SITransformer, a Shared Information-guided Transformer for +extreme multimodal summarization. It has a shared information guided pipeline +which involves a cross-modal shared information extractor and a cross-modal +interaction module. The extractor formulates semantically shared salient +information from different modalities by devising a novel filtering process +consisting of a differentiable top-k selector and a shared-information guided +gating unit. As a result, the common, salient, and relevant contents across +modalities are identified. Next, a transformer with cross-modal attentions is +developed for intra- and inter-modality learning with the shared information +guidance to produce the extreme summary. Comprehensive experiments demonstrate +that SITransformer significantly enhances the summarization quality for both +video and text summaries for XMSMO. Our code will be publicly available at +https://github.com/SichengLeoLiu/MMAsia24-XMSMO. + +
+
+ comment: 8 pages, 5 figures, submitted to ACM Multimedia Asia 2024 +
+
+
+
+
+ + ♻ ☆ From Open Vocabulary to Open World: Teaching Vision Language Models to + Detect Novel Objects + + +
+ Traditional object detection methods operate under the closed-set assumption, +where models can only detect a fixed number of objects predefined in the +training set. Recent works on open vocabulary object detection (OVD) enable the +detection of objects defined by an unbounded vocabulary, which reduces the cost +of training models for specific tasks. However, OVD heavily relies on accurate +prompts provided by an ''oracle'', which limits their use in critical +applications such as driving scene perception. OVD models tend to misclassify +near-out-of-distribution (NOOD) objects that have similar semantics to known +classes, and ignore far-out-of-distribution (FOOD) objects. To address theses +limitations, we propose a framework that enables OVD models to operate in open +world settings, by identifying and incrementally learning novel objects. To +detect FOOD objects, we propose Open World Embedding Learning (OWEL) and +introduce the concept of Pseudo Unknown Embedding which infers the location of +unknown classes in a continuous semantic space based on the information of +known classes. We also propose Multi-Scale Contrastive Anchor Learning (MSCAL), +which enables the identification of misclassified unknown objects by promoting +the intra-class consistency of object embeddings at different scales. The +proposed method achieves state-of-the-art performance in common open world +object detection and autonomous driving benchmarks. + +
+
+
+
+
+ + ♻ ☆ SemiCD-VL: Visual-Language Model Guidance Makes Better Semi-supervised + Change Detector + + +
+ Change Detection (CD) aims to identify pixels with semantic changes between +images. However, annotating massive numbers of pixel-level images is +labor-intensive and costly, especially for multi-temporal images, which require +pixel-wise comparisons by human experts. Considering the excellent performance +of visual language models (VLMs) for zero-shot, open-vocabulary, etc. with +prompt-based reasoning, it is promising to utilize VLMs to make better CD under +limited labeled data. In this paper, we propose a VLM guidance-based +semi-supervised CD method, namely SemiCD-VL. The insight of SemiCD-VL is to +synthesize free change labels using VLMs to provide additional supervision +signals for unlabeled data. However, almost all current VLMs are designed for +single-temporal images and cannot be directly applied to bi- or multi-temporal +images. Motivated by this, we first propose a VLM-based mixed change event +generation (CEG) strategy to yield pseudo labels for unlabeled CD data. Since +the additional supervised signals provided by these VLM-driven pseudo labels +may conflict with the pseudo labels from the consistency regularization +paradigm (e.g. FixMatch), we propose the dual projection head for de-entangling +different signal sources. Further, we explicitly decouple the bi-temporal +images semantic representation through two auxiliary segmentation decoders, +which are also guided by VLM. Finally, to make the model more adequately +capture change representations, we introduce metric-aware supervision by +feature-level contrastive loss in auxiliary branches. Extensive experiments +show the advantage of SemiCD-VL. For instance, SemiCD-VL improves the FixMatch +baseline by +5.3 IoU on WHU-CD and by +2.4 IoU on LEVIR-CD with 5% labels. In +addition, our CEG strategy, in an un-supervised manner, can achieve performance +far superior to state-of-the-art un-supervised CD methods. + +
+
+ comment: 13 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Geometric Point Attention Transformer for 3D Shape Reassembly + + +
+ Shape assembly, which aims to reassemble separate parts into a complete +object, has gained significant interest in recent years. Existing methods +primarily rely on networks to predict the poses of individual parts, but often +fail to effectively capture the geometric interactions between the parts and +their poses. In this paper, we present the Geometric Point Attention +Transformer (GPAT), a network specifically designed to address the challenges +of reasoning about geometric relationships. In the geometric point attention +module, we integrate both global shape information and local pairwise geometric +features, along with poses represented as rotation and translation vectors for +each part. To enable iterative updates and dynamic reasoning, we introduce a +geometric recycling scheme, where each prediction is fed into the next +iteration for refinement. We evaluate our model on both the semantic and +geometric assembly tasks, showing that it outperforms previous methods in +absolute pose estimation, achieving accurate pose predictions and high +alignment accuracy. + +
+
+
+
+
+ + ♻ ☆ HAAT: Hybrid Attention Aggregation Transformer for Image + Super-Resolution + + +
+ In the research area of image super-resolution, Swin-transformer-based models +are favored for their global spatial modeling and shifting window attention +mechanism. However, existing methods often limit self-attention to non +overlapping windows to cut costs and ignore the useful information that exists +across channels. To address this issue, this paper introduces a novel model, +the Hybrid Attention Aggregation Transformer (HAAT), designed to better +leverage feature information. HAAT is constructed by integrating +Swin-Dense-Residual-Connected Blocks (SDRCB) with Hybrid Grid Attention Blocks +(HGAB). SDRCB expands the receptive field while maintaining a streamlined +architecture, resulting in enhanced performance. HGAB incorporates channel +attention, sparse attention, and window attention to improve nonlocal feature +fusion and achieve more visually compelling results. Experimental evaluations +demonstrate that HAAT surpasses state-of-the-art methods on benchmark datasets. + Keywords: Image super-resolution, Computer vision, Attention mechanism, +Transformer + +
+
+ comment: 6 pages, 2 figures, 1 table +
+
+
+
+
+ + ♻ ☆ BiCo-Fusion: Bidirectional Complementary LiDAR-Camera Fusion for + Semantic- and Spatial-Aware 3D Object Detection + + +
+ 3D object detection is an important task that has been widely applied in +autonomous driving. To perform this task, a new trend is to fuse multi-modal +inputs, i.e., LiDAR and camera. Under such a trend, recent methods fuse these +two modalities by unifying them in the same 3D space. However, during direct +fusion in a unified space, the drawbacks of both modalities (LiDAR features +struggle with detailed semantic information and the camera lacks accurate 3D +spatial information) are also preserved, diluting semantic and spatial +awareness of the final unified representation. To address the issue, this +letter proposes a novel bidirectional complementary LiDAR-camera fusion +framework, called BiCo-Fusion that can achieve robust semantic- and +spatial-aware 3D object detection. The key insight is to fuse LiDAR and camera +features in a bidirectional complementary way to enhance the semantic awareness +of the LiDAR and the 3D spatial awareness of the camera. The enhanced features +from both modalities are then adaptively fused to build a semantic- and +spatial-aware unified representation. Specifically, we introduce Pre-Fusion +consisting of a Voxel Enhancement Module (VEM) to enhance the semantic +awareness of voxel features from 2D camera features and Image Enhancement +Module (IEM) to enhance the 3D spatial awareness of camera features from 3D +voxel features. We then introduce Unified Fusion (U-Fusion) to adaptively fuse +the enhanced features from the last stage to build a unified representation. +Extensive experiments demonstrate the superiority of our BiCo-Fusion against +the prior arts. Project page: https://t-ys.github.io/BiCo-Fusion/. + +
+
+ comment: 8 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ MM-Vet v2: A Challenging Benchmark to Evaluate Large Multimodal Models + for Integrated Capabilities + + +
+ MM-Vet, with open-ended vision-language questions targeting at evaluating +integrated capabilities, has become one of the most popular benchmarks for +large multimodal model evaluation. MM-Vet assesses six core vision-language +(VL) capabilities: recognition, knowledge, spatial awareness, language +generation, OCR, and math. However, its question format is restricted to single +image-text pairs, lacking the interleaved image and text sequences prevalent in +real-world scenarios. To address this limitation, we introduce MM-Vet v2, which +includes a new VL capability called "image-text sequence understanding", +evaluating models' ability to process VL sequences. Furthermore, we maintain +the high quality of evaluation samples while further expanding the evaluation +set size. Using MM-Vet v2 to benchmark large multimodal models, we found that +Claude 3.5 Sonnet is the best model with a score of 71.8, slightly +outperforming GPT-4o which scored 71.0. Among open-weight models, +InternVL2-Llama3-76B leads with a score of 68.4. The code, data, and +leaderboard are accessible at https://github.com/yuweihao/MM-Vet. + +
+
+ comment: Code, data and leaderboard: https://github.com/yuweihao/MM-Vet +
+
+
+
+
+ + ♻ ☆ MM-Vet: Evaluating Large Multimodal Models for Integrated Capabilities ICML 2024 + + +
+ We propose MM-Vet, an evaluation benchmark that examines large multimodal +models (LMMs) on complicated multimodal tasks. Recent LMMs have shown various +intriguing abilities, such as solving math problems written on the blackboard, +reasoning about events and celebrities in news images, and explaining visual +jokes. Rapid model advancements pose challenges to evaluation benchmark +development. Problems include: (1) How to systematically structure and evaluate +the complicated multimodal tasks; (2) How to design evaluation metrics that +work well across question and answer types; and (3) How to give model insights +beyond a simple performance ranking. To this end, we present MM-Vet, designed +based on the insight that the intriguing ability to solve complicated tasks is +often achieved by a generalist model being able to integrate different core +vision-language (VL) capabilities. MM-Vet defines 6 core VL capabilities and +examines the 16 integrations of interest derived from the capability +combination. For evaluation metrics, we propose an LLM-based evaluator for +open-ended outputs. The evaluator enables the evaluation across different +question types and answer styles, resulting in a unified scoring metric. We +evaluate representative LMMs on MM-Vet, providing insights into the +capabilities of different LMM system paradigms and models. + +
+
+ comment: ICML 2024. Code, data and leaderboard: + https://github.com/yuweihao/MM-Vet +
+
+
+
+
+ + ♻ ☆ Potential Field Based Deep Metric Learning + + +
+ Deep metric learning (DML) involves training a network to learn a +semantically meaningful representation space. Many current approaches mine +n-tuples of examples and model interactions within each tuplets. We present a +novel, compositional DML model, inspired by electrostatic fields in physics +that, instead of in tuples, represents the influence of each example +(embedding) by a continuous potential field, and superposes the fields to +obtain their combined global potential field. We use attractive/repulsive +potential fields to represent interactions among embeddings from images of the +same/different classes. Contrary to typical learning methods, where mutual +influence of samples is proportional to their distance, we enforce reduction in +such influence with distance, leading to a decaying field. We show that such +decay helps improve performance on real world datasets with large intra-class +variations and label noise. Like other proxy-based methods, we also use proxies +to succinctly represent sub-populations of examples. We evaluate our method on +three standard DML benchmarks- Cars-196, CUB-200-2011, and SOP datasets where +it outperforms state-of-the-art baselines. + +
+
+
+
+
+ + ♻ ☆ An Inversion-based Measure of Memorization for Diffusion Models + + +
+ The past few years have witnessed substantial advances in image generation +powered by diffusion models. However, it was shown that diffusion models are +vulnerable to training data memorization, raising concerns regarding copyright +infringement and privacy invasion. This study delves into a rigorous analysis +of memorization in diffusion models. We introduce an inversion-based measure of +memorization, InvMM, which searches for a sensitive latent noise distribution +accounting for the replication of an image. For accurate estimation of the +memorization score, we propose an adaptive algorithm that balances the +normality and sensitivity of the inverted distribution. Comprehensive +experiments, conducted on both unconditional and text-guided diffusion models, +demonstrate that InvMM is capable of detecting heavily memorized images and +elucidating the effect of various factors on memorization. Additionally, we +discuss how memorization differs from membership. In practice, InvMM serves as +a useful tool for model developers to reliably assess the risk of memorization, +thereby contributing to the enhancement of trustworthiness and +privacy-preserving capabilities of diffusion models. + +
+
+
+
+
+ + ♻ ☆ Unsupervised Variational Translator for Bridging Image Restoration and + High-Level Vision Tasks + + +
+ Recent research tries to extend image restoration capabilities from human
+perception to machine perception, thereby enhancing the performance of
+high-level vision tasks in degraded environments. These methods, primarily
+based on supervised learning, typically involve the retraining of restoration
+networks or high-level vision networks. However, collecting paired data in
+real-world scenarios and retraining large-scale models are challenging. To this
+end, we propose an unsupervised learning method called \textbf{Va}riational
+\textbf{T}ranslator (VaT), which does not require retraining existing
+restoration and high-level vision networks. Instead, it establishes a
+lightweight network that serves as an intermediate bridge between them. By
+variational inference, VaT approximates the joint distribution of restoration
+output and high-level vision input, dividing the optimization objective into
+preserving content and maximizing marginal likelihood associated with
+high-level vision tasks. By cleverly leveraging self-training paradigms, VaT
+achieves the above optimization objective without requiring labels. As a
+result, the translated images maintain a close resemblance to their original
+content while also demonstrating exceptional performance on high-level vision
+tasks. Extensive experiments in dehazing and low-light enhancement for
+detection and classification show the superiority of our method over other
+state-of-the-art unsupervised counterparts, even significantly surpassing
+supervised methods in some complex real-world scenarios.
+
+
+
+
+
+
+ + ♻ ☆ HouseLLM: LLM-Assisted Two-Phase Text-to-Floorplan Generation + + +
+ This paper proposes a two-phase text-to-floorplan generation method, which +guides a Large Language Model (LLM) to generate an initial layout (Layout-LLM) +and refines them into the final floorplans through conditional diffusion model. +We incorporate a Chain-of-Thought approach to prompt the LLM based on user text +specifications, enabling a more user-friendly and intuitive house layout +design. This method allows users to describe their needs in natural language, +enhancing accessibility and providing clearer geometric constraints. The final +floorplans generated by Layout-LLM through conditional diffusion refinement are +more accurate and better meet user requirements. Experimental results +demonstrate that our approach achieves state-of-the-art performance across all +metrics, validating its effectiveness in practical home design applications. We +plan to release our code for public use. + +
+
+
+
+
+ + ♻ ☆ EZ-HOI: VLM Adaptation via Guided Prompt Learning for Zero-Shot HOI + Detection NeurIPS 2024 + + +
+ Detecting Human-Object Interactions (HOI) in zero-shot settings, where models +must handle unseen classes, poses significant challenges. Existing methods that +rely on aligning visual encoders with large Vision-Language Models (VLMs) to +tap into the extensive knowledge of VLMs, require large, computationally +expensive models and encounter training difficulties. Adapting VLMs with prompt +learning offers an alternative to direct alignment. However, fine-tuning on +task-specific datasets often leads to overfitting to seen classes and +suboptimal performance on unseen classes, due to the absence of unseen class +labels. To address these challenges, we introduce a novel prompt learning-based +framework for Efficient Zero-Shot HOI detection (EZ-HOI). First, we introduce +Large Language Model (LLM) and VLM guidance for learnable prompts, integrating +detailed HOI descriptions and visual semantics to adapt VLMs to HOI tasks. +However, because training datasets contain seen-class labels alone, fine-tuning +VLMs on such datasets tends to optimize learnable prompts for seen classes +instead of unseen ones. Therefore, we design prompt learning for unseen classes +using information from related seen classes, with LLMs utilized to highlight +the differences between unseen and related seen classes. Quantitative +evaluations on benchmark datasets demonstrate that our EZ-HOI achieves +state-of-the-art performance across various zero-shot settings with only 10.35% +to 33.95% of the trainable parameters compared to existing methods. Code is +available at https://github.com/ChelsieLei/EZ-HOI. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Vastextures: Vast repository of textures and PBR materials extracted + from real-world images using unsupervised methods + + +
+ Vastextures is a vast repository of 500,000 textures and PBR materials
+extracted from real-world images using an unsupervised process. The extracted
+materials and textures are extremely diverse and cover a vast range of
+real-world patterns, but at the same time less refined compared to existing
+repositories. The repository is composed of 2D textures cropped from natural
+images and SVBRDF/PBR materials generated from these textures. Textures and PBR
+materials are essential for CGI. Existing materials repositories focus on
+games, animation, and arts, that demand a limited amount of high-quality
+assets. However, virtual worlds and synthetic data are becoming increasingly
+important for training A.I systems for computer vision. This application
+demands a huge amount of diverse assets but at the same time less affected by
+noisy and unrefined assets. Vastexture aims to address this need by creating a
+free, huge, and diverse assets repository that covers as many real-world
+materials as possible. The materials are automatically extracted from natural
+images in two steps: 1) Automatically scanning a giant amount of images to
+identify and crop regions with uniform textures. This is done by splitting the
+image into a grid of cells and identifying regions in which all of the cells
+share a similar statistical distribution. 2) Extracting the properties of the
+PBR material from the cropped texture. This is done by randomly guessing every
+correlation between the properties of the texture image and the properties of
+the PBR material. The resulting PBR materials exhibit a vast amount of
+real-world patterns as well as unexpected emergent properties. Neural nets
+trained on this repository outperformed nets trained using handcrafted assets.
+
+
+
+ comment: Vastexture was published as part of Learning Zero-Shot Material + States Segmentation, by Implanting Natural Image Patterns in Synthetic Data, + refer to this work in citations. This document gives a more detailed and + technical discussion of this repository +
+
+
+
+
+ + ♻ ☆ Support-Set Context Matters for Bongard Problems + + +
+ Current machine learning methods struggle to solve Bongard problems, which +are a type of IQ test that requires deriving an abstract "concept" from a set +of positive and negative "support" images, and then classifying whether or not +a new query image depicts the key concept. On Bongard-HOI, a benchmark for +natural-image Bongard problems, most existing methods have reached at best 69% +accuracy (where chance is 50%). Low accuracy is often attributed to neural +nets' lack of ability to find human-like symbolic rules. In this work, we point +out that many existing methods are forfeiting accuracy due to a much simpler +problem: they do not adapt image features given information contained in the +support set as a whole, and rely instead on information extracted from +individual supports. This is a critical issue, because the "key concept" in a +typical Bongard problem can often only be distinguished using multiple +positives and multiple negatives. We explore simple methods to incorporate this +context and show substantial gains over prior works, leading to new +state-of-the-art accuracy on Bongard-LOGO (75.3%) and Bongard-HOI (76.4%) +compared to methods with equivalent vision backbone architectures and strong +performance on the original Bongard problem set (60.8%). + +
+
+ comment: TMLR October 2024. Code: + https://github.com/nraghuraman/bongard-context +
+
+
+
+
+
+
+
+ + Artificial Intelligence 40 + +
+
+
+ + ♻ ☆ Comprehensive framework for evaluation of deep neural networks in + detection and quantification of lymphoma from PET/CT images: clinical + insights, pitfalls, and observer agreement analyses + + +
+ This study addresses critical gaps in automated lymphoma segmentation from
+PET/CT images, focusing on issues often overlooked in existing literature.
+While deep learning has been applied for lymphoma lesion segmentation, few
+studies incorporate out-of-distribution testing, raising concerns about model
+generalizability across diverse imaging conditions and patient populations. We
+highlight the need to compare model performance with expert human annotators,
+including intra- and inter-observer variability, to understand task difficulty
+better. Most approaches focus on overall segmentation accuracy but overlook
+lesion-specific metrics important for precise lesion detection and disease
+quantification. To address these gaps, we propose a clinically-relevant
+framework for evaluating deep neural networks. Using this lesion-specific
+evaluation, we assess the performance of four deep segmentation networks
+(ResUNet, SegResNet, DynUNet, and SwinUNETR) across 611 cases from
+multi-institutional datasets, covering various lymphoma subtypes and lesion
+characteristics. Beyond standard metrics like the Dice similarity coefficient
+(DSC), we evaluate clinical lesion measures and their prediction errors. We
+also introduce detection criteria for lesion localization and propose a new
+detection Criterion 3 based on metabolic characteristics. We show that networks
+perform better on large, intense lesions with higher metabolic
+activity. Finally, we compare network performance to expert human observers via
+intra- and inter-observer variability analyses, demonstrating that network
+errors closely resemble those made by experts. Some small, faint lesions remain
+challenging for both humans and networks. This study aims to improve automated
+lesion segmentation's clinical relevance, supporting better treatment decisions
+for lymphoma patients. The code is available at:
+https://github.com/microsoft/lymphoma-segmentation-dnn
+
+
+
+ comment: 32 pages, 15 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Instruction Tuning for Large Language Models: A Survey + + +
+ This paper surveys research works in the quickly advancing field of
+instruction tuning (IT), which can also be referred to as supervised
+fine-tuning (SFT)\footnote{In this paper, unless specified otherwise,
+supervised fine-tuning (SFT) and instruction tuning (IT) are used
+interchangeably.}, a crucial technique to enhance the capabilities and
+controllability of large language models (LLMs). Instruction tuning refers to
+the process of further training LLMs on a dataset consisting of
+\textsc{(instruction, output)} pairs in a supervised fashion, which bridges the
+gap between the next-word prediction objective of LLMs and the users' objective
+of having LLMs adhere to human instructions. In this work, we make a systematic
+review of the literature, including the general methodology of SFT, the
+construction of SFT datasets, the training of SFT models, and applications to
+different modalities, domains and applications, along with analysis on aspects
+that influence the outcome of SFT (e.g., generation of instruction outputs,
+size of the instruction dataset, etc). We also review the potential pitfalls of
+SFT along with criticism against it, along with efforts pointing out current
+deficiencies of existing strategies and suggest some avenues for fruitful
+research. Project Page: github.com/xiaoya-li/Instruction-Tuning-Survey
+
+
+
+ comment: V5; Last update: Dec. 1, 2024 +
+
+
+
+
+ + ♻ ☆ Rotation Invariant Quantization for Model Compression + + +
+ Post-training Neural Network (NN) model compression is an attractive approach +for deploying large, memory-consuming models on devices with limited memory +resources. In this study, we investigate the rate-distortion tradeoff for NN +model compression. First, we suggest a Rotation-Invariant Quantization (RIQ) +technique that utilizes a single parameter to quantize the entire NN model, +yielding a different rate at each layer, i.e., mixed-precision quantization. +Then, we prove that our rotation-invariant approach is optimal in terms of +compression. We rigorously evaluate RIQ and demonstrate its capabilities on +various models and tasks. For example, RIQ facilitates $\times 19.4$ and +$\times 52.9$ compression ratios on pre-trained VGG dense and pruned models, +respectively, with $<0.4\%$ accuracy degradation. Code is available in +\href{https://github.com/ehaleva/RIQ}{github.com/ehaleva/RIQ}. + +
+
+ comment: 20 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Document Haystacks: Vision-Language Reasoning Over Piles of 1000+ + Documents + + +
+ Large multimodal models (LMMs) have achieved impressive progress in +vision-language understanding, yet they face limitations in real-world +applications requiring complex reasoning over a large number of images. +Existing benchmarks for multi-image question-answering are limited in scope, +each question is paired with only up to 30 images, which does not fully capture +the demands of large-scale retrieval tasks encountered in the real-world +usages. To reduce these gaps, we introduce two document haystack benchmarks, +dubbed DocHaystack and InfoHaystack, designed to evaluate LMM performance on +large-scale visual document retrieval and understanding. Additionally, we +propose V-RAG, a novel, vision-centric retrieval-augmented generation (RAG) +framework that leverages a suite of multimodal vision encoders, each optimized +for specific strengths, and a dedicated question-document relevance module. +V-RAG sets a new standard, with a 9% and 11% improvement in Recall@1 on the +challenging DocHaystack-1000 and InfoHaystack-1000 benchmarks, respectively, +compared to the previous best baseline models. Additionally, integrating V-RAG +with LMMs enables them to efficiently operate across thousands of images, +yielding significant improvements on our DocHaystack and InfoHaystack +benchmarks. Our code and datasets are available at +https://github.com/Vision-CAIR/dochaystacks + +
+
+ comment: the correct arxiv version +
+
+
+
+
+ + ♻ ☆ Estimating Continuous Muscle Fatigue For Multi-Muscle Coordinated + Exercise: A Pilot Study on Walking + + +
+ Assessing the progression of muscle fatigue for daily exercises provides
+vital indicators for precise rehabilitation, personalized training dose,
+especially under the context of Metaverse. Assessing fatigue of multi-muscle
+coordination-involved daily exercises requires the neuromuscular features that
+represent the fatigue-induced characteristics of spatiotemporal adaptions of
+multiple muscles and the estimator that captures the time-evolving progression
+of fatigue. In this paper, we propose to depict fatigue by the features of
+muscle compensation and spinal module activation changes and estimate
+continuous fatigue by a physiological rationale model. First, we extract muscle
+synergy fractionation and the variance of spinal module spikings as features
+inspired by the prior of fatigue-induced neuromuscular adaptations. Second, we
+treat the features as observations and develop a Bayesian Gaussian process to
+capture the time-evolving progression. Third, we solve the issue of lacking
+supervision information by mathematically formulating the time-evolving
+characteristics of fatigue as the loss function. Finally, we adapt the metrics
+that follow the physiological principles of fatigue to quantitatively evaluate
+the performance. Our extensive experiments present a 0.99 similarity between
+days, an over 0.7 similarity with other views of fatigue and a nearly 1 weak
+monotonicity, which outperform other methods. This study would aid the
+objective assessment of muscle fatigue.
+
+
+
+
+
+
+ + ♻ ☆ "Give me the code" -- Log Analysis of First-Year CS Students' + Interactions With GPT + + +
+ The impact of Large Language Models (LLMs) like GPT-3, GPT-4, and Bard in +computer science (CS) education is expected to be profound. Students now have +the power to generate code solutions for a wide array of programming +assignments. For first-year students, this may be particularly problematic +since the foundational skills are still in development and an over-reliance on +generative AI tools can hinder their ability to grasp essential programming +concepts. This paper analyzes the prompts used by 69 freshmen undergraduate +students to solve a certain programming problem within a project assignment, +without giving them prior prompt training. We also present the rules of the +exercise that motivated the prompts, designed to foster critical thinking +skills during the interaction. Despite using unsophisticated prompting +techniques, our findings suggest that the majority of students successfully +leveraged GPT, incorporating the suggested solutions into their projects. +Additionally, half of the students demonstrated the ability to exercise +judgment in selecting from multiple GPT-generated solutions, showcasing the +development of their critical thinking skills in evaluating AI-generated code. + +
+
+ comment: This is the author's version of the work. It is posted here for your + personal use. Not for redistribution +
+
+
+
+
+ + ♻ ☆ The Malicious Use of Artificial Intelligence: Forecasting, Prevention, + and Mitigation + + +
+ This report surveys the landscape of potential security threats from +malicious uses of AI, and proposes ways to better forecast, prevent, and +mitigate these threats. After analyzing the ways in which AI may influence the +threat landscape in the digital, physical, and political domains, we make four +high-level recommendations for AI researchers and other stakeholders. We also +suggest several promising areas for further research that could expand the +portfolio of defenses, or make attacks less effective or harder to execute. +Finally, we discuss, but do not conclusively resolve, the long-term equilibrium +of attackers and defenders. + +
+
+
+
+
+ + ♻ ☆ Learning Transferable Features for Implicit Neural Representations + + +
+ Implicit neural representations (INRs) have demonstrated success in a variety +of applications, including inverse problems and neural rendering. An INR is +typically trained to capture one signal of interest, resulting in learned +neural features that are highly attuned to that signal. Assumed to be less +generalizable, we explore the aspect of transferability of such learned neural +features for fitting similar signals. We introduce a new INR training +framework, STRAINER that learns transferrable features for fitting INRs to new +signals from a given distribution, faster and with better reconstruction +quality. Owing to the sequential layer-wise affine operations in an INR, we +propose to learn transferable representations by sharing initial encoder layers +across multiple INRs with independent decoder layers. At test time, the learned +encoder representations are transferred as initialization for an otherwise +randomly initialized INR. We find STRAINER to yield extremely powerful +initialization for fitting images from the same domain and allow for $\approx ++10dB$ gain in signal quality early on compared to an untrained INR itself. +STRAINER also provides a simple way to encode data-driven priors in INRs. We +evaluate STRAINER on multiple in-domain and out-of-domain signal fitting tasks +and inverse problems and further provide detailed analysis and discussion on +the transferability of STRAINER's features. Our demo can be accessed at +https://colab.research.google.com/drive/1fBZAwqE8C_lrRPAe-hQZJTWrMJuAKtG2?usp=sharing . + +
+
+ comment: Project Website: https://kushalvyas.github.io/strainer.html +
+
+
+
+
+ + ♻ ☆ Hierarchical Prompting Taxonomy: A Universal Evaluation Framework for + Large Language Models + + +
+ Assessing the effectiveness of large language models (LLMs) in performing +different tasks is crucial for understanding their strengths and weaknesses. +This paper presents the Hierarchical Prompting Taxonomy (HPT), grounded on +human cognitive principles and designed to assess LLMs by examining the +cognitive demands of various tasks. The HPT uses the Hierarchical Prompting +Framework (HPF), a prompt selection framework that organizes five distinct +prompting strategies by their cognitive load on LLMs. This study introduces the +Hierarchical Prompting Index (HPI) to measure task complexity, which +demonstrates LLMs' abilities across different datasets and serves as a +universal metric for task complexity. The HPT offers a reliable method for +evaluating LLMs' problem-solving skills in diverse scenarios, leading to +clearer conclusions. Extensive experiments with multiple datasets and LLMs show +that the HPF enhances LLM performance by 2\% to 63\% compared to standard +benchmark datasets, confirming the effectiveness of the HPT. To support future +research in this domain, the implementations of HPT and HPF are publicly +available + +
+
+
+
+
+ + ♻ ☆ DUET: A Tuning-Free Device-Cloud Collaborative Parameters Generation + Framework for Efficient Device Model Generalization WWW'23 + + +
+ Device Model Generalization (DMG) is a practical yet under-investigated
+research topic for on-device machine learning applications. It aims to improve
+the generalization ability of pre-trained models when deployed on
+resource-constrained devices, such as improving the performance of pre-trained
+cloud models on smart mobiles. While quite a lot of works have investigated the
+data distribution shift across clouds and devices, most of them focus on model
+fine-tuning on personalized data for individual devices to facilitate DMG.
+Despite their promise, these approaches require on-device re-training, which
+is practically infeasible due to the overfitting problem and high time delay
+when performing gradient calculation on real-time data. In this paper, we argue
+that the computational cost brought by fine-tuning can be rather unnecessary.
+We consequently present a novel perspective to improving DMG without increasing
+computational cost, i.e., device-specific parameter generation which directly
+maps data distribution to parameters. Specifically, we propose an efficient
+Device-cloUd collaborative parametErs generaTion framework DUET. DUET is
+deployed on a powerful cloud server that only requires the low cost of
+forwarding propagation and low time delay of data transmission between the
+device and the cloud. By doing so, DUET can rehearse the device-specific model
+weight realizations conditioned on the personalized real-time data for an
+individual device. Importantly, our DUET elegantly connects the cloud and
+device as a 'duet' collaboration, frees the DMG from fine-tuning, and enables a
+faster and more accurate DMG paradigm. We conduct an extensive experimental
+study of DUET on three public datasets, and the experimental results confirm
+our framework's effectiveness and generalisability for different DMG tasks.
+
+
+
+ comment: Published on WWW'23: Proceedings of the ACM on Web Conference 2023 + (pp. 3077 - 3085) +
+
+
+
+
+ + ♻ ☆ Intelligent Model Update Strategy for Sequential Recommendation WWW'24 + + +
+ Modern online platforms are increasingly employing recommendation systems to +address information overload and improve user engagement. There is an evolving +paradigm in this research field that recommendation network learning occurs +both on the cloud and on edges with knowledge transfer in between (i.e., +edge-cloud collaboration). Recent works push this field further by enabling +edge-specific context-aware adaptivity, where model parameters are updated in +real-time based on incoming on-edge data. However, we argue that frequent data +exchanges between the cloud and edges often lead to inefficiency and waste of +communication/computation resources, as considerable parameter updates might be +redundant. To investigate this problem, we introduce Intelligent Edge-Cloud +Parameter Request Model, abbreviated as IntellectReq. + IntellectReq is designed to operate on edge, evaluating the cost-benefit +landscape of parameter requests with minimal computation and communication +overhead. We formulate this as a novel learning task, aimed at the detection of +out-of-distribution data, thereby fine-tuning adaptive communication +strategies. Further, we employ statistical mapping techniques to convert +real-time user behavior into a normal distribution, thereby employing +multi-sample outputs to quantify the model's uncertainty and thus its +generalization capabilities. Rigorous empirical validation on four +widely-adopted benchmarks evaluates our approach, evidencing a marked +improvement in the efficiency and generalizability of edge-cloud collaborative +and dynamic recommendation systems. + +
+
+ comment: Published on WWW'24(Oral): Proceedings of the ACM on Web Conference + 2024 (pp. 3117-3128) +
+
+
+
+
+ + ♻ ☆ No Size Fits All: The Perils and Pitfalls of Leveraging LLMs Vary with + Company Size COLING2025 + + +
+ Large language models (LLMs) are playing a pivotal role in deploying +strategic use cases across a range of organizations, from large pan-continental +companies to emerging startups. The issues and challenges involved in the +successful utilization of LLMs can vary significantly depending on the size of +the organization. It is important to study and discuss these pertinent issues +of LLM adaptation with a focus on the scale of the industrial concerns and +brainstorm possible solutions and prospective directions. Such a study has not +been prominently featured in the current research literature. In this study, we +adopt a threefold strategy: first, we conduct a case study with industry +practitioners to formulate the key research questions; second, we examine +existing industrial publications to address these questions; and finally, we +provide a practical guide for industries to utilize LLMs more efficiently. We +release the +GitHub\footnote{\url{https://github.com/vinayakcse/IndustrialLLMsPapers}} +repository with the most recent papers in the field. + +
+
+ comment: COLING2025 Industry track +
+
+
+
+
+ + ♻ ☆ Burning RED: Unlocking Subtask-Driven Reinforcement Learning and + Risk-Awareness in Average-Reward Markov Decision Processes + + +
+ Average-reward Markov decision processes (MDPs) provide a foundational +framework for sequential decision-making under uncertainty. However, +average-reward MDPs have remained largely unexplored in reinforcement learning +(RL) settings, with the majority of RL-based efforts having been allocated to +episodic and discounted MDPs. In this work, we study a unique structural +property of average-reward MDPs and utilize it to introduce Reward-Extended +Differential (or RED) reinforcement learning: a novel RL framework that can be +used to effectively and efficiently solve various subtasks simultaneously in +the average-reward setting. We introduce a family of RED learning algorithms +for prediction and control, including proven-convergent algorithms for the +tabular case. We then showcase the power of these algorithms by demonstrating +how they can be used to learn a policy that optimizes, for the first time, the +well-known conditional value-at-risk (CVaR) risk measure in a fully-online +manner, without the use of an explicit bi-level optimization scheme or an +augmented state-space. + +
+
+
+
+
+ + ♻ ☆ Separate Anything You Describe + + +
+ Language-queried audio source separation (LASS) is a new paradigm for +computational auditory scene analysis (CASA). LASS aims to separate a target +sound from an audio mixture given a natural language query, which provides a +natural and scalable interface for digital audio applications. Recent works on +LASS, despite attaining promising separation performance on specific sources +(e.g., musical instruments, limited classes of audio events), are unable to +separate audio concepts in the open domain. In this work, we introduce +AudioSep, a foundation model for open-domain audio source separation with +natural language queries. We train AudioSep on large-scale multimodal datasets +and extensively evaluate its capabilities on numerous tasks including audio +event separation, musical instrument separation, and speech enhancement. +AudioSep demonstrates strong separation performance and impressive zero-shot +generalization ability using audio captions or text labels as queries, +substantially outperforming previous audio-queried and language-queried sound +separation models. For reproducibility of this work, we will release the source +code, evaluation benchmark and pre-trained model at: +https://github.com/Audio-AGI/AudioSep. + +
+
+ comment: Code, benchmark and pre-trained models: + https://github.com/Audio-AGI/AudioSep +
+
+
+
+
+ + ♻ ☆ Recurrent Aggregators in Neural Algorithmic Reasoning + + +
+ Neural algorithmic reasoning (NAR) is an emerging field that seeks to design +neural networks that mimic classical algorithmic computations. Today, graph +neural networks (GNNs) are widely used in neural algorithmic reasoners due to +their message passing framework and permutation equivariance. In this extended +abstract, we challenge this design choice, and replace the equivariant +aggregation function with a recurrent neural network. While seemingly +counter-intuitive, this approach has appropriate grounding when nodes have a +natural ordering -- and this is the case frequently in established reasoning +benchmarks like CLRS-30. Indeed, our recurrent NAR (RNAR) model performs very +strongly on such tasks, while handling many others gracefully. A notable +achievement of RNAR is its decisive state-of-the-art result on the Heapsort and +Quickselect tasks, both deemed as a significant challenge for contemporary +neural algorithmic reasoners -- especially the latter, where RNAR achieves a +mean micro-F1 score of 87%. + +
+
+ comment: Presented at the Third Learning on Graphs Conference (LoG 2024). 10 + pages, 1 figure +
+
+
+
+
+ + ♻ ☆ MeshAnything V2: Artist-Created Mesh Generation With Adjacent Mesh + Tokenization + + +
+ Meshes are the de facto 3D representation in the industry but are +labor-intensive to produce. Recently, a line of research has focused on +autoregressively generating meshes. This approach processes meshes into a +sequence composed of vertices and then generates them vertex by vertex, similar +to how a language model generates text. These methods have achieved some +success but still struggle to generate complex meshes. One primary reason for +this limitation is their inefficient tokenization methods. To address this +issue, we introduce MeshAnything V2, an advanced mesh generation model designed +to create Artist-Created Meshes that align precisely with specified shapes. A +key innovation behind MeshAnything V2 is our novel Adjacent Mesh Tokenization +(AMT) method. Unlike traditional approaches that represent each face using +three vertices, AMT optimizes this by employing a single vertex wherever +feasible, effectively reducing the token sequence length by about half on +average. This not only streamlines the tokenization process but also results in +more compact and well-structured sequences, enhancing the efficiency of mesh +generation. With these improvements, MeshAnything V2 effectively doubles the +face limit compared to previous models, delivering superior performance without +increasing computational costs. We will make our code and models publicly +available. Project Page: https://buaacyw.github.io/meshanything-v2/ + +
+
+ comment: Project Page: https://buaacyw.github.io/meshanything-v2/ Github: + https://github.com/buaacyw/MeshAnythingV2 +
+
+
+
+
+ + ♻ ☆ Robust Federated Learning Over the Air: Combating Heavy-Tailed Noise + with Median Anchored Clipping + + +
+ Leveraging over-the-air computations for model aggregation is an effective +approach to cope with the communication bottleneck in federated edge learning. +By exploiting the superposition properties of multi-access channels, this +approach facilitates an integrated design of communication and computation, +thereby enhancing system privacy while reducing implementation costs. However, +the inherent electromagnetic interference in radio channels often exhibits +heavy-tailed distributions, giving rise to exceptionally strong noise in +globally aggregated gradients that can significantly deteriorate the training +performance. To address this issue, we propose a novel gradient clipping +method, termed Median Anchored Clipping (MAC), to combat the detrimental +effects of heavy-tailed noise. We also derive analytical expressions for the +convergence rate of model training with analog over-the-air federated learning +under MAC, which quantitatively demonstrates the effect of MAC on training +performance. Extensive experimental results show that the proposed MAC +algorithm effectively mitigates the impact of heavy-tailed noise, hence +substantially enhancing system robustness. + +
+
+ comment: This is the full version of the paper, and the appendix contains a + complete convergence analysis under non-convex conditions +
+
+
+
+
+ + ♻ ☆ Strategic Demand-Planning in Wireless Networks: Can Generative-AI Save + Spectrum and Energy? + + +
+ Generative-AI (GenAI), a novel technology capable of producing various types +of outputs, including text, images, and videos, offers significant potential +for wireless communications. This article introduces the concept of strategic +demand-planning through demand-labeling, demand-shaping, and +demand-rescheduling. Accordingly, GenAI is proposed as a powerful tool to +facilitate demand-shaping in wireless networks. More specifically, GenAI is +used to compress and convert the content of various types (e.g., from a higher +bandwidth mode to a lower one, such as from a video to text), which +subsequently enhances performance of wireless networks in various usage +scenarios, such as cell-switching, user association and load balancing, +interference management, as well as disasters and unusual gatherings. +Therefore, GenAI can serve a function in saving energy and spectrum in wireless +networks. With recent advancements in AI, including sophisticated algorithms +like large language models and the development of more powerful hardware built +exclusively for AI tasks, such as AI accelerators, the concept of +demand-planning, particularly demand-shaping through GenAI, becomes +increasingly relevant. Furthermore, recent efforts to make GenAI accessible on +devices, such as user terminals, make the implementation of this concept even +more straightforward and feasible. + +
+
+
+
+
+ + ♻ ☆ From Open Vocabulary to Open World: Teaching Vision Language Models to + Detect Novel Objects + + +
+ Traditional object detection methods operate under the closed-set assumption, +where models can only detect a fixed number of objects predefined in the +training set. Recent works on open vocabulary object detection (OVD) enable the +detection of objects defined by an unbounded vocabulary, which reduces the cost +of training models for specific tasks. However, OVD heavily relies on accurate +prompts provided by an ''oracle'', which limits their use in critical +applications such as driving scene perception. OVD models tend to misclassify +near-out-of-distribution (NOOD) objects that have similar semantics to known +classes, and ignore far-out-of-distribution (FOOD) objects. To address these +limitations, we propose a framework that enables OVD models to operate in open +world settings, by identifying and incrementally learning novel objects. To +detect FOOD objects, we propose Open World Embedding Learning (OWEL) and +introduce the concept of Pseudo Unknown Embedding which infers the location of +unknown classes in a continuous semantic space based on the information of +known classes. We also propose Multi-Scale Contrastive Anchor Learning (MSCAL), +which enables the identification of misclassified unknown objects by promoting +the intra-class consistency of object embeddings at different scales. The +proposed method achieves state-of-the-art performance in common open world +object detection and autonomous driving benchmarks. + +
+
+
+
+
+ + ♻ ☆ Corn Yield Prediction Model with Deep Neural Networks for Smallholder + Farmer Decision Support System + + +
+ Crop yield prediction has been modeled on the assumption that there is no +interaction between weather and soil variables. However, this paper argues that +an interaction exists, and it can be finely modelled using the Kendall +Correlation coefficient. Given the nonlinearity of the interaction between +weather and soil variables, a deep neural network regressor (DNNR) is carefully +designed with consideration to the depth, number of neurons of the hidden +layers, and the hyperparameters with their optimizations. Additionally, a new +metric, the average of absolute root squared error (ARSE) is proposed to +combine the strengths of root mean square error (RMSE) and mean absolute error +(MAE). With the ARSE metric, the proposed DNNR(s), optimised random forest +regressor (RFR) and the extreme gradient boosting regressor (XGBR) achieved +impressively small yield errors, 0.0172 t/ha, and 0.0243 t/ha, 0.0001 t/ha, and +0.001 t/ha, respectively. However, the DNNR(s), with changes to the explanatory +variables to ensure generalizability to unforeseen data, performed +best. Further analysis reveals that a strong interaction does exist between +weather and soil variables. Precisely, yield is observed to increase when +precipitation is reduced and silt increased, and vice-versa. However, the +degree of decrease or increase is not quantified in this paper. Contrary to +existing yield models targeted towards agricultural policies and global food +security, the goal of the proposed corn yield model is to empower the +smallholder farmer to farm smartly and intelligently, thus the prediction model +is integrated into a mobile application that includes education, and a +farmer-to-market access module. + +
+
+ comment: 30 Pages, 11 Figures, 3 Tables +
+
+
+
+
+ + ♻ ☆ A Survey on Human-Centric LLMs + + +
+ The rapid evolution of large language models (LLMs) and their capacity to +simulate human cognition and behavior has given rise to LLM-based frameworks +and tools that are evaluated and applied based on their ability to perform +tasks traditionally performed by humans, namely those involving cognition, +decision-making, and social interaction. This survey provides a comprehensive +examination of such human-centric LLM capabilities, focusing on their +performance in both individual tasks (where an LLM acts as a stand-in for a +single human) and collective tasks (where multiple LLMs coordinate to mimic +group dynamics). We first evaluate LLM competencies across key areas including +reasoning, perception, and social cognition, comparing their abilities to +human-like skills. Then, we explore real-world applications of LLMs in +human-centric domains such as behavioral science, political science, and +sociology, assessing their effectiveness in replicating human behaviors and +interactions. Finally, we identify challenges and future research directions, +such as improving LLM adaptability, emotional intelligence, and cultural +sensitivity, while addressing inherent biases and enhancing frameworks for +human-AI collaboration. This survey aims to provide a foundational +understanding of LLMs from a human-centric perspective, offering insights into +their current capabilities and potential for future development. + +
+
+
+
+
+ + ♻ ☆ DoorINet: Door Heading Prediction through Inertial Deep Learning + + +
+ Inertial sensors are widely used in a variety of applications. A common task +is orientation estimation. To tackle such a task, attitude and heading +reference system algorithms are applied. Relying on the gyroscope readings, the +accelerometer measurements are used to update the attitude angles, and +magnetometer measurements are utilized to update the heading angle. In indoor +environments, magnetometers suffer from interference that degrades their +performance resulting in poor heading angle estimation. Therefore, applications +that estimate the heading angle of moving objects, such as walking pedestrians, +closets, and refrigerators, are prone to error. To circumvent such situations, +we propose DoorINet, an end-to-end deep-learning framework to calculate the +heading angle from door-mounted, low-cost inertial sensors without using +magnetometers. To evaluate our approach, we record a unique dataset containing +391 minutes of accelerometer and gyroscope measurements and corresponding +ground-truth heading angle. We show that our proposed approach outperforms +commonly used, model based approaches and data-driven methods. + +
+
+ comment: 10 pages, 14 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ Geometric Point Attention Transformer for 3D Shape Reassembly + + +
+ Shape assembly, which aims to reassemble separate parts into a complete +object, has gained significant interest in recent years. Existing methods +primarily rely on networks to predict the poses of individual parts, but often +fail to effectively capture the geometric interactions between the parts and +their poses. In this paper, we present the Geometric Point Attention +Transformer (GPAT), a network specifically designed to address the challenges +of reasoning about geometric relationships. In the geometric point attention +module, we integrate both global shape information and local pairwise geometric +features, along with poses represented as rotation and translation vectors for +each part. To enable iterative updates and dynamic reasoning, we introduce a +geometric recycling scheme, where each prediction is fed into the next +iteration for refinement. We evaluate our model on both the semantic and +geometric assembly tasks, showing that it outperforms previous methods in +absolute pose estimation, achieving accurate pose predictions and high +alignment accuracy. + +
+
+
+
+
+ + ♻ ☆ HAAT: Hybrid Attention Aggregation Transformer for Image + Super-Resolution + + +
+ In the research area of image super-resolution, Swin-transformer-based models +are favored for their global spatial modeling and shifting window attention +mechanism. However, existing methods often limit self-attention to non +overlapping windows to cut costs and ignore the useful information that exists +across channels. To address this issue, this paper introduces a novel model, +the Hybrid Attention Aggregation Transformer (HAAT), designed to better +leverage feature information. HAAT is constructed by integrating +Swin-Dense-Residual-Connected Blocks (SDRCB) with Hybrid Grid Attention Blocks +(HGAB). SDRCB expands the receptive field while maintaining a streamlined +architecture, resulting in enhanced performance. HGAB incorporates channel +attention, sparse attention, and window attention to improve nonlocal feature +fusion and achieve more visually compelling results. Experimental evaluations +demonstrate that HAAT surpasses state-of-the-art methods on benchmark datasets. + Keywords: Image super-resolution, Computer vision, Attention mechanism, +Transformer + +
+
+ comment: 6 pages, 2 figures, 1 table +
+
+
+
+
+ + ♻ ☆ BiCo-Fusion: Bidirectional Complementary LiDAR-Camera Fusion for + Semantic- and Spatial-Aware 3D Object Detection + + +
+ 3D object detection is an important task that has been widely applied in +autonomous driving. To perform this task, a new trend is to fuse multi-modal +inputs, i.e., LiDAR and camera. Under such a trend, recent methods fuse these +two modalities by unifying them in the same 3D space. However, during direct +fusion in a unified space, the drawbacks of both modalities (LiDAR features +struggle with detailed semantic information and the camera lacks accurate 3D +spatial information) are also preserved, diluting semantic and spatial +awareness of the final unified representation. To address the issue, this +letter proposes a novel bidirectional complementary LiDAR-camera fusion +framework, called BiCo-Fusion that can achieve robust semantic- and +spatial-aware 3D object detection. The key insight is to fuse LiDAR and camera +features in a bidirectional complementary way to enhance the semantic awareness +of the LiDAR and the 3D spatial awareness of the camera. The enhanced features +from both modalities are then adaptively fused to build a semantic- and +spatial-aware unified representation. Specifically, we introduce Pre-Fusion +consisting of a Voxel Enhancement Module (VEM) to enhance the semantic +awareness of voxel features from 2D camera features and Image Enhancement +Module (IEM) to enhance the 3D spatial awareness of camera features from 3D +voxel features. We then introduce Unified Fusion (U-Fusion) to adaptively fuse +the enhanced features from the last stage to build a unified representation. +Extensive experiments demonstrate the superiority of our BiCo-Fusion against +the prior arts. Project page: https://t-ys.github.io/BiCo-Fusion/. + +
+
+ comment: 8 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ PINNfluence: Influence Functions for Physics-Informed Neural Networks + + +
+ Recently, physics-informed neural networks (PINNs) have emerged as a flexible +and promising application of deep learning to partial differential equations in +the physical sciences. While offering strong performance and competitive +inference speeds on forward and inverse problems, their black-box nature limits +interpretability, particularly regarding alignment with expected physical +behavior. In the present work, we explore the application of influence +functions (IFs) to validate and debug PINNs post-hoc. Specifically, we apply +variations of IF-based indicators to gauge the influence of different types of +collocation points on the prediction of PINNs applied to a 2D Navier-Stokes +fluid flow problem. Our results demonstrate how IFs can be adapted to PINNs to +reveal the potential for further studies. The code is publicly available at +https://github.com/aleks-krasowski/PINNfluence. + +
+
+
+
+
+ + ♻ ☆ Circuit Complexity Bounds for RoPE-based Transformer Architecture + + +
+ Characterizing the expressive power of the Transformer architecture is critical +to understanding its capacity limits and scaling law. Recent works provide the +circuit complexity bounds to Transformer-like architecture. On the other hand, +Rotary Position Embedding ($\mathsf{RoPE}$) has emerged as a crucial technique +in modern large language models, offering superior performance in capturing +positional information compared to traditional position embeddings, which shows +great potential in application prospects, particularly for the long context +scenario. Empirical evidence also suggests that $\mathsf{RoPE}$-based +Transformer architectures demonstrate greater generalization capabilities +compared to conventional Transformer models. In this work, we establish a +circuit complexity bound for Transformers with $\mathsf{RoPE}$ attention. Our +key contribution is that we show that unless $\mathsf{TC}^0 = \mathsf{NC}^1$, a +$\mathsf{RoPE}$-based Transformer with $\mathrm{poly}(n)$-precision, $O(1)$ +layers, hidden dimension $d \leq O(n)$ cannot solve the Arithmetic formula +evaluation problem or the Boolean formula value problem. This result +significantly demonstrates the fundamental limitation of the expressivity of +the $\mathsf{RoPE}$-based Transformer architecture, although it achieves giant +empirical success. Our theoretical result not only establishes the complexity +bound but also may instruct further work on the $\mathsf{RoPE}$-based +Transformer. + +
+
+
+
+
+ + ♻ ☆ MM-Vet v2: A Challenging Benchmark to Evaluate Large Multimodal Models + for Integrated Capabilities + + +
+ MM-Vet, with open-ended vision-language questions targeting at evaluating +integrated capabilities, has become one of the most popular benchmarks for +large multimodal model evaluation. MM-Vet assesses six core vision-language +(VL) capabilities: recognition, knowledge, spatial awareness, language +generation, OCR, and math. However, its question format is restricted to single +image-text pairs, lacking the interleaved image and text sequences prevalent in +real-world scenarios. To address this limitation, we introduce MM-Vet v2, which +includes a new VL capability called "image-text sequence understanding", +evaluating models' ability to process VL sequences. Furthermore, we maintain +the high quality of evaluation samples while further expanding the evaluation +set size. Using MM-Vet v2 to benchmark large multimodal models, we found that +Claude 3.5 Sonnet is the best model with a score of 71.8, slightly +outperforming GPT-4o which scored 71.0. Among open-weight models, +InternVL2-Llama3-76B leads with a score of 68.4. The code, data, and +leaderboard are accessible at https://github.com/yuweihao/MM-Vet. + +
+
+ comment: Code, data and leaderboard: https://github.com/yuweihao/MM-Vet +
+
+
+
+
+ + ♻ ☆ MM-Vet: Evaluating Large Multimodal Models for Integrated Capabilities ICML 2024 + + +
+ We propose MM-Vet, an evaluation benchmark that examines large multimodal +models (LMMs) on complicated multimodal tasks. Recent LMMs have shown various +intriguing abilities, such as solving math problems written on the blackboard, +reasoning about events and celebrities in news images, and explaining visual +jokes. Rapid model advancements pose challenges to evaluation benchmark +development. Problems include: (1) How to systematically structure and evaluate +the complicated multimodal tasks; (2) How to design evaluation metrics that +work well across question and answer types; and (3) How to give model insights +beyond a simple performance ranking. To this end, we present MM-Vet, designed +based on the insight that the intriguing ability to solve complicated tasks is +often achieved by a generalist model being able to integrate different core +vision-language (VL) capabilities. MM-Vet defines 6 core VL capabilities and +examines the 16 integrations of interest derived from the capability +combination. For evaluation metrics, we propose an LLM-based evaluator for +open-ended outputs. The evaluator enables the evaluation across different +question types and answer styles, resulting in a unified scoring metric. We +evaluate representative LMMs on MM-Vet, providing insights into the +capabilities of different LMM system paradigms and models. + +
+
+ comment: ICML 2024. Code, data and leaderboard: + https://github.com/yuweihao/MM-Vet +
+
+
+
+
+ + ♻ ☆ Potential Field Based Deep Metric Learning + + +
+ Deep metric learning (DML) involves training a network to learn a +semantically meaningful representation space. Many current approaches mine +n-tuples of examples and model interactions within each tuplets. We present a +novel, compositional DML model, inspired by electrostatic fields in physics +that, instead of in tuples, represents the influence of each example +(embedding) by a continuous potential field, and superposes the fields to +obtain their combined global potential field. We use attractive/repulsive +potential fields to represent interactions among embeddings from images of the +same/different classes. Contrary to typical learning methods, where mutual +influence of samples is proportional to their distance, we enforce reduction in +such influence with distance, leading to a decaying field. We show that such +decay helps improve performance on real world datasets with large intra-class +variations and label noise. Like other proxy-based methods, we also use proxies +to succinctly represent sub-populations of examples. We evaluate our method on +three standard DML benchmarks- Cars-196, CUB-200-2011, and SOP datasets where +it outperforms state-of-the-art baselines. + +
+
+
+
+
+ + ♻ ☆ Introduction to Reinforcement Learning + + +
+ Reinforcement Learning (RL), a subfield of Artificial Intelligence (AI), +focuses on training agents to make decisions by interacting with their +environment to maximize cumulative rewards. This paper provides an overview of +RL, covering its core concepts, methodologies, and resources for further +learning. It offers a thorough explanation of fundamental components such as +states, actions, policies, and reward signals, ensuring readers develop a solid +foundational understanding. Additionally, the paper presents a variety of RL +algorithms, categorized based on the key factors such as model-free, +model-based, value-based, policy-based, and other key factors. Resources for +learning and implementing RL, such as books, courses, and online communities +are also provided. By offering a clear, structured introduction, this paper +aims to simplify the complexities of RL for beginners, providing a +straightforward pathway to understanding and applying real-time techniques. + +
+
+ comment: 19 pages +
+
+
+
+
+ + ♻ ☆ SongBsAb: A Dual Prevention Approach against Singing Voice Conversion + based Illegal Song Covers NDSS + + +
+ Singing voice conversion (SVC) automates song covers by converting a source +singing voice from a source singer into a new singing voice with the same +lyrics and melody as the source, but sounds like being covered by the target +singer of some given target singing voices. However, it raises serious concerns +about copyright and civil right infringements. We propose SongBsAb, the first +proactive approach to tackle SVC-based illegal song covers. SongBsAb adds +perturbations to singing voices before releasing them, so that when they are +used, the process of SVC will be interfered, leading to unexpected singing +voices. Perturbations are carefully crafted to (1) provide a dual prevention, +i.e., preventing the singing voice from being used as the source and target +singing voice in SVC, by proposing a gender-transformation loss and a high/low +hierarchy multi-target loss, respectively; and (2) be harmless, i.e., no +side-effect on the enjoyment of protected songs, by refining a psychoacoustic +model-based loss with the backing track as an additional masker, a unique +accompanying element for singing voices compared to ordinary speech voices. We +also adopt a frame-level interaction reduction-based loss and encoder ensemble +to enhance the transferability of SongBsAb to unknown SVC models. We +demonstrate the prevention effectiveness, harmlessness, and robustness of +SongBsAb on five diverse and promising SVC models, using both English and +Chinese datasets, and both objective and human study-based subjective metrics. +Our work fosters an emerging research direction for mitigating illegal +automated song covers. + +
+
+ comment: In Proceedings of the 32nd Network and Distributed System Security + (NDSS) Symposium 2025 +
+
+
+
+
+ + ♻ ☆ Job-SDF: A Multi-Granularity Dataset for Job Skill Demand Forecasting + and Benchmarking NeurIPS 2024 + + +
+ In a rapidly evolving job market, skill demand forecasting is crucial as it +enables policymakers and businesses to anticipate and adapt to changes, +ensuring that workforce skills align with market needs, thereby enhancing +productivity and competitiveness. Additionally, by identifying emerging skill +requirements, it directs individuals towards relevant training and education +opportunities, promoting continuous self-learning and development. However, the +absence of comprehensive datasets presents a significant challenge, impeding +research and the advancement of this field. To bridge this gap, we present +Job-SDF, a dataset designed to train and benchmark job-skill demand forecasting +models. Based on 10.35 million public job advertisements collected from major +online recruitment platforms in China between 2021 and 2023, this dataset +encompasses monthly recruitment demand for 2,324 types of skills across 521 +companies. Our dataset uniquely enables evaluating skill demand forecasting +models at various granularities, including occupation, company, and regional +levels. We benchmark a range of models on this dataset, evaluating their +performance in standard scenarios, in predictions focused on lower value +ranges, and in the presence of structural breaks, providing new insights for +further research. Our code and dataset are publicly accessible via the +https://github.com/Job-SDF/benchmark. + +
+
+ comment: NeurIPS 2024 Accepted +
+
+
+
+
+ + ♻ ☆ Voice Attribute Editing with Text Prompt + + +
+ Despite recent advancements in speech generation with text prompt providing
+control over speech style, voice attributes in synthesized speech remain
+elusive and challenging to control. This paper introduces a novel task: voice
+attribute editing with text prompt, with the goal of making relative
+modifications to voice attributes according to the actions described in the
+text prompt. To solve this task, VoxEditor, an end-to-end generative model, is
+proposed. In VoxEditor, addressing the insufficiency of text prompt, a Residual
+Memory (ResMem) block is designed, which efficiently maps voice attributes and
+these descriptors into the shared feature space. Additionally, the ResMem block
+is enhanced with a voice attribute degree prediction (VADP) block to align
+voice attributes with corresponding descriptors, addressing the imprecision of
+text prompt caused by non-quantitative descriptions of voice attributes. We
+also establish the open-source VCTK-RVA dataset, which leads the way in manual
+annotations detailing voice characteristic differences among different
+speakers. Extensive experiments demonstrate the effectiveness and
+generalizability of our proposed method in terms of both objective and
+subjective metrics. The dataset and audio samples are available on the website.
+
+
+
+
+
+
+ + ♻ ☆ FlickerFusion: Intra-trajectory Domain Generalizing Multi-Agent RL NeurIPS '24 + + +
+ Multi-agent reinforcement learning has demonstrated significant potential in +addressing complex cooperative tasks across various real-world applications. +However, existing MARL approaches often rely on the restrictive assumption that +the number of entities (e.g., agents, obstacles) remains constant between +training and inference. This overlooks scenarios where entities are dynamically +removed or added during the inference trajectory -- a common occurrence in +real-world environments like search and rescue missions and dynamic combat +situations. In this paper, we tackle the challenge of intra-trajectory dynamic +entity composition under zero-shot out-of-domain (OOD) generalization, where +such dynamic changes cannot be anticipated beforehand. Our empirical studies +reveal that existing MARL methods suffer significant performance degradation +and increased uncertainty in these scenarios. In response, we propose +FlickerFusion, a novel OOD generalization method that acts as a universally +applicable augmentation technique for MARL backbone methods. FlickerFusion +stochastically drops out parts of the observation space, emulating being +in-domain when inferenced OOD. The results show that FlickerFusion not only +achieves superior inference rewards but also uniquely reduces uncertainty +vis-\`a-vis the backbone, compared to existing methods. Benchmarks, +implementations, and model weights are organized and open-sourced at +flickerfusion305.github.io, accompanied by ample demo video renderings. + +
+
+ comment: NeurIPS '24 Open-World Agents Workshop (v2: minor revision) +
+
+
+
+
+ + ♻ ☆ DistRL: An Asynchronous Distributed Reinforcement Learning Framework for + On-Device Control Agents + + +
+ On-device control agents, especially on mobile devices, are responsible for +operating mobile devices to fulfill users' requests, enabling seamless and +intuitive interactions. Integrating Multimodal Large Language Models (MLLMs) +into these agents enhances their ability to understand and execute complex +commands, thereby improving user experience. However, fine-tuning MLLMs for +on-device control presents significant challenges due to limited data +availability and inefficient online training processes. This paper introduces +DistRL, a novel framework designed to enhance the efficiency of online RL +fine-tuning for mobile device control agents. DistRL employs centralized +training and decentralized data acquisition to ensure efficient fine-tuning in +the context of dynamic online interactions. Additionally, the framework is +backed by our tailor-made RL algorithm, which effectively balances exploration +with the prioritized utilization of collected data to ensure stable and robust +training. Our experiments show that, on average, DistRL delivers a 3X +improvement in training efficiency and enables training data collection 2.4X +faster than the leading synchronous multi-machine methods. Notably, after +training, DistRL achieves a 20% relative improvement in success rate compared +to state-of-the-art methods on general Android tasks from an open benchmark, +significantly outperforming existing approaches while maintaining the same +training time. These results validate DistRL as a scalable and efficient +solution, offering substantial improvements in both training efficiency and +agent performance for real-world, in-the-wild device control tasks. + +
+
+ comment: Paper and Appendix, 26 pages +
+
+
+
+
+ + ♻ ☆ Batch Calibration: Rethinking Calibration for In-Context Learning and + Prompt Engineering ICLR 2024 + + +
+ Prompting and in-context learning (ICL) have become efficient learning
+paradigms for large language models (LLMs). However, LLMs suffer from prompt
+brittleness and various bias factors in the prompt, including but not limited
+to the formatting, the choice of verbalizers, and the ICL examples. To address
+this problem that results in unexpected performance degradation, calibration
+methods have been developed to mitigate the effects of these biases while
+recovering LLM performance. In this work, we first conduct a systematic
+analysis of the existing calibration methods, where we both provide a unified
+view and reveal the failure cases. Inspired by these analyses, we propose Batch
+Calibration (BC), a simple yet intuitive method that controls the contextual
+bias from the batched input, unifies various prior approaches, and effectively
+addresses the aforementioned issues. BC is zero-shot, inference-only, and
+incurs negligible additional costs. In the few-shot setup, we further extend BC
+to allow it to learn the contextual bias from labeled data. We validate the
+effectiveness of BC with PaLM 2-(S, M, L) and CLIP models and demonstrate
+state-of-the-art performance over previous calibration baselines across more
+than 10 natural language understanding and image classification tasks.
+
+
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ G-RAG: Knowledge Expansion in Material Science + + +
+ In the field of Material Science, effective information retrieval systems are +essential for facilitating research. Traditional Retrieval-Augmented Generation +(RAG) approaches in Large Language Models (LLMs) often encounter challenges +such as outdated information, hallucinations, limited interpretability due to +context constraints, and inaccurate retrieval. To address these issues, Graph +RAG integrates graph databases to enhance the retrieval process. Our proposed +method processes Material Science documents by extracting key entities +(referred to as MatIDs) from sentences, which are then utilized to query +external Wikipedia knowledge bases (KBs) for additional relevant information. +We implement an agent-based parsing technique to achieve a more detailed +representation of the documents. Our improved version of Graph RAG called G-RAG +further leverages a graph database to capture relationships between these +entities, improving both retrieval accuracy and contextual understanding. This +enhanced approach demonstrates significant improvements in performance for +domains that require precise information retrieval, such as Material Science. + +
+
+
+
+
+ + ♻ ☆ Alternators For Sequence Modeling + + +
+ This paper introduces alternators, a novel family of non-Markovian dynamical +models for sequences. An alternator features two neural networks: the +observation trajectory network (OTN) and the feature trajectory network (FTN). +The OTN and the FTN work in conjunction, alternating between outputting samples +in the observation space and some feature space, respectively, over a cycle. +The parameters of the OTN and the FTN are not time-dependent and are learned +via a minimum cross-entropy criterion over the trajectories. Alternators are +versatile. They can be used as dynamical latent-variable generative models or +as sequence-to-sequence predictors. Alternators can uncover the latent dynamics +underlying complex sequential data, accurately forecast and impute missing +data, and sample new trajectories. We showcase the capabilities of alternators +in three applications. We first used alternators to model the Lorenz equations, +often used to describe chaotic behavior. We then applied alternators to +Neuroscience, to map brain activity to physical activity. Finally, we applied +alternators to Climate Science, focusing on sea-surface temperature +forecasting. In all our experiments, we found alternators are stable to train, +fast to sample from, yield high-quality generated samples and latent variables, +and often outperform strong baselines such as Mambas, neural ODEs, and +diffusion models in the domains we studied. + +
+
+ comment: A new versatile family of sequence models that can be used for both + generative modeling and supervised learning. The codebase will be made + available upon publication. This paper is dedicated to Thomas Sankara +
+
+
+
+
+ + ♻ ☆ Support-Set Context Matters for Bongard Problems + + +
+ Current machine learning methods struggle to solve Bongard problems, which +are a type of IQ test that requires deriving an abstract "concept" from a set +of positive and negative "support" images, and then classifying whether or not +a new query image depicts the key concept. On Bongard-HOI, a benchmark for +natural-image Bongard problems, most existing methods have reached at best 69% +accuracy (where chance is 50%). Low accuracy is often attributed to neural +nets' lack of ability to find human-like symbolic rules. In this work, we point +out that many existing methods are forfeiting accuracy due to a much simpler +problem: they do not adapt image features given information contained in the +support set as a whole, and rely instead on information extracted from +individual supports. This is a critical issue, because the "key concept" in a +typical Bongard problem can often only be distinguished using multiple +positives and multiple negatives. We explore simple methods to incorporate this +context and show substantial gains over prior works, leading to new +state-of-the-art accuracy on Bongard-LOGO (75.3%) and Bongard-HOI (76.4%) +compared to methods with equivalent vision backbone architectures and strong +performance on the original Bongard problem set (60.8%). + +
+
+ comment: TMLR October 2024. Code: + https://github.com/nraghuraman/bongard-context +
+
+
+
+
+
+
+
+ + Machine Learning 50 + +
+
+
+ + ♻ ☆ UTG: Towards a Unified View of Snapshot and Event Based Models for + Temporal Graphs + + +
+ Many real world graphs are inherently dynamic, constantly evolving with node
+and edge additions. These graphs can be represented by temporal graphs, either
+through a stream of edge events or a sequence of graph snapshots. Until now,
+the development of machine learning methods for both types has occurred largely
+in isolation, resulting in limited experimental comparison and theoretical
+cross-pollination between the two. In this paper, we introduce Unified Temporal
+Graph (UTG), a framework that unifies snapshot-based and event-based machine
+learning models under a single umbrella, enabling models developed for one
+representation to be applied effectively to datasets of the other. We also
+propose a novel UTG training procedure to boost the performance of
+snapshot-based models in the streaming setting. We comprehensively evaluate
+both snapshot and event-based models across both types of temporal graphs on
+the temporal link prediction task. Our main findings are threefold: first, when
+combined with UTG training, snapshot-based models can perform competitively
+with event-based models such as TGN and GraphMixer even on event datasets.
+Second, snapshot-based models are at least an order of magnitude faster than
+most event-based models during inference. Third, while event-based methods such
+as NAT and DyGFormer outperform snapshot-based methods on both types of
+temporal graphs, this is because they leverage joint neighborhood structural
+features thus emphasizing the potential to incorporate these features into
+snapshot-based models as well. These findings highlight the importance of
+comparing model architectures independent of the data format and suggest the
+potential of combining the efficiency of snapshot-based models with the
+performance of event-based models in the future.
+
+
+
+
+
+
+ + ♻ ☆ Efficient Deployment of Transformer Models in Analog In-Memory Computing + Hardware + + +
+ Analog in-memory computing (AIMC) has emerged as a promising solution to +overcome the von Neumann bottleneck, accelerating neural network computations +and improving computational efficiency. While AIMC has demonstrated success +with architectures such as CNNs, MLPs, and RNNs, deploying transformer-based +models using AIMC presents unique challenges. Transformers are expected to +handle diverse downstream tasks and adapt to new user data or instructions +after deployment, which requires more flexible approaches to suit AIMC +constraints. + In this paper, we propose a novel method for deploying pre-trained +transformer models onto AIMC hardware. Unlike traditional approaches requiring +hardware-aware training, our technique allows direct deployment without the +need for retraining the original model. Instead, we utilize lightweight, +low-rank adapters -- compact modules stored in digital cores -- to adapt the +model to hardware constraints. We validate our approach on MobileBERT, +demonstrating accuracy on par with, or even exceeding, a traditional +hardware-aware training approach. Our method is particularly appealing in +multi-task scenarios, as it enables a single analog model to be reused across +multiple tasks. Moreover, it supports on-chip adaptation to new hardware +constraints and tasks without updating analog weights, providing a flexible and +versatile solution for real-world AI applications. Code is available. + +
+
+
+
+
+ + ♻ ☆ Enhancing supply chain security with automated machine learning + + +
+ The increasing scale and complexity of global supply chains have led to new +challenges spanning various fields, such as supply chain disruptions due to +long waiting lines at the ports, material shortages, and inflation. Coupled +with the size of supply chains and the availability of vast amounts of data, +efforts towards tackling such challenges have led to an increasing interest in +applying machine learning methods in many aspects of supply chains. Unlike +other solutions, ML techniques, including Random Forest, XGBoost, LightGBM, and +Neural Networks, make predictions and approximate optimal solutions faster. +This paper presents an automated ML framework to enhance supply chain security +by detecting fraudulent activities, predicting maintenance needs, and +forecasting material backorders. Using datasets of varying sizes, results show +that fraud detection achieves an 88% accuracy rate using sampling methods, +machine failure prediction reaches 93.4% accuracy, and material backorder +prediction achieves 89.3% accuracy. Hyperparameter tuning significantly +improved the performance of these models, with certain supervised techniques +like XGBoost and LightGBM reaching up to 100% precision. This research +contributes to supply chain security by streamlining data preprocessing, +feature selection, model optimization, and inference deployment, addressing +critical challenges and boosting operational efficiency. + +
+
+ comment: 26 pages +
+
+
+
+
+ + ♻ ☆ Comprehensive framework for evaluation of deep neural networks in + detection and quantification of lymphoma from PET/CT images: clinical + insights, pitfalls, and observer agreement analyses + + +
+ This study addresses critical gaps in automated lymphoma segmentation from
+PET/CT images, focusing on issues often overlooked in existing literature.
+While deep learning has been applied for lymphoma lesion segmentation, few
+studies incorporate out-of-distribution testing, raising concerns about model
+generalizability across diverse imaging conditions and patient populations. We
+highlight the need to compare model performance with expert human annotators,
+including intra- and inter-observer variability, to understand task difficulty
+better. Most approaches focus on overall segmentation accuracy but overlook
+lesion-specific metrics important for precise lesion detection and disease
+quantification. To address these gaps, we propose a clinically-relevant
+framework for evaluating deep neural networks. Using this lesion-specific
+evaluation, we assess the performance of four deep segmentation networks
+(ResUNet, SegResNet, DynUNet, and SwinUNETR) across 611 cases from
+multi-institutional datasets, covering various lymphoma subtypes and lesion
+characteristics. Beyond standard metrics like the Dice similarity coefficient
+(DSC), we evaluate clinical lesion measures and their prediction errors. We
+also introduce detection criteria for lesion localization and propose a new
+detection Criterion 3 based on metabolic characteristics. We show that networks
+perform better on large, intense lesions with higher metabolic
+activity. Finally, we compare network performance to expert human observers via
+intra- and inter-observer variability analyses, demonstrating that network
+errors closely resemble those made by experts. Some small, faint lesions remain
+challenging for both humans and networks. This study aims to improve automated
+lesion segmentation's clinical relevance, supporting better treatment decisions
+for lymphoma patients. The code is available at:
+https://github.com/microsoft/lymphoma-segmentation-dnn
+
+
+
+ comment: 32 pages, 15 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Instruction Tuning for Large Language Models: A Survey + + +
+ This paper surveys research works in the quickly advancing field of
+instruction tuning (IT), which can also be referred to as supervised
+fine-tuning (SFT)\footnote{In this paper, unless specified otherwise,
+supervised fine-tuning (SFT) and instruction tuning (IT) are used
+interchangeably.}, a crucial technique to enhance the capabilities and
+controllability of large language models (LLMs). Instruction tuning refers to
+the process of further training LLMs on a dataset consisting of
+\textsc{(instruction, output)} pairs in a supervised fashion, which bridges the
+gap between the next-word prediction objective of LLMs and the users' objective
+of having LLMs adhere to human instructions. In this work, we make a systematic
+review of the literature, including the general methodology of SFT, the
+construction of SFT datasets, the training of SFT models, and applications to
+different modalities, domains and applications, along with analysis on aspects
+that influence the outcome of SFT (e.g., generation of instruction outputs,
+size of the instruction dataset, etc). We also review the potential pitfalls of
+SFT along with criticism against it, along with efforts pointing out current
+deficiencies of existing strategies and suggest some avenues for fruitful
+research. Project Page: github.com/xiaoya-li/Instruction-Tuning-Survey
+
+
+
+ comment: V5; Last update: Dec. 1, 2024 +
+
+
+
+
+ + ♻ ☆ Models That Prove Their Own Correctness + + +
+ How can we trust the correctness of a learned model on a particular input of +interest? Model accuracy is typically measured *on average* over a distribution +of inputs, giving no guarantee for any fixed input. This paper proposes a +theoretically-founded solution to this problem: to train *Self-Proving models* +that prove the correctness of their output to a verification algorithm $V$ via +an Interactive Proof. Self-Proving models satisfy that, with high probability +over a random input, the model generates a correct output *and* successfully +proves its correctness to $V\!$. The *soundness* property of $V$ guarantees +that, for *every* input, no model can convince $V$ of the correctness of an +incorrect output. Thus, a Self-Proving model proves correctness of most of its +outputs, while *all* incorrect outputs (of any model) are detected by $V$. We +devise a generic method for learning Self-Proving models, and we prove +convergence bounds under certain assumptions. The theoretical framework and +results are complemented by experiments on an arithmetic capability: computing +the greatest common divisor (GCD) of two integers. Our learning method is used +to train a Self-Proving transformer that computes the GCD *and* proves the +correctness of its answer. + +
+
+
+
+
+ + ♻ ☆ Adaptive Constraint Integration for Simultaneously Optimizing Crystal + Structures with Multiple Targeted Properties + + +
+ In materials science, finding crystal structures that have targeted +properties is crucial. While recent methodologies such as Bayesian optimization +and deep generative models have made some advances on this issue, these methods +often face difficulties in adaptively incorporating various constraints, such +as electrical neutrality and targeted properties optimization, while keeping +the desired specific crystal structure. To address these challenges, we have +developed the Simultaneous Multi-property Optimization using Adaptive Crystal +Synthesizer (SMOACS), which utilizes state-of-the-art property prediction +models and their gradients to directly optimize input crystal structures for +targeted properties simultaneously. SMOACS enables the integration of adaptive +constraints into the optimization process without necessitating model +retraining. Thanks to this feature, SMOACS has succeeded in simultaneously +optimizing targeted properties while maintaining perovskite structures, even +with models trained on diverse crystal types. We have demonstrated the band gap +optimization while meeting a challenging constraint, that is, maintaining +electrical neutrality in large atomic configurations up to 135 atom sites, +where the verification of the electrical neutrality is challenging. The +properties of the most promising materials have been confirmed by density +functional theory calculations. + +
+
+
+
+
+ + ♻ ☆ Counting Like Transformers: Compiling Temporal Counting Logic Into + Softmax Transformers + + +
+ Deriving formal bounds on the expressivity of transformers, as well as +studying transformers that are constructed to implement known algorithms, are +both effective methods for better understanding the computational power of +transformers. Towards both ends, we introduce the temporal counting logic +$\textsf{K}_\text{t}$[#] alongside the RASP variant $\textsf{C-RASP}$. We show +they are equivalent to each other, and that together they are the best-known +lower bound on the formal expressivity of future-masked soft attention +transformers with unbounded input size. We prove this by showing all +$\textsf{K}_\text{t}$[#] formulas can be compiled into these transformers. + +
+
+
+
+
+ + ♻ ☆ Combining Blockchain and Biometrics: A Survey on Technical Aspects and a + First Legal Analysis + + +
+ Biometric recognition as a unique, hard-to-forge, and efficient way of +identification and verification has become an indispensable part of the current +digital world. The fast evolution of this technology has been a strong +incentive for integrating it into many applications. Meanwhile, blockchain, the +very attractive decentralized ledger technology, has been widely received both +by the research and industry in the past years and it is being increasingly +deployed nowadays in many different applications, such as money transfer, IoT, +healthcare, or logistics. Recently, researchers have started to speculate what +would be the pros and cons and what would be the best applications when these +two technologies cross paths. This paper provides a survey of technical +literature research on the combination of blockchain and biometrics and +includes a first legal analysis of this integration to shed light on challenges +and potentials. While this combination is still in its infancy and a growing +body of literature discusses specific blockchain applications and solutions in +an advanced technological set-up, this paper presents a holistic understanding +of blockchains applicability in the biometric sector. This study demonstrates +that combining blockchain and biometrics would be beneficial for novel +applications in biometrics such as the PKI mechanism, distributed trusted +service, and identity management. However, blockchain networks at their current +stage are not efficient and economical for real-time applications. From a legal +point of view, the allocation of accountability remains a main issue, while +other difficulties remain, such as conducting a proper Data Protection Impact +Assessment. Finally, it supplies technical and legal recommendations to reap +the benefits and mitigate the risks of the combination. + +
+
+
+
+
+ + ♻ ☆ SaFL: Sybil-aware Federated Learning with Application to Face + Recognition + + +
+ Federated Learning (FL) is a machine learning paradigm to conduct +collaborative learning among clients on a joint model. The primary goal is to +share clients' local training parameters with an integrating server while +preserving their privacy. This method permits to exploit the potential of +massive mobile users' data for the benefit of machine learning models' +performance while keeping sensitive data on local devices. On the downside, FL +raises security and privacy concerns that have just started to be studied. To +address some of the key threats in FL, researchers have proposed to use secure +aggregation methods (e.g. homomorphic encryption, secure multiparty +computation, etc.). These solutions improve some security and privacy metrics, +but at the same time bring about other serious threats such as poisoning +attacks, backdoor attacks, and free running attacks. This paper proposes a new +defense method against poisoning attacks in FL called SaFL (Sybil-aware +Federated Learning) that minimizes the effect of sybils with a novel +time-variant aggregation scheme. + +
+
+
+
+
+ + ♻ ☆ Rotation Invariant Quantization for Model Compression + + +
+ Post-training Neural Network (NN) model compression is an attractive approach +for deploying large, memory-consuming models on devices with limited memory +resources. In this study, we investigate the rate-distortion tradeoff for NN +model compression. First, we suggest a Rotation-Invariant Quantization (RIQ) +technique that utilizes a single parameter to quantize the entire NN model, +yielding a different rate at each layer, i.e., mixed-precision quantization. +Then, we prove that our rotation-invariant approach is optimal in terms of +compression. We rigorously evaluate RIQ and demonstrate its capabilities on +various models and tasks. For example, RIQ facilitates $\times 19.4$ and +$\times 52.9$ compression ratios on pre-trained VGG dense and pruned models, +respectively, with $<0.4\%$ accuracy degradation. Code is available in +\href{https://github.com/ehaleva/RIQ}{github.com/ehaleva/RIQ}. + +
+
+ comment: 20 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Improving Shift Invariance in Convolutional Neural Networks with + Translation Invariant Polyphase Sampling WACV 2025 + + +
+ Downsampling operators break the shift invariance of convolutional neural +networks (CNNs) and this affects the robustness of features learned by CNNs +when dealing with even small pixel-level shift. Through a large-scale +correlation analysis framework, we study shift invariance of CNNs by inspecting +existing downsampling operators in terms of their maximum-sampling bias (MSB), +and find that MSB is negatively correlated with shift invariance. Based on this +crucial insight, we propose a learnable pooling operator called Translation +Invariant Polyphase Sampling (TIPS) and two regularizations on the intermediate +feature maps of TIPS to reduce MSB and learn translation-invariant +representations. TIPS can be integrated into any CNN and can be trained +end-to-end with marginal computational overhead. Our experiments demonstrate +that TIPS results in consistent performance gains in terms of accuracy, shift +consistency, and shift fidelity on multiple benchmarks for image classification +and semantic segmentation compared to previous methods and also leads to +improvements in adversarial and distributional robustness. TIPS results in the +lowest MSB compared to all previous methods, thus explaining our strong +empirical results. + +
+
+ comment: Accepted to WACV 2025 +
+
+
+
+
+ + ♻ ☆ Decoupled Vertical Federated Learning for Practical Training on + Vertically Partitioned Data + + +
+ Vertical Federated Learning (VFL) is an emergent distributed machine learning +paradigm for collaborative learning between clients who have disjoint features +of common entities. However, standard VFL lacks fault tolerance, with each +participant and connection being a single point of failure. Prior attempts to +induce fault tolerance in VFL focus on the scenario of "straggling clients", +usually entailing that all messages eventually arrive or that there is an upper +bound on the number of late messages. To handle the more general problem of +arbitrary crashes, we propose Decoupled VFL (DVFL). To handle training with +faults, DVFL decouples training between communication rounds using local +unsupervised objectives. By further decoupling label supervision from +aggregation, DVFL also enables redundant aggregators. As secondary benefits, +DVFL can enhance data efficiency and provides immunity against gradient-based +attacks. In this work, we implement DVFL for split neural networks with a +self-supervised autoencoder loss. When there are faults, DVFL outperforms the +best VFL-based alternative (97.58% vs 96.95% on an MNIST task). Even under +perfect conditions, performance is comparable. + +
+
+ comment: Revised manuscript. Nothing removed, additional baseline results + added +
+
+
+
+
+ + ♻ ☆ BERT or FastText? A Comparative Analysis of Contextual as well as + Non-Contextual Embeddings + + +
+ Natural Language Processing (NLP) for low-resource languages presents +significant challenges, particularly due to the scarcity of high-quality +annotated data and linguistic resources. The choice of embeddings plays a +critical role in enhancing the performance of NLP tasks, such as news +classification, sentiment analysis, and hate speech detection, especially for +low-resource languages like Marathi. In this study, we investigate the impact +of various embedding techniques- Contextual BERT-based, Non-Contextual +BERT-based, and FastText-based on NLP classification tasks specific to the +Marathi language. Our research includes a thorough evaluation of both +compressed and uncompressed embeddings, providing a comprehensive overview of +how these embeddings perform across different scenarios. Specifically, we +compare two BERT model embeddings, Muril and MahaBERT, as well as two FastText +model embeddings, IndicFT and MahaFT. Our evaluation includes applying +embeddings to a Multiple Logistic Regression (MLR) classifier for task +performance assessment, as well as TSNE visualizations to observe the spatial +distribution of these embeddings. The results demonstrate that contextual +embeddings outperform non-contextual embeddings. Furthermore, BERT-based +non-contextual embeddings extracted from the first BERT embedding layer yield +better results than FastText-based embeddings, suggesting a potential +alternative to FastText embeddings. + +
+
+
+
+
+ + ♻ ☆ Unified Universality Theorem for Deep and Shallow + Joint-Group-Equivariant Machines + + +
+ We present a constructive universal approximation theorem for learning +machines equipped with joint-group-equivariant feature maps, called the +joint-equivariant machines, based on the group representation theory. +"Constructive" here indicates that the distribution of parameters is given in a +closed-form expression known as the ridgelet transform. +Joint-group-equivariance encompasses a broad class of feature maps that +generalize classical group-equivariance. Particularly, fully-connected networks +are not group-equivariant but are joint-group-equivariant. Our main theorem +also unifies the universal approximation theorems for both shallow and deep +networks. Until this study, the universality of deep networks has been shown in +a different manner from the universality of shallow networks, but our results +discuss them on common ground. Now we can understand the approximation schemes +of various learning machines in a unified manner. As applications, we show the +constructive universal approximation properties of four examples: depth-$n$ +joint-equivariant machine, depth-$n$ fully-connected network, depth-$n$ +group-convolutional network, and a new depth-$2$ network with quadratic forms +whose universality has not been known. + +
+
+
+
+
+ + ♻ ☆ WaKA: Data Attribution using K-Nearest Neighbors and Membership Privacy + Principles + + +
+ In this paper, we introduce WaKA (Wasserstein K-nearest-neighbors +Attribution), a novel attribution method that leverages principles from the +LiRA (Likelihood Ratio Attack) framework and k-nearest neighbors classifiers +(k-NN). WaKA efficiently measures the contribution of individual data points to +the model's loss distribution, analyzing every possible k-NN that can be +constructed using the training set, without requiring to sample subsets of the +training set. WaKA is versatile and can be used a posteriori as a membership +inference attack (MIA) to assess privacy risks or a priori for privacy +influence measurement and data valuation. Thus, WaKA can be seen as bridging +the gap between data attribution and membership inference attack (MIA) by +providing a unified framework to distinguish between a data point's value and +its privacy risk. For instance, we have shown that self-attribution values are +more strongly correlated with the attack success rate than the contribution of +a point to the model generalization. WaKA's different usage were also evaluated +across diverse real-world datasets, demonstrating performance very close to +LiRA when used as an MIA on k-NN classifiers, but with greater computational +efficiency. Additionally, WaKA shows greater robustness than Shapley Values for +data minimization tasks (removal or addition) on imbalanced datasets. + +
+
+
+
+
+ + ♻ ☆ PoCo: Policy Composition from and for Heterogeneous Robot Learning + + +
+ Training general robotic policies from heterogeneous data for different tasks +is a significant challenge. Existing robotic datasets vary in different +modalities such as color, depth, tactile, and proprioceptive information, and +collected in different domains such as simulation, real robots, and human +videos. Current methods usually collect and pool all data from one domain to +train a single policy to handle such heterogeneity in tasks and domains, which +is prohibitively expensive and difficult. In this work, we present a flexible +approach, dubbed Policy Composition, to combine information across such diverse +modalities and domains for learning scene-level and task-level generalized +manipulation skills, by composing different data distributions represented with +diffusion models. Our method can use task-level composition for multi-task +manipulation and be composed with analytic cost functions to adapt policy +behaviors at inference time. We train our method on simulation, human, and real +robot data and evaluate in tool-use tasks. The composed policy achieves robust +and dexterous performance under varying scenes and tasks and outperforms +baselines from a single data source in both simulation and real-world +experiments. See https://liruiw.github.io/policycomp for more details . + +
+
+ comment: R:SS 2024 +
+
+
+
+
+ + ♻ ☆ Burning RED: Unlocking Subtask-Driven Reinforcement Learning and + Risk-Awareness in Average-Reward Markov Decision Processes + + +
+ Average-reward Markov decision processes (MDPs) provide a foundational +framework for sequential decision-making under uncertainty. However, +average-reward MDPs have remained largely unexplored in reinforcement learning +(RL) settings, with the majority of RL-based efforts having been allocated to +episodic and discounted MDPs. In this work, we study a unique structural +property of average-reward MDPs and utilize it to introduce Reward-Extended +Differential (or RED) reinforcement learning: a novel RL framework that can be +used to effectively and efficiently solve various subtasks simultaneously in +the average-reward setting. We introduce a family of RED learning algorithms +for prediction and control, including proven-convergent algorithms for the +tabular case. We then showcase the power of these algorithms by demonstrating +how they can be used to learn a policy that optimizes, for the first time, the +well-known conditional value-at-risk (CVaR) risk measure in a fully-online +manner, without the use of an explicit bi-level optimization scheme or an +augmented state-space. + +
+
+
+
+
+ + ♻ ☆ Recurrent Aggregators in Neural Algorithmic Reasoning + + +
+ Neural algorithmic reasoning (NAR) is an emerging field that seeks to design +neural networks that mimic classical algorithmic computations. Today, graph +neural networks (GNNs) are widely used in neural algorithmic reasoners due to +their message passing framework and permutation equivariance. In this extended +abstract, we challenge this design choice, and replace the equivariant +aggregation function with a recurrent neural network. While seemingly +counter-intuitive, this approach has appropriate grounding when nodes have a +natural ordering -- and this is the case frequently in established reasoning +benchmarks like CLRS-30. Indeed, our recurrent NAR (RNAR) model performs very +strongly on such tasks, while handling many others gracefully. A notable +achievement of RNAR is its decisive state-of-the-art result on the Heapsort and +Quickselect tasks, both deemed as a significant challenge for contemporary +neural algorithmic reasoners -- especially the latter, where RNAR achieves a +mean micro-F1 score of 87%. + +
+
+ comment: Presented at the Third Learning on Graphs Conference (LoG 2024). 10 + pages, 1 figure +
+
+
+
+
+ + ♻ ☆ One to beat them all: "RYU" -- a unifying framework for the construction + of safe balls + + +
+ In this paper, we present a new framework, called "RYU" for constructing +"safe" regions -- specifically, bounded sets that are guaranteed to contain the +dual solution of a target optimization problem. Our framework applies to the +standard case where the objective function is composed of two components: a +closed, proper, convex function with Lipschitz-smooth gradient and another +closed, proper, convex function. We show that the RYU framework not only +encompasses but also improves upon the state-of-the-art methods proposed over +the past decade for this class of optimization problems. + +
+
+ comment: 19 pages, 1 table +
+
+
+
+
+ + ♻ ☆ Leveraging Retrieval-Augmented Generation for Persian University + Knowledge Retrieval + + +
+ This paper introduces an innovative approach using Retrieval-Augmented
+Generation (RAG) pipelines with Large Language Models (LLMs) to enhance
+information retrieval and query response systems for university-related
+question answering. By systematically extracting data from the university
+official webpage and employing advanced prompt engineering techniques, we
+generate accurate, contextually relevant responses to user queries.
+ We developed a comprehensive university benchmark, UniversityQuestionBench
+(UQB), to rigorously evaluate our system performance, based on common key
+metrics in the field of RAG pipelines, assessing accuracy and reliability
+through various metrics and real-world scenarios. Our experimental results
+demonstrate significant improvements in the precision and relevance of
+generated responses, enhancing user experience and reducing the time required
+to obtain relevant answers. In summary, this paper presents a novel application
+of RAG pipelines and LLMs, supported by a meticulously prepared university
+benchmark, offering valuable insights into advanced AI techniques for academic
+data retrieval and setting the stage for future research in this domain.
+
+
+ comment: 6 pages, 2 figures, 1 table, Submitted to 15th IKT conference +
+
+
+
+
+ + ♻ ☆ Reducing Reasoning Costs -- The Path of Optimization for Chain of + Thought via Sparse Attention Mechanism NeurIPS 2024 + + +
+ In order to address the chain of thought in the large language model +inference cost surge, this research proposes to use a sparse attention +mechanism that only focuses on a few relevant tokens. The researcher +constructed a new attention mechanism and used GiantRabbit trained with custom +GPTs as an experimental tool. The experiment tested and compared the reasoning +time, correctness score and chain of thought length of this model and o1 +Preview in solving the linear algebra test questions of MIT OpenCourseWare. The +results show that GiantRabbit's reasoning time and chain of thought length are +significantly lower than o1 Preview. It verifies the feasibility of sparse +attention mechanism for optimizing chain of thought reasoning. Detailed +architectural details and experimental process have been uploaded to Github, +the link is:https://github.com/brucewang123456789/GeniusTrail.git. + +
+
+ comment: The main text is 5 pages, totaling 9 pages; 4 figures, 1 table. It
+ has been submitted to NeurIPS 2024 Workshop MusIML and OpenReview
+
+
+
+
+
+ + ♻ ☆ Robust Federated Learning Over the Air: Combating Heavy-Tailed Noise + with Median Anchored Clipping + + +
+ Leveraging over-the-air computations for model aggregation is an effective +approach to cope with the communication bottleneck in federated edge learning. +By exploiting the superposition properties of multi-access channels, this +approach facilitates an integrated design of communication and computation, +thereby enhancing system privacy while reducing implementation costs. However, +the inherent electromagnetic interference in radio channels often exhibits +heavy-tailed distributions, giving rise to exceptionally strong noise in +globally aggregated gradients that can significantly deteriorate the training +performance. To address this issue, we propose a novel gradient clipping +method, termed Median Anchored Clipping (MAC), to combat the detrimental +effects of heavy-tailed noise. We also derive analytical expressions for the +convergence rate of model training with analog over-the-air federated learning +under MAC, which quantitatively demonstrates the effect of MAC on training +performance. Extensive experimental results show that the proposed MAC +algorithm effectively mitigates the impact of heavy-tailed noise, hence +substantially enhancing system robustness. + +
+
+ comment: This is the full version of the paper, and the appendix contains a + complete convergence analysis under non-convex conditions +
+
+
+
+
+ + ♻ ☆ Skew-Probabilistic Neural Networks for Learning from Imbalanced Data + + +
+ Real-world datasets often exhibit imbalanced data distribution, where certain +class levels are severely underrepresented. In such cases, traditional pattern +classifiers have shown a bias towards the majority class, impeding accurate +predictions for the minority class. This paper introduces an imbalanced +data-oriented classifier using probabilistic neural networks (PNN) with a +skew-normal kernel function to address this major challenge. PNN is known for +providing probabilistic outputs, enabling quantification of prediction +confidence, interpretability, and the ability to handle limited data. By +leveraging the skew-normal distribution, which offers increased flexibility, +particularly for imbalanced and non-symmetric data, our proposed +Skew-Probabilistic Neural Networks (SkewPNN) can better represent underlying +class densities. Hyperparameter fine-tuning is imperative to optimize the +performance of the proposed approach on imbalanced datasets. To this end, we +employ a population-based heuristic algorithm, the Bat optimization algorithm, +to explore the hyperparameter space effectively. We also prove the statistical +consistency of the density estimates, suggesting that the true distribution +will be approached smoothly as the sample size increases. Theoretical analysis +of the computational complexity of the proposed SkewPNN and BA-SkewPNN is also +provided. Numerical simulations have been conducted on different synthetic +datasets, comparing various benchmark-imbalanced learners. Real-data analysis +on several datasets shows that SkewPNN and BA-SkewPNN substantially outperform +most state-of-the-art machine-learning methods for both balanced and imbalanced +datasets (binary and multi-class categories) in most experimental settings. + +
+
+
+
+
+ + ♻ ☆ Computational Bottlenecks of Training Small-scale Large Language Models + + +
+ While large language models (LLMs) dominate the AI landscape, Small-scale +large Language Models (SLMs) are gaining attention due to cost and efficiency +demands from consumers. However, there is limited research on the training +behavior and computational requirements of SLMs. In this study, we explore the +computational bottlenecks of training SLMs (up to 2B parameters) by examining +the effects of various hyperparameters and configurations, including GPU type, +batch size, model size, communication protocol, attention type, and the number +of GPUs. We assess these factors on popular cloud services using metrics such +as loss per dollar and tokens per second. Our findings aim to support the +broader adoption and optimization of language model training for low-resource +AI research institutes. + +
+
+ comment: 8 pages, 4 figures +
+
+
+
+
+
+ ♻ ☆ Training a neural network for data reduction and better generalization
+
+
+
+ The motivation for sparse learners is to compress the inputs (features) by +selecting only the ones needed for good generalization. Linear models with +LASSO-type regularization achieve this by setting the weights of irrelevant +features to zero, effectively identifying and ignoring them. In artificial +neural networks, this selective focus can be achieved by pruning the input +layer. Given a cost function enhanced with a sparsity-promoting penalty, our +proposal selects a regularization term $\lambda$ (without the use of +cross-validation or a validation set) that creates a local minimum in the cost +function at the origin where no features are selected. This local minimum acts +as a baseline, meaning that if there is no strong enough signal to justify a +feature inclusion, the local minimum remains at zero with a high prescribed +probability. The method is flexible, applying to complex models ranging from +shallow to deep artificial neural networks and supporting various cost +functions and sparsity-promoting penalties. We empirically show a remarkable +phase transition in the probability of retrieving the relevant features, as +well as good generalization thanks to the choice of $\lambda$, the non-convex +penalty and the optimization scheme developed. This approach can be seen as a +form of compressed sensing for complex models, allowing us to distill +high-dimensional data into a compact, interpretable subset of meaningful +features. + +
+
+
+
+
+ + ♻ ☆ Binary Feature Mask Optimization for Feature Selection + + +
+ We investigate feature selection problem for generic machine learning models.
+We introduce a novel framework that selects features considering the outcomes
+of the model. Our framework introduces a novel feature masking approach to
+eliminate the features during the selection process, instead of completely
+removing them from the dataset. This allows us to use the same machine learning
+model during feature selection, unlike other feature selection methods where we
+need to train the machine learning model again as the dataset has different
+dimensions on each iteration. We obtain the mask operator using the predictions
+of the machine learning model, which offers a comprehensive view on the subsets
+of the features essential for the predictive performance of the model. A
+variety of approaches exist in the feature selection literature. However, to
+our knowledge, no study has introduced a training-free framework for a generic
+machine learning model to select features while considering the importance of
+the feature subsets as a whole, instead of focusing on the individual features.
+We demonstrate significant performance improvements on the real-life datasets
+under different settings using LightGBM and Multi-Layer Perceptron as our
+machine learning models. The high performance of our General Binary Mask
+Optimization algorithm stems from its feature masking approach to select
+features and its flexibility in the number of selected features. The algorithm
+selects features based on the validation performance of the machine learning
+model. Hence, the number of selected features is not predetermined and adjusts
+dynamically to the dataset. Additionally, we openly share the implementation of
+our code to encourage further research in this area.
+
+
+
+
+
+ + ♻ ☆ Corn Yield Prediction Model with Deep Neural Networks for Smallholder + Farmer Decision Support System + + +
+ Crop yield prediction has been modeled on the assumption that there is no
+interaction between weather and soil variables. However, this paper argues that
+an interaction exists, and it can be finely modelled using the Kendall
+Correlation coefficient. Given the nonlinearity of the interaction between
+weather and soil variables, a deep neural network regressor (DNNR) is carefully
+designed with consideration to the depth, number of neurons of the hidden
+layers, and the hyperparameters with their optimizations. Additionally, a new
+metric, the average of absolute root squared error (ARSE) is proposed to
+combine the strengths of root mean square error (RMSE) and mean absolute error
+(MAE). With the ARSE metric, the proposed DNNR(s), optimised random forest
+regressor (RFR) and the extreme gradient boosting regressor (XGBR) achieved
+impressively small yield errors, 0.0172 t/ha, and 0.0243 t/ha, 0.0001 t/ha, and
+0.001 t/ha, respectively. However, with changes to the explanatory variables to
+ensure generalizability to unforeseen data, the DNNR(s) performed
+best. Further analysis reveals that a strong interaction does exist between
+weather and soil variables. Precisely, yield is observed to increase when
+precipitation is reduced and silt increased, and vice-versa. However, the
+degree of decrease or increase is not quantified in this paper. Contrary to
+existing yield models targeted towards agricultural policies and global food
+security, the goal of the proposed corn yield model is to empower the
+smallholder farmer to farm smartly and intelligently, thus the prediction model
+is integrated into a mobile application that includes education, and a
+farmer-to-market access module.
+
+
+ comment: 30 Pages, 11 Figures, 3 Tables +
+
+
+
+
+ + ♻ ☆ DoorINet: Door Heading Prediction through Inertial Deep Learning + + +
+ Inertial sensors are widely used in a variety of applications. A common task +is orientation estimation. To tackle such a task, attitude and heading +reference system algorithms are applied. Relying on the gyroscope readings, the +accelerometer measurements are used to update the attitude angles, and +magnetometer measurements are utilized to update the heading angle. In indoor +environments, magnetometers suffer from interference that degrades their +performance resulting in poor heading angle estimation. Therefore, applications +that estimate the heading angle of moving objects, such as walking pedestrians, +closets, and refrigerators, are prone to error. To circumvent such situations, +we propose DoorINet, an end-to-end deep-learning framework to calculate the +heading angle from door-mounted, low-cost inertial sensors without using +magnetometers. To evaluate our approach, we record a unique dataset containing +391 minutes of accelerometer and gyroscope measurements and corresponding +ground-truth heading angle. We show that our proposed approach outperforms +commonly used, model based approaches and data-driven methods. + +
+
+ comment: 10 pages, 14 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ Geometric Point Attention Transformer for 3D Shape Reassembly + + +
+ Shape assembly, which aims to reassemble separate parts into a complete +object, has gained significant interest in recent years. Existing methods +primarily rely on networks to predict the poses of individual parts, but often +fail to effectively capture the geometric interactions between the parts and +their poses. In this paper, we present the Geometric Point Attention +Transformer (GPAT), a network specifically designed to address the challenges +of reasoning about geometric relationships. In the geometric point attention +module, we integrate both global shape information and local pairwise geometric +features, along with poses represented as rotation and translation vectors for +each part. To enable iterative updates and dynamic reasoning, we introduce a +geometric recycling scheme, where each prediction is fed into the next +iteration for refinement. We evaluate our model on both the semantic and +geometric assembly tasks, showing that it outperforms previous methods in +absolute pose estimation, achieving accurate pose predictions and high +alignment accuracy. + +
+
+
+
+
+ + ♻ ☆ PINNfluence: Influence Functions for Physics-Informed Neural Networks + + +
+ Recently, physics-informed neural networks (PINNs) have emerged as a flexible +and promising application of deep learning to partial differential equations in +the physical sciences. While offering strong performance and competitive +inference speeds on forward and inverse problems, their black-box nature limits +interpretability, particularly regarding alignment with expected physical +behavior. In the present work, we explore the application of influence +functions (IFs) to validate and debug PINNs post-hoc. Specifically, we apply +variations of IF-based indicators to gauge the influence of different types of +collocation points on the prediction of PINNs applied to a 2D Navier-Stokes +fluid flow problem. Our results demonstrate how IFs can be adapted to PINNs to +reveal the potential for further studies. The code is publicly available at +https://github.com/aleks-krasowski/PINNfluence. + +
+
+
+
+
+ + ♻ ☆ Circuit Complexity Bounds for RoPE-based Transformer Architecture + + +
+ Characterizing the express power of the Transformer architecture is critical +to understanding its capacity limits and scaling law. Recent works provide the +circuit complexity bounds to Transformer-like architecture. On the other hand, +Rotary Position Embedding ($\mathsf{RoPE}$) has emerged as a crucial technique +in modern large language models, offering superior performance in capturing +positional information compared to traditional position embeddings, which shows +great potential in application prospects, particularly for the long context +scenario. Empirical evidence also suggests that $\mathsf{RoPE}$-based +Transformer architectures demonstrate greater generalization capabilities +compared to conventional Transformer models. In this work, we establish a +circuit complexity bound for Transformers with $\mathsf{RoPE}$ attention. Our +key contribution is that we show that unless $\mathsf{TC}^0 = \mathsf{NC}^1$, a +$\mathsf{RoPE}$-based Transformer with $\mathrm{poly}(n)$-precision, $O(1)$ +layers, hidden dimension $d \leq O(n)$ cannot solve the Arithmetic formula +evaluation problem or the Boolean formula value problem. This result +significantly demonstrates the fundamental limitation of the expressivity of +the $\mathsf{RoPE}$-based Transformer architecture, although it achieves giant +empirical success. Our theoretical result not only establishes the complexity +bound but also may instruct further work on the $\mathsf{RoPE}$-based +Transformer. + +
+
+
+
+
+ + ♻ ☆ MM-Vet: Evaluating Large Multimodal Models for Integrated Capabilities ICML 2024 + + +
+ We propose MM-Vet, an evaluation benchmark that examines large multimodal +models (LMMs) on complicated multimodal tasks. Recent LMMs have shown various +intriguing abilities, such as solving math problems written on the blackboard, +reasoning about events and celebrities in news images, and explaining visual +jokes. Rapid model advancements pose challenges to evaluation benchmark +development. Problems include: (1) How to systematically structure and evaluate +the complicated multimodal tasks; (2) How to design evaluation metrics that +work well across question and answer types; and (3) How to give model insights +beyond a simple performance ranking. To this end, we present MM-Vet, designed +based on the insight that the intriguing ability to solve complicated tasks is +often achieved by a generalist model being able to integrate different core +vision-language (VL) capabilities. MM-Vet defines 6 core VL capabilities and +examines the 16 integrations of interest derived from the capability +combination. For evaluation metrics, we propose an LLM-based evaluator for +open-ended outputs. The evaluator enables the evaluation across different +question types and answer styles, resulting in a unified scoring metric. We +evaluate representative LMMs on MM-Vet, providing insights into the +capabilities of different LMM system paradigms and models. + +
+
+ comment: ICML 2024. Code, data and leaderboard: + https://github.com/yuweihao/MM-Vet +
+
+
+
+
+ + ♻ ☆ Potential Field Based Deep Metric Learning + + +
+ Deep metric learning (DML) involves training a network to learn a +semantically meaningful representation space. Many current approaches mine +n-tuples of examples and model interactions within each tuplets. We present a +novel, compositional DML model, inspired by electrostatic fields in physics +that, instead of in tuples, represents the influence of each example +(embedding) by a continuous potential field, and superposes the fields to +obtain their combined global potential field. We use attractive/repulsive +potential fields to represent interactions among embeddings from images of the +same/different classes. Contrary to typical learning methods, where mutual +influence of samples is proportional to their distance, we enforce reduction in +such influence with distance, leading to a decaying field. We show that such +decay helps improve performance on real world datasets with large intra-class +variations and label noise. Like other proxy-based methods, we also use proxies +to succinctly represent sub-populations of examples. We evaluate our method on +three standard DML benchmarks- Cars-196, CUB-200-2011, and SOP datasets where +it outperforms state-of-the-art baselines. + +
+
+
+
+
+ + ♻ ☆ InvestESG: A multi-agent reinforcement learning benchmark for studying + climate investment as a social dilemma + + +
+ InvestESG is a novel multi-agent reinforcement learning (MARL) benchmark +designed to study the impact of Environmental, Social, and Governance (ESG) +disclosure mandates on corporate climate investments. Supported by both PyTorch +and JAX implementation, the benchmark models an intertemporal social dilemma +where companies balance short-term profit losses from climate mitigation +efforts and long-term benefits from reducing climate risk, while ESG-conscious +investors attempt to influence corporate behavior through their investment +decisions, in a scalable and hardware-accelerated manner. Companies allocate +capital across mitigation, greenwashing, and resilience, with varying +strategies influencing climate outcomes and investor preferences. Our +experiments show that without ESG-conscious investors with sufficient capital, +corporate mitigation efforts remain limited under the disclosure mandate. +However, when a critical mass of investors prioritizes ESG, corporate +cooperation increases, which in turn reduces climate risks and enhances +long-term financial stability. Additionally, providing more information about +global climate risks encourages companies to invest more in mitigation, even +without investor involvement. Our findings align with empirical research using +real-world data, highlighting MARL's potential to inform policy by providing +insights into large-scale socio-economic challenges through efficient testing +of alternative policy and market designs. + +
+
+
+
+
+ + ♻ ☆ Stochastic Hessian Fittings with Lie Groups + + +
+ This report studies the fitting of Hessian or its inverse for stochastic
+optimizations using a Hessian fitting criterion from the preconditioned
+stochastic gradient descent (PSGD) method, which is intimately related to many
+commonly used second-order and adaptive gradient optimizers, e.g., BFGS,
+Gaussian-Newton algorithm, natural gradient descent, AdaGrad, etc. Our analyses
+reveal the efficiency and reliability differences among a wide range of
+preconditioner fitting methods, from closed-form to iterative solutions, using
+Hessian-vector products or stochastic gradients only, with Hessian fittings in
+the Euclidean space, the manifold of symmetric positive definite (SPD)
+matrices, to a variety of Lie groups. The most intriguing discovery is that the
+Hessian fitting itself as an optimization problem is strongly convex under mild
+conditions in certain general Lie groups. This discovery turns Hessian fitting
+into a well-behaved Lie group optimization problem and facilitates the designs
+of highly efficient and elegant Lie group sparse preconditioner fitting methods
+for large-scale stochastic optimizations.
+
+
+ comment: 14 pages; 6 figures; 3 tables; code + https://github.com/lixilinx/psgd_torch +
+
+
+
+
+ + ♻ ☆ Unmasking Trees for Tabular Data + + +
+ Despite much work on advanced deep learning and generative modeling +techniques for tabular data generation and imputation, traditional methods have +continued to win on imputation benchmarks. We herein present UnmaskingTrees, a +simple method for tabular imputation (and generation) employing +gradient-boosted decision trees which are used to incrementally unmask +individual features. This approach offers state-of-the-art performance on +imputation, and on generation given training data with missingness; and it has +competitive performance on vanilla generation. To solve the conditional +generation subproblem, we propose a tabular probabilistic prediction method, +BaltoBot, which fits a balanced tree of boosted tree classifiers. Unlike older +methods, it requires no parametric assumption on the conditional distribution, +accommodating features with multimodal distributions; unlike newer diffusion +methods, it offers fast sampling, closed-form density estimation, and flexible +handling of discrete variables. We finally consider our two approaches as +meta-algorithms, demonstrating in-context learning-based generative modeling +with TabPFN. + +
+
+ comment: v0.3.0 of UnmaskingTrees software +
+
+
+
+
+ + ♻ ☆ Introduction to Reinforcement Learning + + +
+ Reinforcement Learning (RL), a subfield of Artificial Intelligence (AI), +focuses on training agents to make decisions by interacting with their +environment to maximize cumulative rewards. This paper provides an overview of +RL, covering its core concepts, methodologies, and resources for further +learning. It offers a thorough explanation of fundamental components such as +states, actions, policies, and reward signals, ensuring readers develop a solid +foundational understanding. Additionally, the paper presents a variety of RL +algorithms, categorized based on the key factors such as model-free, +model-based, value-based, policy-based, and other key factors. Resources for +learning and implementing RL, such as books, courses, and online communities +are also provided. By offering a clear, structured introduction, this paper +aims to simplify the complexities of RL for beginners, providing a +straightforward pathway to understanding and applying real-time techniques. + +
+
+ comment: 19 pages +
+
+
+
+
+ + ♻ ☆ SongBsAb: A Dual Prevention Approach against Singing Voice Conversion + based Illegal Song Covers NDSS + + +
+ Singing voice conversion (SVC) automates song covers by converting a source +singing voice from a source singer into a new singing voice with the same +lyrics and melody as the source, but sounds like being covered by the target +singer of some given target singing voices. However, it raises serious concerns +about copyright and civil right infringements. We propose SongBsAb, the first +proactive approach to tackle SVC-based illegal song covers. SongBsAb adds +perturbations to singing voices before releasing them, so that when they are +used, the process of SVC will be interfered, leading to unexpected singing +voices. Perturbations are carefully crafted to (1) provide a dual prevention, +i.e., preventing the singing voice from being used as the source and target +singing voice in SVC, by proposing a gender-transformation loss and a high/low +hierarchy multi-target loss, respectively; and (2) be harmless, i.e., no +side-effect on the enjoyment of protected songs, by refining a psychoacoustic +model-based loss with the backing track as an additional masker, a unique +accompanying element for singing voices compared to ordinary speech voices. We +also adopt a frame-level interaction reduction-based loss and encoder ensemble +to enhance the transferability of SongBsAb to unknown SVC models. We +demonstrate the prevention effectiveness, harmlessness, and robustness of +SongBsAb on five diverse and promising SVC models, using both English and +Chinese datasets, and both objective and human study-based subjective metrics. +Our work fosters an emerging research direction for mitigating illegal +automated song covers. + +
+
+ comment: In Proceedings of the 32nd Network and Distributed System Security + (NDSS) Symposium 2025 +
+
+
+
+
+ + ♻ ☆ Clustering with Neural Network and Index + + +
+ A new model called Clustering with Neural Network and Index (CNNI) is +introduced. CNNI uses a Neural Network to cluster data points. Training of the +Neural Network mimics supervised learning, with an internal clustering +evaluation index acting as the loss function. An experiment is conducted to +test the feasibility of the new model, and its results are compared with those +of other clustering models like K-means and Gaussian Mixture Model (GMM). The +results show CNNI can work properly for clustering data; CNNI equipped with +MMJ-SC achieves the first parametric (inductive) clustering model that can deal +with non-convex shaped (non-flat geometry) data. + +
+
+
+
+
+ + ♻ ☆ Sliced-Wasserstein-based Anomaly Detection and Open Dataset for + Localized Critical Peak Rebates + + +
+ In this work, we present a new unsupervised anomaly (outlier) detection (AD) +method using the sliced-Wasserstein metric. This filtering technique is +conceptually interesting for MLOps pipelines deploying machine learning models +in critical sectors, e.g., energy, as it offers a conservative data selection. +Additionally, we open the first dataset showcasing localized critical peak +rebate demand response in a northern climate. We demonstrate the capabilities +of our method on synthetic datasets as well as standard AD datasets and use it +in the making of a first benchmark for our open-source localized critical peak +rebate dataset. + +
+
+
+
+
+ + ♻ ☆ Job-SDF: A Multi-Granularity Dataset for Job Skill Demand Forecasting + and Benchmarking NeurIPS 2024 + + +
+ In a rapidly evolving job market, skill demand forecasting is crucial as it +enables policymakers and businesses to anticipate and adapt to changes, +ensuring that workforce skills align with market needs, thereby enhancing +productivity and competitiveness. Additionally, by identifying emerging skill +requirements, it directs individuals towards relevant training and education +opportunities, promoting continuous self-learning and development. However, the +absence of comprehensive datasets presents a significant challenge, impeding +research and the advancement of this field. To bridge this gap, we present +Job-SDF, a dataset designed to train and benchmark job-skill demand forecasting +models. Based on 10.35 million public job advertisements collected from major +online recruitment platforms in China between 2021 and 2023, this dataset +encompasses monthly recruitment demand for 2,324 types of skills across 521 +companies. Our dataset uniquely enables evaluating skill demand forecasting +models at various granularities, including occupation, company, and regional +levels. We benchmark a range of models on this dataset, evaluating their +performance in standard scenarios, in predictions focused on lower value +ranges, and in the presence of structural breaks, providing new insights for +further research. Our code and dataset are publicly accessible via the +https://github.com/Job-SDF/benchmark. + +
+
+ comment: NeurIPS 2024 Accepted +
+
+
+
+
+ + ♻ ☆ Towards Dynamic Message Passing on Graphs NeurIPS 2024 + + +
+ Message passing plays a vital role in graph neural networks (GNNs) for +effective feature learning. However, the over-reliance on input topology +diminishes the efficacy of message passing and restricts the ability of GNNs. +Despite efforts to mitigate the reliance, existing study encounters +message-passing bottlenecks or high computational expense problems, which +invokes the demands for flexible message passing with low complexity. In this +paper, we propose a novel dynamic message-passing mechanism for GNNs. It +projects graph nodes and learnable pseudo nodes into a common space with +measurable spatial relations between them. With nodes moving in the space, +their evolving relations facilitate flexible pathway construction for a dynamic +message-passing process. Associating pseudo nodes to input graphs with their +measured relations, graph nodes can communicate with each other intermediately +through pseudo nodes under linear complexity. We further develop a GNN model +named $\mathtt{\mathbf{N^2}}$ based on our dynamic message-passing mechanism. +$\mathtt{\mathbf{N^2}}$ employs a single recurrent layer to recursively +generate the displacements of nodes and construct optimal dynamic pathways. +Evaluation on eighteen benchmarks demonstrates the superior performance of +$\mathtt{\mathbf{N^2}}$ over popular GNNs. $\mathtt{\mathbf{N^2}}$ successfully +scales to large-scale benchmarks and requires significantly fewer parameters +for graph classification with the shared recurrent layer. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ FlickerFusion: Intra-trajectory Domain Generalizing Multi-Agent RL NeurIPS '24 + + +
+ Multi-agent reinforcement learning has demonstrated significant potential in +addressing complex cooperative tasks across various real-world applications. +However, existing MARL approaches often rely on the restrictive assumption that +the number of entities (e.g., agents, obstacles) remains constant between +training and inference. This overlooks scenarios where entities are dynamically +removed or added during the inference trajectory -- a common occurrence in +real-world environments like search and rescue missions and dynamic combat +situations. In this paper, we tackle the challenge of intra-trajectory dynamic +entity composition under zero-shot out-of-domain (OOD) generalization, where +such dynamic changes cannot be anticipated beforehand. Our empirical studies +reveal that existing MARL methods suffer significant performance degradation +and increased uncertainty in these scenarios. In response, we propose +FlickerFusion, a novel OOD generalization method that acts as a universally +applicable augmentation technique for MARL backbone methods. FlickerFusion +stochastically drops out parts of the observation space, emulating being +in-domain when inferenced OOD. The results show that FlickerFusion not only +achieves superior inference rewards but also uniquely reduces uncertainty +vis-\`a-vis the backbone, compared to existing methods. Benchmarks, +implementations, and model weights are organized and open-sourced at +flickerfusion305.github.io, accompanied by ample demo video renderings. + +
+
+ comment: NeurIPS '24 Open-World Agents Workshop (v2: minor revision) +
+
+
+
+
+ + ♻ ☆ Tree-Wasserstein Distance for High Dimensional Data with a Latent + Feature Hierarchy + + +
+ Finding meaningful distances between high-dimensional data samples is an +important scientific task. To this end, we propose a new tree-Wasserstein +distance (TWD) for high-dimensional data with two key aspects. First, our TWD +is specifically designed for data with a latent feature hierarchy, i.e., the +features lie in a hierarchical space, in contrast to the usual focus on +embedding samples in hyperbolic space. Second, while the conventional use of +TWD is to speed up the computation of the Wasserstein distance, we use its +inherent tree as a means to learn the latent feature hierarchy. The key idea of +our method is to embed the features into a multi-scale hyperbolic space using +diffusion geometry and then present a new tree decoding method by establishing +analogies between the hyperbolic embedding and trees. We show that our TWD +computed based on data observations provably recovers the TWD defined with the +latent feature hierarchy and that its computation is efficient and scalable. We +showcase the usefulness of the proposed TWD in applications to word-document +and single-cell RNA-sequencing datasets, demonstrating its advantages over +existing TWDs and methods based on pre-trained models. + +
+
+
+
+
+ + ♻ ☆ DistRL: An Asynchronous Distributed Reinforcement Learning Framework for + On-Device Control Agents + + +
+ On-device control agents, especially on mobile devices, are responsible for +operating mobile devices to fulfill users' requests, enabling seamless and +intuitive interactions. Integrating Multimodal Large Language Models (MLLMs) +into these agents enhances their ability to understand and execute complex +commands, thereby improving user experience. However, fine-tuning MLLMs for +on-device control presents significant challenges due to limited data +availability and inefficient online training processes. This paper introduces +DistRL, a novel framework designed to enhance the efficiency of online RL +fine-tuning for mobile device control agents. DistRL employs centralized +training and decentralized data acquisition to ensure efficient fine-tuning in +the context of dynamic online interactions. Additionally, the framework is +backed by our tailor-made RL algorithm, which effectively balances exploration +with the prioritized utilization of collected data to ensure stable and robust +training. Our experiments show that, on average, DistRL delivers a 3X +improvement in training efficiency and enables training data collection 2.4X +faster than the leading synchronous multi-machine methods. Notably, after +training, DistRL achieves a 20% relative improvement in success rate compared +to state-of-the-art methods on general Android tasks from an open benchmark, +significantly outperforming existing approaches while maintaining the same +training time. These results validate DistRL as a scalable and efficient +solution, offering substantial improvements in both training efficiency and +agent performance for real-world, in-the-wild device control tasks. + +
+
+ comment: Paper and Appendix, 26 pages +
+
+
+
+
+ + ♻ ☆ Batch Calibration: Rethinking Calibration for In-Context Learning and + Prompt Engineering ICLR 2024 + + +
+ Prompting and in-context learning (ICL) have become efficient learning +paradigms for large language models (LLMs). However, LLMs suffer from prompt +brittleness and various bias factors in the prompt, including but not limited +to the formatting, the choice verbalizers, and the ICL examples. To address +this problem that results in unexpected performance degradation, calibration +methods have been developed to mitigate the effects of these biases while +recovering LLM performance. In this work, we first conduct a systematic +analysis of the existing calibration methods, where we both provide a unified +view and reveal the failure cases. Inspired by these analyses, we propose Batch +Calibration (BC), a simple yet intuitive method that controls the contextual +bias from the batched input, unifies various prior approaches, and effectively +addresses the aforementioned issues. BC is zero-shot, inference-only, and +incurs negligible additional costs. In the few-shot setup, we further extend BC +to allow it to learn the contextual bias from labeled data. We validate the +effectiveness of BC with PaLM 2-(S, M, L) and CLIP models and demonstrate +state-of-the-art performance over previous calibration baselines across more +than 10 natural language understanding and image classification tasks. + +
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Alternators For Sequence Modeling + + +
+ This paper introduces alternators, a novel family of non-Markovian dynamical +models for sequences. An alternator features two neural networks: the +observation trajectory network (OTN) and the feature trajectory network (FTN). +The OTN and the FTN work in conjunction, alternating between outputting samples +in the observation space and some feature space, respectively, over a cycle. +The parameters of the OTN and the FTN are not time-dependent and are learned +via a minimum cross-entropy criterion over the trajectories. Alternators are +versatile. They can be used as dynamical latent-variable generative models or +as sequence-to-sequence predictors. Alternators can uncover the latent dynamics +underlying complex sequential data, accurately forecast and impute missing +data, and sample new trajectories. We showcase the capabilities of alternators +in three applications. We first used alternators to model the Lorenz equations, +often used to describe chaotic behavior. We then applied alternators to +Neuroscience, to map brain activity to physical activity. Finally, we applied +alternators to Climate Science, focusing on sea-surface temperature +forecasting. In all our experiments, we found alternators are stable to train, +fast to sample from, yield high-quality generated samples and latent variables, +and often outperform strong baselines such as Mambas, neural ODEs, and +diffusion models in the domains we studied. + +
+
+ comment: A new versatile family of sequence models that can be used for both + generative modeling and supervised learning. The codebase will be made + available upon publication. This paper is dedicated to Thomas Sankara +
+
+
+
+
+ + ♻ ☆ Provably Scalable Black-Box Variational Inference with Structured + Variational Families ICML'24 + + +
+ Variational families with full-rank covariance approximations are known not +to work well in black-box variational inference (BBVI), both empirically and +theoretically. In fact, recent computational complexity results for BBVI have +established that full-rank variational families scale poorly with the +dimensionality of the problem compared to e.g. mean-field families. This is +particularly critical to hierarchical Bayesian models with local variables; +their dimensionality increases with the size of the datasets. Consequently, one +gets an iteration complexity with an explicit $\mathcal{O}(N^2)$ dependence on +the dataset size $N$. In this paper, we explore a theoretical middle ground +between mean-field variational families and full-rank families: structured +variational families. We rigorously prove that certain scale matrix structures +can achieve a better iteration complexity of $\mathcal{O}\left(N\right)$, +implying better scaling with respect to $N$. We empirically verify our +theoretical results on large-scale hierarchical models. + +
+
+ comment: Accepted to ICML'24; v3: fixed typos +
+
+
+
+
+ + ♻ ☆ Support-Set Context Matters for Bongard Problems + + +
+ Current machine learning methods struggle to solve Bongard problems, which +are a type of IQ test that requires deriving an abstract "concept" from a set +of positive and negative "support" images, and then classifying whether or not +a new query image depicts the key concept. On Bongard-HOI, a benchmark for +natural-image Bongard problems, most existing methods have reached at best 69% +accuracy (where chance is 50%). Low accuracy is often attributed to neural +nets' lack of ability to find human-like symbolic rules. In this work, we point +out that many existing methods are forfeiting accuracy due to a much simpler +problem: they do not adapt image features given information contained in the +support set as a whole, and rely instead on information extracted from +individual supports. This is a critical issue, because the "key concept" in a +typical Bongard problem can often only be distinguished using multiple +positives and multiple negatives. We explore simple methods to incorporate this +context and show substantial gains over prior works, leading to new +state-of-the-art accuracy on Bongard-LOGO (75.3%) and Bongard-HOI (76.4%) +compared to methods with equivalent vision backbone architectures and strong +performance on the original Bongard problem set (60.8%). + +
+
+ comment: TMLR October 2024. Code: + https://github.com/nraghuraman/bongard-context +
+
+
+
+
+
+
+
+ + Multimedia 2 + +
+
+
+ + ♻ ☆ Separate Anything You Describe + + +
+ Language-queried audio source separation (LASS) is a new paradigm for +computational auditory scene analysis (CASA). LASS aims to separate a target +sound from an audio mixture given a natural language query, which provides a +natural and scalable interface for digital audio applications. Recent works on +LASS, despite attaining promising separation performance on specific sources +(e.g., musical instruments, limited classes of audio events), are unable to +separate audio concepts in the open domain. In this work, we introduce +AudioSep, a foundation model for open-domain audio source separation with +natural language queries. We train AudioSep on large-scale multimodal datasets +and extensively evaluate its capabilities on numerous tasks including audio +event separation, musical instrument separation, and speech enhancement. +AudioSep demonstrates strong separation performance and impressive zero-shot +generalization ability using audio captions or text labels as queries, +substantially outperforming previous audio-queried and language-queried sound +separation models. For reproducibility of this work, we will release the source +code, evaluation benchmark and pre-trained model at: +https://github.com/Audio-AGI/AudioSep. + +
+
+ comment: Code, benchmark and pre-trained models: + https://github.com/Audio-AGI/AudioSep +
+
+
+
+
+ + ♻ ☆ SongBsAb: A Dual Prevention Approach against Singing Voice Conversion + based Illegal Song Covers NDSS + + +
+ Singing voice conversion (SVC) automates song covers by converting a source +singing voice from a source singer into a new singing voice with the same +lyrics and melody as the source, but sounds like being covered by the target +singer of some given target singing voices. However, it raises serious concerns +about copyright and civil right infringements. We propose SongBsAb, the first +proactive approach to tackle SVC-based illegal song covers. SongBsAb adds +perturbations to singing voices before releasing them, so that when they are +used, the process of SVC will be interfered, leading to unexpected singing +voices. Perturbations are carefully crafted to (1) provide a dual prevention, +i.e., preventing the singing voice from being used as the source and target +singing voice in SVC, by proposing a gender-transformation loss and a high/low +hierarchy multi-target loss, respectively; and (2) be harmless, i.e., no +side-effect on the enjoyment of protected songs, by refining a psychoacoustic +model-based loss with the backing track as an additional masker, a unique +accompanying element for singing voices compared to ordinary speech voices. We +also adopt a frame-level interaction reduction-based loss and encoder ensemble +to enhance the transferability of SongBsAb to unknown SVC models. We +demonstrate the prevention effectiveness, harmlessness, and robustness of +SongBsAb on five diverse and promising SVC models, using both English and +Chinese datasets, and both objective and human study-based subjective metrics. +Our work fosters an emerging research direction for mitigating illegal +automated song covers. + +
+
+ comment: In Proceedings of the 32nd Network and Distributed System Security + (NDSS) Symposium 2025 +
+
+
+
+
+
+
+
+ + Genomics 1 + +
+
+
+ + Towards Unified Molecule-Enhanced Pathology Image Representation + Learning via Integrating Spatial Transcriptomics + + +
+ Recent advancements in multimodal pre-training models have significantly +advanced computational pathology. However, current approaches predominantly +rely on visual-language models, which may impose limitations from a molecular +perspective and lead to performance bottlenecks. Here, we introduce a Unified +Molecule-enhanced Pathology Image REpresentation Learning framework (UMPIRE). +UMPIRE aims to leverage complementary information from gene expression profiles +to guide the multimodal pre-training, enhancing the molecular awareness of +pathology image representation learning. We demonstrate that this molecular +perspective provides a robust, task-agnostic training signal for learning +pathology image embeddings. Due to the scarcity of paired data, approximately 4 +million entries of spatial transcriptomics gene expression were collected to +train the gene encoder. By leveraging powerful pre-trained encoders, UMPIRE +aligns the encoders across over 697K pathology image-gene expression pairs. The +performance of UMPIRE is demonstrated across various molecular-related +downstream tasks, including gene expression prediction, spot classification, +and mutation state prediction in whole slide images. Our findings highlight the +effectiveness of multimodal data integration and open new avenues for exploring +computational pathology enhanced by molecular perspectives. The code and +pre-trained weights are available at https://github.com/Hanminghao/UMPIRE. + +
+
+ comment: 21 pages, 11 figures, 7 tables +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 24 + +
+
+
+ + ♻ ☆ Exploring Superpixel Segmentation Methods in the Context of Citizen + Science and Deforestation Detection + + +
+ Tropical forests play an essential role in the planet's ecosystem, making the +conservation of these biomes a worldwide priority. However, ongoing +deforestation and degradation pose a significant threat to their existence, +necessitating effective monitoring and the proposal of actions to mitigate the +damage caused by these processes. In this regard, initiatives range from +government and private sector monitoring programs to solutions based on citizen +science campaigns, for example. Particularly in the context of citizen science +campaigns, the segmentation of remote sensing images to identify deforested +areas and subsequently submit them to analysis by non-specialized volunteers is +necessary. Thus, segmentation using superpixel-based techniques proves to be a +viable solution for this important task. Therefore, this paper presents an +analysis of 22 superpixel-based segmentation methods applied to remote sensing +images, aiming to identify which of them are more suitable for generating +segments for citizen science campaigns. The results reveal that seven of the +segmentation methods outperformed the baseline method (SLIC) currently employed +in the ForestEyes citizen science project, indicating an opportunity for +improvement in this important stage of campaign development. + +
+
+ comment: Paper was accepted for presentation at SAC 2025 +
+
+
+
+
+ + ♻ ☆ SAMURAI: Adapting Segment Anything Model for Zero-Shot Visual Tracking + with Motion-Aware Memory + + +
+ The Segment Anything Model 2 (SAM 2) has demonstrated strong performance in +object segmentation tasks but faces challenges in visual object tracking, +particularly when managing crowded scenes with fast-moving or self-occluding +objects. Furthermore, the fixed-window memory approach in the original model +does not consider the quality of memories selected to condition the image +features for the next frame, leading to error propagation in videos. This paper +introduces SAMURAI, an enhanced adaptation of SAM 2 specifically designed for +visual object tracking. By incorporating temporal motion cues with the proposed +motion-aware memory selection mechanism, SAMURAI effectively predicts object +motion and refines mask selection, achieving robust, accurate tracking without +the need for retraining or fine-tuning. SAMURAI operates in real-time and +demonstrates strong zero-shot performance across diverse benchmark datasets, +showcasing its ability to generalize without fine-tuning. In evaluations, +SAMURAI achieves significant improvements in success rate and precision over +existing trackers, with a 7.1% AUC gain on LaSOT$_{\text{ext}}$ and a 3.5% AO +gain on GOT-10k. Moreover, it achieves competitive results compared to fully +supervised methods on LaSOT, underscoring its robustness in complex tracking +scenarios and its potential for real-world applications in dynamic +environments. + +
+
+ comment: Project page is available at https://yangchris11.github.io/samurai/ +
+
+
+
+
+ + ♻ ☆ Long-Term Ad Memorability: Understanding & Generating Memorable Ads WACV-2025 + + +
+ Despite the importance of long-term memory in marketing and brand building, +until now, there has been no large-scale study on the memorability of ads. All +previous memorability studies have been conducted on short-term recall on +specific content types like action videos. On the other hand, long-term +memorability is crucial for the advertising industry, and ads are almost always +highly multimodal. Therefore, we release the first memorability dataset, +LAMBDA, consisting of 1749 participants and 2205 ads covering 276 brands. +Running statistical tests over different participant subpopulations and ad +types, we find many interesting insights into what makes an ad memorable, e.g., +fast-moving ads are more memorable than those with slower scenes; people who +use ad-blockers remember a lower number of ads than those who don't. Next, we +present a model, Henry, to predict the memorability of a content. Henry +achieves state-of-the-art performance across all prominent literature +memorability datasets. It shows strong generalization performance with better +results in 0-shot on unseen datasets. Finally, with the intent of memorable ad +generation, we present a scalable method to build a high-quality memorable ad +generation model by leveraging automatically annotated data. Our approach, SEED +(Self rEwarding mEmorability Modeling), starts with a language model trained on +LAMBDA as seed data and progressively trains an LLM to generate more memorable +ads. We show that the generated advertisements have 44% higher memorability +scores than the original ads. We release this large-scale ad dataset, +UltraLAMBDA, consisting of 5 million ads. Our code and the datasets, LAMBDA and +UltraLAMBDA, are open-sourced at +https://behavior-in-the-wild.github.io/memorability. + +
+
+ comment: Published in WACV-2025 +
+
+
+
+
+ + ♻ ☆ ExpertAF: Expert Actionable Feedback from Video + + +
+ Feedback is essential for learning a new skill or improving one's current +skill-level. However, current methods for skill-assessment from video only +provide scores or compare demonstrations, leaving the burden of knowing what to +do differently on the user. We introduce a novel method to generate actionable +feedback from video of a person doing a physical activity, such as basketball +or soccer. Our method takes a video demonstration and its accompanying 3D body +pose and generates (1) free-form expert commentary describing what the person +is doing well and what they could improve, and (2) a visual expert +demonstration that incorporates the required corrections. We show how to +leverage Ego-Exo4D's videos of skilled activity and expert commentary together +with a strong language model to create a weakly-supervised training dataset for +this task, and we devise a multimodal video-language model to infer coaching +feedback. Our method is able to reason across multi-modal input combinations to +output full-spectrum, actionable coaching -- expert commentary, expert video +retrieval, and expert pose generation -- outperforming strong vision-language +models on both established metrics and human preference studies. Code and data +will be publicly released. + +
+
+ comment: Technical report +
+
+
+
+
+ + ♻ ☆ Spatial and Spatial-Spectral Morphological Mamba for Hyperspectral Image + Classification + + +
+ Recent advancements in transformers, specifically self-attention mechanisms, +have significantly improved hyperspectral image (HSI) classification. However, +these models often suffer from inefficiencies, as their computational +complexity scales quadratically with sequence length. To address these +challenges, we propose the morphological spatial mamba (SMM) and morphological +spatial-spectral Mamba (SSMM) model (MorpMamba), which combines the strengths +of morphological operations and the state space model framework, offering a +more computationally efficient alternative to transformers. In MorpMamba, a +novel token generation module first converts HSI patches into spatial-spectral +tokens. These tokens are then processed through morphological operations such +as erosion and dilation, utilizing depthwise separable convolutions to capture +structural and shape information. A token enhancement module refines these +features by dynamically adjusting the spatial and spectral tokens based on +central HSI regions, ensuring effective feature fusion within each block. +Subsequently, multi-head self-attention is applied to further enrich the +feature representations, allowing the model to capture complex relationships +and dependencies within the data. Finally, the enhanced tokens are fed into a +state space module, which efficiently models the temporal evolution of the +features for classification. Experimental results on widely used HSI datasets +demonstrate that MorpMamba achieves superior parametric efficiency compared to +traditional CNN and transformer models while maintaining high accuracy. The +code will be made publicly available at +\url{https://github.com/mahmad000/MorpMamba}. + +
+
+
+
+
+ + ♻ ☆ Cross-Subject Domain Adaptation for Classifying Working Memory Load with + Multi-Frame EEG Images + + +
+ Working memory (WM), denoting the information temporally stored in the mind, +is a fundamental research topic in the field of human cognition. +Electroencephalograph (EEG), which can monitor the electrical activity of the +brain, has been widely used in measuring the level of WM. However, one of the +critical challenges is that individual differences may cause ineffective +results, especially when the established model meets an unfamiliar subject. In +this work, we propose a cross-subject deep adaptation model with spatial +attention (CS-DASA) to generalize the workload classifications across subjects. +First, we transform EEG time series into multi-frame EEG images incorporating +spatial, spectral, and temporal information. First, the Subject-Shared module +in CS-DASA receives multi-frame EEG image data from both source and target +subjects and learns the common feature representations. Then, in the +subject-specific module, the maximum mean discrepancy is implemented to measure +the domain distribution divergence in a reproducing kernel Hilbert space, which +can add an effective penalty loss for domain adaptation. Additionally, the +subject-to-subject spatial attention mechanism is employed to focus on the +discriminative spatial features from the target image data. Experiments +conducted on a public WM EEG dataset containing 13 subjects show that the +proposed model is capable of achieving better performance than existing +state-of-the-art methods. + +
+
+
+
+
+ + ♻ ☆ Good Grasps Only: A data engine for self-supervised fine-tuning of pose + estimation using grasp poses for verification + + +
+ In this paper, we present a novel method for self-supervised fine-tuning of +pose estimation. Leveraging zero-shot pose estimation, our approach enables the +robot to automatically obtain training data without manual labeling. After pose +estimation the object is grasped, and in-hand pose estimation is used for data +validation. Our pipeline allows the system to fine-tune while the process is +running, removing the need for a learning phase. The motivation behind our work +lies in the need for rapid setup of pose estimation solutions. Specifically, we +address the challenging task of bin picking, which plays a pivotal role in +flexible robotic setups. Our method is implemented on a robotics work-cell, and +tested with four different objects. For all objects, our method increases the +performance and outperforms a state-of-the-art method trained on the CAD model +of the objects. + +
+
+ comment: 8 pages, 7 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ PassionSR: Post-Training Quantization with Adaptive Scale in One-Step + Diffusion based Image Super-Resolution + + +
+ Diffusion-based image super-resolution (SR) models have shown superior +performance at the cost of multiple denoising steps. However, even though the +denoising step has been reduced to one, they require high computational costs +and storage requirements, making it difficult for deployment on hardware +devices. To address these issues, we propose a novel post-training quantization +approach with adaptive scale in one-step diffusion (OSD) image SR, PassionSR. +First, we simplify OSD model to two core components, UNet and Variational +Autoencoder (VAE) by removing the CLIPEncoder. Secondly, we propose Learnable +Boundary Quantizer (LBQ) and Learnable Equivalent Transformation (LET) to +optimize the quantization process and manipulate activation distributions for +better quantization. Finally, we design a Distributed Quantization Calibration +(DQC) strategy that stabilizes the training of quantized parameters for rapid +convergence. Comprehensive experiments demonstrate that PassionSR with 8-bit +and 6-bit obtains comparable visual results with full-precision model. +Moreover, our PassionSR achieves significant advantages over recent leading +low-bit quantization methods for image SR. Our code will be at +https://github.com/libozhu03/PassionSR. + +
+
+ comment: https://github.com/libozhu03/PassionSR +
+
+
+
+
+ + ♻ ☆ MC$^2$: Multi-concept Guidance for Customized Multi-concept Generation + + +
+ Customized text-to-image generation, which synthesizes images based on +user-specified concepts, has made significant progress in handling individual +concepts. However, when extended to multiple concepts, existing methods often +struggle with properly integrating different models and avoiding the unintended +blending of characteristics from distinct concepts. In this paper, we propose +MC$^2$, a novel approach for multi-concept customization that enhances +flexibility and fidelity through inference-time optimization. MC$^2$ enables +the integration of multiple single-concept models with heterogeneous +architectures. By adaptively refining attention weights between visual and +textual tokens, our method ensures that image regions accurately correspond to +their associated concepts while minimizing interference between concepts. +Extensive experiments demonstrate that MC$^2$ outperforms training-based +methods in terms of prompt-reference alignment. Furthermore, MC$^2$ can be +seamlessly applied to text-to-image generation, providing robust compositional +capabilities. To facilitate the evaluation of multi-concept customization, we +also introduce a new benchmark, MC++. The code will be publicly available at +https://github.com/JIANGJiaXiu/MC-2. + +
+
+ comment: 14 pages, 14 figures +
+
+
+
+
+ + ♻ ☆ Generalizing Deepfake Video Detection with Plug-and-Play: Video-Level + Blending and Spatiotemporal Adapter Tuning + + +
+ Three key challenges hinder the development of current deepfake video +detection: (1) Temporal features can be complex and diverse: how can we +identify general temporal artifacts to enhance model generalization? (2) +Spatiotemporal models often lean heavily on one type of artifact and ignore the +other: how can we ensure balanced learning from both? (3) Videos are naturally +resource-intensive: how can we tackle efficiency without compromising accuracy? +This paper attempts to tackle the three challenges jointly. First, inspired by +the notable generality of using image-level blending data for image forgery +detection, we investigate whether and how video-level blending can be effective +in video. We then perform a thorough analysis and identify a previously +underexplored temporal forgery artifact: Facial Feature Drift (FFD), which +commonly exists across different forgeries. To reproduce FFD, we then propose a +novel Video-level Blending data (VB), where VB is implemented by blending the +original image and its warped version frame-by-frame, serving as a hard +negative sample to mine more general artifacts. Second, we carefully design a +lightweight Spatiotemporal Adapter (StA) to equip a pretrained image model +(both ViTs and CNNs) with the ability to capture both spatial and temporal +features jointly and efficiently. StA is designed with two-stream 3D-Conv with +varying kernel sizes, allowing it to process spatial and temporal features +separately. Extensive experiments validate the effectiveness of the proposed +methods; and show our approach can generalize well to previously unseen forgery +videos, even the latest generation methods. + +
+
+
+
+
+ + ♻ ☆ A Concept-Based Explainability Framework for Large Multimodal Models NeurIPS 2024 + + +
+ Large multimodal models (LMMs) combine unimodal encoders and large language +models (LLMs) to perform multimodal tasks. Despite recent advancements towards +the interpretability of these models, understanding internal representations of +LMMs remains largely a mystery. In this paper, we present a novel framework for +the interpretation of LMMs. We propose a dictionary learning based approach, +applied to the representation of tokens. The elements of the learned dictionary +correspond to our proposed concepts. We show that these concepts are well +semantically grounded in both vision and text. Thus we refer to these as +``multi-modal concepts''. We qualitatively and quantitatively evaluate the +results of the learnt concepts. We show that the extracted multimodal concepts +are useful to interpret representations of test samples. Finally, we evaluate +the disentanglement between different concepts and the quality of grounding +concepts visually and textually. Our code is publicly available at +https://github.com/mshukor/xl-vlms + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ ReconX: Reconstruct Any Scene from Sparse Views with Video Diffusion + Model + + +
+ Advancements in 3D scene reconstruction have transformed 2D images from the +real world into 3D models, producing realistic 3D results from hundreds of +input photos. Despite great success in dense-view reconstruction scenarios, +rendering a detailed scene from insufficient captured views is still an +ill-posed optimization problem, often resulting in artifacts and distortions in +unseen areas. In this paper, we propose ReconX, a novel 3D scene reconstruction +paradigm that reframes the ambiguous reconstruction challenge as a temporal +generation task. The key insight is to unleash the strong generative prior of +large pre-trained video diffusion models for sparse-view reconstruction. +However, 3D view consistency struggles to be accurately preserved in directly +generated video frames from pre-trained models. To address this, given limited +input views, the proposed ReconX first constructs a global point cloud and +encodes it into a contextual space as the 3D structure condition. Guided by the +condition, the video diffusion model then synthesizes video frames that are +both detail-preserved and exhibit a high degree of 3D consistency, ensuring the +coherence of the scene from various perspectives. Finally, we recover the 3D +scene from the generated video through a confidence-aware 3D Gaussian Splatting +optimization scheme. Extensive experiments on various real-world datasets show +the superiority of our ReconX over state-of-the-art methods in terms of quality +and generalizability. + +
+
+ comment: Project page: https://liuff19.github.io/ReconX +
+
+
+
+
+ + ♻ ☆ An Information-Theoretic Regularizer for Lossy Neural Image Compression + + +
+ Lossy image compression networks aim to minimize the latent entropy of images +while adhering to specific distortion constraints. However, optimizing the +neural network can be challenging due to its nature of learning quantized +latent representations. In this paper, our key finding is that minimizing the +latent entropy is, to some extent, equivalent to maximizing the conditional +source entropy, an insight that is deeply rooted in information-theoretic +equalities. Building on this insight, we propose a novel structural +regularization method for the neural image compression task by incorporating +the negative conditional source entropy into the training objective, such that +both the optimization efficacy and the model's generalization ability can be +promoted. The proposed information-theoretic regularizer is interpretable, +plug-and-play, and imposes no inference overheads. Extensive experiments +demonstrate its superiority in regularizing the models and further squeezing +bits from the latent representation across various compression structures and +unseen domains. + +
+
+ comment: 12 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Fine-Grained Alignment in Vision-and-Language Navigation through + Bayesian Optimization + + +
+ This paper addresses the challenge of fine-grained alignment in
+Vision-and-Language Navigation (VLN) tasks, where robots navigate realistic 3D
+environments based on natural language instructions. Current approaches use
+contrastive learning to align language with visual trajectory sequences.
+Nevertheless, they encounter difficulties with fine-grained vision negatives.
+To enhance cross-modal embeddings, we introduce a novel Bayesian
+Optimization-based adversarial optimization framework for creating fine-grained
+contrastive vision samples. To validate the proposed methodology, we conduct a
+series of experiments to assess the effectiveness of the enriched embeddings on
+fine-grained vision negatives. We conduct experiments on two common VLN
+benchmarks, R2R and REVERIE; the results demonstrate that these
+embeddings benefit navigation, and can lead to a promising performance
+enhancement. Our source code and trained models are available at:
+https://anonymous.4open.science/r/FGVLN.
+
+
+
+
+
+ + ♻ ☆ MotionCharacter: Identity-Preserving and Motion Controllable Human Video + Generation + + +
+ Recent advancements in personalized Text-to-Video (T2V) generation highlight
+the importance of integrating character-specific identities and actions.
+However, previous T2V models struggle with identity consistency and
+controllable motion dynamics, mainly due to limited fine-grained facial and
+action-based textual prompts, and datasets that overlook key human attributes
+and actions. To address these challenges, we propose MotionCharacter, an
+efficient and high-fidelity human video generation framework designed for
+identity preservation and fine-grained motion control. We introduce an
+ID-preserving module to maintain identity fidelity while allowing flexible
+attribute modifications, and further integrate ID-consistency and region-aware
+loss mechanisms, significantly enhancing identity consistency and detail
+fidelity. Additionally, our approach incorporates a motion control module that
+prioritizes action-related text while maintaining subject consistency, along
+with a dataset, Human-Motion, which utilizes large language models to generate
+detailed motion descriptions. To simplify user control during inference, we
+parameterize motion intensity through a single coefficient, allowing for easy
+adjustments. Extensive experiments highlight the effectiveness of
+MotionCharacter, demonstrating significant improvements in ID-preserving,
+high-quality video generation.
+
+
+
+
+
+ + ♻ ☆ InstantDrag: Improving Interactivity in Drag-based Image Editing SIGGRAPH + + +
+ Drag-based image editing has recently gained popularity for its interactivity +and precision. However, despite the ability of text-to-image models to generate +samples within a second, drag editing still lags behind due to the challenge of +accurately reflecting user interaction while maintaining image content. Some +existing approaches rely on computationally intensive per-image optimization or +intricate guidance-based methods, requiring additional inputs such as masks for +movable regions and text prompts, thereby compromising the interactivity of the +editing process. We introduce InstantDrag, an optimization-free pipeline that +enhances interactivity and speed, requiring only an image and a drag +instruction as input. InstantDrag consists of two carefully designed networks: +a drag-conditioned optical flow generator (FlowGen) and an optical +flow-conditioned diffusion model (FlowDiffusion). InstantDrag learns motion +dynamics for drag-based image editing in real-world video datasets by +decomposing the task into motion generation and motion-conditioned image +generation. We demonstrate InstantDrag's capability to perform fast, +photo-realistic edits without masks or text prompts through experiments on +facial video datasets and general scenes. These results highlight the +efficiency of our approach in handling drag-based image editing, making it a +promising solution for interactive, real-time applications. + +
+
+ comment: SIGGRAPH Asia 2024. Project webpage: + https://joonghyuk.com/instantdrag-web/ +
+
+
+
+
+ + ♻ ☆ REACT: Real-time Efficiency and Accuracy Compromise for Tradeoffs in + Scene Graph Generation + + +
+ Scene Graph Generation (SGG) is a task that encodes visual relationships +between objects in images as graph structures. SGG shows significant promise as +a foundational component for downstream tasks, such as reasoning for embodied +agents. To enable real-time applications, SGG must address the trade-off +between performance and inference speed. However, current methods tend to focus +on one of the following: (1) improving relation prediction accuracy, (2) +enhancing object detection accuracy, or (3) reducing latency, without aiming to +balance all three objectives simultaneously. To address this limitation, we +propose a novel architecture, inference method, and relation prediction model. +Our proposed solution, the REACT model, achieves the highest inference speed +among existing SGG models, improving object detection accuracy without +sacrificing relation prediction performance. Compared to state-of-the-art +approaches, REACT is 2.7 times faster (with a latency of 23 ms) and improves +object detection accuracy by 58.51%. Furthermore, our proposal significantly +reduces model size, with an average of 5.5x fewer parameters. Code is available +at https://github.com/Maelic/SGG-Benchmark + +
+
+
+
+
+ + ♻ ☆ Croc: Pretraining Large Multimodal Models with Cross-Modal Comprehension + + +
+ Recent advances in Large Language Models (LLMs) have catalyzed the +development of Large Multimodal Models (LMMs). However, existing research +primarily focuses on tuning language and image instructions, ignoring the +critical pretraining phase where models learn to process textual and visual +modalities jointly. In this paper, we propose a new pretraining paradigm for +LMMs to enhance the visual comprehension capabilities of LLMs by introducing a +novel cross-modal comprehension stage. Specifically, we design a dynamically +learnable prompt token pool and employ the Hungarian algorithm to replace part +of the original visual tokens with the most relevant prompt tokens. Then, we +conceptualize visual tokens as analogous to a "foreign language" for the LLMs +and propose a mixed attention mechanism with bidirectional visual attention and +unidirectional textual attention to comprehensively enhance the understanding +of visual tokens. Meanwhile, we integrate a detailed caption generation task, +leveraging rich descriptions to further facilitate LLMs in understanding visual +semantic information. After pretraining on 1.5 million publicly accessible +data, we present a new foundation model called Croc. Experimental results +demonstrate that Croc achieves new state-of-the-art performance on massive +vision-language benchmarks. To support reproducibility and facilitate further +research, we release the training code and pre-trained model weights at +https://github.com/deepglint/Croc. + +
+
+ comment: 18 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ Treat Visual Tokens as Text? But Your MLLM Only Needs Fewer Efforts to + See + + +
+ By treating visual tokens from visual encoders as text tokens, Multimodal +Large Language Models (MLLMs) have achieved remarkable progress across diverse +visual understanding tasks, leveraging the robust architectures of Large +Language Models (LLMs). However, as token counts grow, the quadratic scaling of +computation in LLMs introduces a significant efficiency bottleneck, impeding +further scalability. Although recent approaches have explored pruning visual +tokens or employing lighter LLM architectures, the computational overhead from +an increasing number of visual tokens remains a substantial challenge. + In this study, we investigate the redundancy in visual computation at both +the parameter and computational pattern levels within LLaVA, a representative +MLLM, and introduce a suite of streamlined strategies to enhance efficiency. +These include neighbor-aware visual token attention, pruning of inactive visual +attention heads, and selective layer dropping for visual computations. By +implementing these strategies in LLaVA, we achieve a reduction in computational +demands of 88% while maintaining model performance across key benchmarks. +Additionally, we validate the existence of visual computational redundancy in +other MLLMs, such as Qwen2-VL-7B and InternVL-2.0-4B/8B/26B. These results +present a novel pathway for MLLMs to handle dense visual tokens with minimal +computational costs. Code and model checkpoints will be released to support +further research. + +
+
+
+
+
+ + ♻ ☆ I2VControl: Disentangled and Unified Video Motion Synthesis Control + + +
+ Video synthesis techniques are undergoing rapid progress, with +controllability being a significant aspect of practical usability for +end-users. Although text condition is an effective way to guide video +synthesis, capturing the correct joint distribution between text descriptions +and video motion remains a substantial challenge. In this paper, we present a +disentangled and unified framework, namely I2VControl, that unifies multiple +motion control tasks in image-to-video synthesis. Our approach partitions the +video into individual motion units and represents each unit with disentangled +control signals, which allows for various control types to be flexibly combined +within our single system. Furthermore, our methodology seamlessly integrates as +a plug-in for pre-trained models and remains agnostic to specific model +architectures. We conduct extensive experiments, achieving excellent +performance on various control tasks, and our method further facilitates +user-driven creative combinations, enhancing innovation and creativity. The +project page is: https://wanquanf.github.io/I2VControl . + +
+
+ comment: Project page: https://wanquanf.github.io/I2VControl +
+
+
+
+
+ + ♻ ☆ Exploring Homogeneous and Heterogeneous Consistent Label Associations + for Unsupervised Visible-Infrared Person ReID + + +
+ Unsupervised visible-infrared person re-identification (USL-VI-ReID) +endeavors to retrieve pedestrian images of the same identity from different +modalities without annotations. While prior work focuses on establishing +cross-modality pseudo-label associations to bridge the modality-gap, they +ignore maintaining the instance-level homogeneous and heterogeneous consistency +between the feature space and the pseudo-label space, resulting in coarse +associations. In response, we introduce a Modality-Unified Label Transfer +(MULT) module that simultaneously accounts for both homogeneous and +heterogeneous fine-grained instance-level structures, yielding high-quality +cross-modality label associations. It models both homogeneous and heterogeneous +affinities, leveraging them to quantify the inconsistency between the +pseudo-label space and the feature space, subsequently minimizing it. The +proposed MULT ensures that the generated pseudo-labels maintain alignment +across modalities while upholding structural consistency within intra-modality. +Additionally, a straightforward plug-and-play Online Cross-memory Label +Refinement (OCLR) module is proposed to further mitigate the side effects of +noisy pseudo-labels while simultaneously aligning different modalities, coupled +with an Alternative Modality-Invariant Representation Learning (AMIRL) +framework. Experiments demonstrate that our proposed method outperforms +existing state-of-the-art USL-VI-ReID methods, highlighting the superiority of +our MULT in comparison to other cross-modality association methods. Code is +available at https://github.com/FranklinLingfeng/code_for_MULT. + +
+
+ comment: Accepted by IJCV2024 +
+
+
+
+
+ + ♻ ☆ A Deep Learning Approach to Predict the Fall [of Price] of + Cryptocurrency Long Before its Actual Fall + + +
+ In modern times, the cryptocurrency market is one of the world's most rapidly
+rising financial markets. The cryptocurrency market is regarded to be more
+volatile and illiquid than traditional markets such as equities, foreign
+exchange, and commodities. The risk of this market creates an uncertain
+condition among the investors. The purpose of this research is to predict the
+magnitude of the risk factor of the cryptocurrency market. Risk factor is also
+called volatility. Our approach will assist people who invest in the
+cryptocurrency market by overcoming the problems and difficulties they
+experience. Our approach starts with calculating the risk factor of the
+cryptocurrency market from the existing parameters. In twenty elements of the
+cryptocurrency market, the risk factor has been predicted using different
+machine learning algorithms such as CNN, LSTM, BiLSTM, and GRU. All of the
+models have been applied to the calculated risk factor parameter. A new model
+has been developed to predict better than the existing models. Our proposed
+model gives the highest RMSE value of 1.3229 and the lowest RMSE value of
+0.0089. Following our model, it will be easier for investors to trade in
+complicated and challenging financial assets like bitcoin, Ethereum, dogecoin,
+etc. For the other existing models, the highest RMSE was 14.5092, and the
+lowest was 0.02769. So, the proposed model performs much better than the
+existing models, with proper generalization. Using our approach, it will be
+easier for investors to trade in complicated and challenging financial assets
+like Bitcoin, Ethereum, and Dogecoin.
+
+
+ comment: I am writing to formally request the withdrawal, which is necessary + due to issues with the author list and the need for improvements to the + manuscript. We apologize for any inconvenience caused by this request and + appreciate your understanding +
+
+
+
+
+ + ♻ ☆ ViBiDSampler: Enhancing Video Interpolation Using Bidirectional + Diffusion Sampler + + +
+ Recent progress in large-scale text-to-video (T2V) and image-to-video (I2V) +diffusion models has greatly enhanced video generation, especially in terms of +keyframe interpolation. However, current image-to-video diffusion models, while +powerful in generating videos from a single conditioning frame, need adaptation +for two-frame (start & end) conditioned generation, which is essential for +effective bounded interpolation. Unfortunately, existing approaches that fuse +temporally forward and backward paths in parallel often suffer from +off-manifold issues, leading to artifacts or requiring multiple iterative +re-noising steps. In this work, we introduce a novel, bidirectional sampling +strategy to address these off-manifold issues without requiring extensive +re-noising or fine-tuning. Our method employs sequential sampling along both +forward and backward paths, conditioned on the start and end frames, +respectively, ensuring more coherent and on-manifold generation of intermediate +frames. Additionally, we incorporate advanced guidance techniques, CFG++ and +DDS, to further enhance the interpolation process. By integrating these, our +method achieves state-of-the-art performance, efficiently generating +high-quality, smooth videos between keyframes. On a single 3090 GPU, our method +can interpolate 25 frames at 1024 x 576 resolution in just 195 seconds, +establishing it as a leading solution for keyframe interpolation. + +
+
+ comment: Project page: https://vibidsampler.github.io/ +
+
+
+
+
+ + ♻ ☆ Solving Video Inverse Problems Using Image Diffusion Models + + +
+ Recently, diffusion model-based inverse problem solvers (DIS) have emerged as +state-of-the-art approaches for addressing inverse problems, including image +super-resolution, deblurring, inpainting, etc. However, their application to +video inverse problems arising from spatio-temporal degradation remains largely +unexplored due to the challenges in training video diffusion models. To address +this issue, here we introduce an innovative video inverse solver that leverages +only image diffusion models. Specifically, by drawing inspiration from the +success of the recent decomposed diffusion sampler (DDS), our method treats the +time dimension of a video as the batch dimension of image diffusion models and +solves spatio-temporal optimization problems within denoised spatio-temporal +batches derived from each image diffusion model. Moreover, we introduce a +batch-consistent diffusion sampling strategy that encourages consistency across +batches by synchronizing the stochastic noise components in image diffusion +models. Our approach synergistically combines batch-consistent sampling with +simultaneous optimization of denoised spatio-temporal batches at each reverse +diffusion step, resulting in a novel and efficient diffusion sampling strategy +for video inverse problems. Experimental results demonstrate that our method +effectively addresses various spatio-temporal degradations in video inverse +problems, achieving state-of-the-art reconstructions. Project page: +https://svi-diffusion.github.io + +
+
+ comment: 22 pages, 16 figures +
+
+
+
+
+
+
+
+ + Artificial Intelligence 25 + +
+
+
+ + ♻ ☆ Are Large Language Models Memorizing Bug Benchmarks? + + +
+ Large Language Models (LLMs) have become integral to various software +engineering tasks, including code generation, bug detection, and repair. To +evaluate model performance in these domains, numerous bug benchmarks containing +real-world bugs from software projects have been developed. However, a growing +concern within the software engineering community is that these benchmarks may +not reliably reflect true LLM performance due to the risk of data leakage. +Despite this concern, limited research has been conducted to quantify the +impact of potential leakage. In this paper, we systematically evaluate popular +LLMs to assess their susceptibility to data leakage from widely used bug +benchmarks. To identify potential leakage, we use multiple metrics, including a +study of benchmark membership within commonly used training datasets, as well +as analyses of negative log-likelihood and n-gram accuracy. Our findings show +that certain models, in particular codegen-multi, exhibit significant evidence +of memorization in widely used benchmarks like Defects4J, while newer models +trained on larger datasets like LLaMa 3.1 exhibit limited signs of leakage. +These results highlight the need for careful benchmark selection and the +adoption of robust metrics to adequately assess models capabilities. + +
+
+ comment: preprint +
+
+
+
+
+ + ♻ ☆ A Review of Prominent Paradigms for LLM-Based Agents: Tool Use + (Including RAG), Planning, and Feedback Learning + + +
+ Tool use, planning, and feedback learning are currently three prominent +paradigms for developing Large Language Model (LLM)-based agents across various +tasks. Although numerous frameworks have been devised for each paradigm, their +intricate workflows and inconsistent taxonomy create challenges in +understanding and reviewing the frameworks across different paradigms. This +survey introduces a unified taxonomy to systematically review and discuss these +frameworks. Specifically, 1) the taxonomy defines environments/tasks, common +LLM-profiled roles or LMPRs (policy models, evaluators, and dynamic models), +and universally applicable workflows found in prior work, and 2) it enables a +comparison of key perspectives on the implementations of LMPRs and workflow +designs across different agent paradigms and frameworks. 3) Finally, we +identify three limitations in existing workflow designs and systematically +discuss the future work. Resources have been made publicly available at in our +GitHub repository https://github.com/xinzhel/LLM-Agent-Survey. + +
+
+ comment: CoLing 2025 Camera Ready (extended to 9 pages) +
+
+
+
+
+ + ♻ ☆ A Survey on Large Language Model-empowered Autonomous Driving + + +
+ Artificial intelligence (AI) plays a crucial role in autonomous driving (AD)
+research, propelling its development towards intelligence and efficiency.
+Currently, the development of AD technology follows two main technical paths:
+modularization and end-to-end. Modularization decomposes the driving task into
+modules such as perception, prediction, planning, and control, and trains them
+separately. Due to the inconsistency of training objectives between modules,
+the integrated effect suffers from bias. End-to-end attempts to address this
+issue by utilizing a single model that directly maps from sensor data to
+control signals. This path has limited learning capabilities in a comprehensive
+set of features and struggles to handle unpredictable long-tail events and
+complex urban traffic scenarios. In the face of challenges encountered in both
+paths, many researchers believe that large language models (LLMs) with powerful
+reasoning capabilities and extensive knowledge understanding may be the
+solution, expecting LLMs to provide AD systems with deeper levels of
+understanding and decision-making capabilities. To
+understand if LLMs could enhance AD, this paper conducts a thorough analysis of
+the potential applications of LLMs in AD systems, including exploring their
+optimization strategies in both modular and end-to-end approaches, with a
+particular focus on how LLMs can tackle the problems and challenges present in
+current solutions. Furthermore, we discuss an important question: Can LLM-based
+artificial general intelligence (AGI) be a key to achieving high-level AD? We
+further analyze the potential limitations and challenges that LLMs may
+encounter in promoting the development of AD technology.
+
+
+
+
+
+ + ♻ ☆ LLM Pruning and Distillation in Practice: The Minitron Approach + + +
+ We present a comprehensive report on compressing the Llama 3.1 8B and Mistral +NeMo 12B models to 4B and 8B parameters, respectively, using pruning and +distillation. We explore two distinct pruning strategies: (1) depth pruning and +(2) joint hidden/attention/MLP (width) pruning, and evaluate the results on +common benchmarks from the LM Evaluation Harness. The models are then aligned +with NeMo Aligner and tested in instruct-tuned versions. This approach produces +a compelling 4B model from Llama 3.1 8B and a state-of-the-art +Mistral-NeMo-Minitron-8B (MN-Minitron-8B for brevity) model from Mistral NeMo +12B. We found that with no access to the original data, it is beneficial to +slightly fine-tune teacher models on the distillation dataset. We open-source +our base model weights on Hugging Face with a permissive license. + +
+
+ comment: v3: Update author list, other changes +
+
+
+
+
+ + ♻ ☆ CEASEFIRE: An AI-powered system for combatting illicit firearms + trafficking + + +
+ Modern technologies have led illicit firearms trafficking to partially merge +with cybercrime, while simultaneously permitting its off-line aspects to become +more sophisticated. Law enforcement officers face difficult challenges that +require hi-tech solutions. This article presents a real-world system, powered +by advanced Artificial Intelligence, for facilitating them in their everyday +work. + +
+
+
+
+
+ + ♻ ☆ Test Time Learning for Time Series Forecasting + + +
+ Time-series forecasting has seen significant advancements with the +introduction of token prediction mechanisms such as multi-head attention. +However, these methods often struggle to achieve the same performance as in +language modeling, primarily due to the quadratic computational cost and the +complexity of capturing long-range dependencies in time-series data. +State-space models (SSMs), such as Mamba, have shown promise in addressing +these challenges by offering efficient solutions with linear RNNs capable of +modeling long sequences with larger context windows. However, there remains +room for improvement in accuracy and scalability. + We propose the use of Test-Time Training (TTT) modules in a parallel +architecture to enhance performance in long-term time series forecasting. +Through extensive experiments on standard benchmark datasets, we demonstrate +that TTT modules consistently outperform state-of-the-art models, including the +Mamba-based TimeMachine, particularly in scenarios involving extended sequence +and prediction lengths. Our results show significant improvements in Mean +Squared Error (MSE) and Mean Absolute Error (MAE), especially on larger +datasets such as Electricity, Traffic, and Weather, underscoring the +effectiveness of TTT in capturing long-range dependencies. Additionally, we +explore various convolutional architectures within the TTT framework, showing +that even simple configurations like 1D convolution with small filters can +achieve competitive results. This work sets a new benchmark for time-series +forecasting and lays the groundwork for future research in scalable, +high-performance forecasting models. + +
+
+
+
+
+ + ♻ ☆ Is Behavior Cloning All You Need? Understanding Horizon in Imitation + Learning NeurIPS 2024 + + +
+ Imitation learning (IL) aims to mimic the behavior of an expert in a +sequential decision making task by learning from demonstrations, and has been +widely applied to robotics, autonomous driving, and autoregressive text +generation. The simplest approach to IL, behavior cloning (BC), is thought to +incur sample complexity with unfavorable quadratic dependence on the problem +horizon, motivating a variety of different online algorithms that attain +improved linear horizon dependence under stronger assumptions on the data and +the learner's access to the expert. + We revisit the apparent gap between offline and online IL from a +learning-theoretic perspective, with a focus on the realizable/well-specified +setting with general policy classes up to and including deep neural networks. +Through a new analysis of behavior cloning with the logarithmic loss, we show +that it is possible to achieve horizon-independent sample complexity in offline +IL whenever (i) the range of the cumulative payoffs is controlled, and (ii) an +appropriate notion of supervised learning complexity for the policy class is +controlled. Specializing our results to deterministic, stationary policies, we +show that the gap between offline and online IL is smaller than previously +thought: (i) it is possible to achieve linear dependence on horizon in offline +IL under dense rewards (matching what was previously only known to be +achievable in online IL); and (ii) without further assumptions on the policy +class, online IL cannot improve over offline IL with the logarithmic loss, even +in benign MDPs. We complement our theoretical results with experiments on +standard RL tasks and autoregressive language generation to validate the +practical relevance of our findings. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Imagination Policy: Using Generative Point Cloud Models for Learning + Manipulation Policies + + +
+ Humans can imagine goal states during planning and perform actions to match +those goals. In this work, we propose Imagination Policy, a novel multi-task +key-frame policy network for solving high-precision pick and place tasks. +Instead of learning actions directly, Imagination Policy generates point clouds +to imagine desired states which are then translated to actions using rigid +action estimation. This transforms action inference into a local generative +task. We leverage pick and place symmetries underlying the tasks in the +generation process and achieve extremely high sample efficiency and +generalizability to unseen configurations. Finally, we demonstrate +state-of-the-art performance across various tasks on the RLbench benchmark +compared with several strong baselines and validate our approach on a real +robot. + +
+
+
+
+
+ + ♻ ☆ Provable Privacy Advantages of Decentralized Federated Learning via + Distributed Optimization + + +
+ Federated learning (FL) emerged as a paradigm designed to improve data +privacy by enabling data to reside at its source, thus embedding privacy as a +core consideration in FL architectures, whether centralized or decentralized. +Contrasting with recent findings by Pasquini et al., which suggest that +decentralized FL does not empirically offer any additional privacy or security +benefits over centralized models, our study provides compelling evidence to the +contrary. We demonstrate that decentralized FL, when deploying distributed +optimization, provides enhanced privacy protection - both theoretically and +empirically - compared to centralized approaches. The challenge of quantifying +privacy loss through iterative processes has traditionally constrained the +theoretical exploration of FL protocols. We overcome this by conducting a +pioneering in-depth information-theoretical privacy analysis for both +frameworks. Our analysis, considering both eavesdropping and passive adversary +models, successfully establishes bounds on privacy leakage. We show information +theoretically that the privacy loss in decentralized FL is upper bounded by the +loss in centralized FL. Compared to the centralized case where local gradients +of individual participants are directly revealed, a key distinction of +optimization-based decentralized FL is that the relevant information includes +differences of local gradients over successive iterations and the aggregated +sum of different nodes' gradients over the network. This information +complicates the adversary's attempt to infer private data. To bridge our +theoretical insights with practical applications, we present detailed case +studies involving logistic regression and deep neural networks. These examples +demonstrate that while privacy leakage remains comparable in simpler models, +complex models like deep neural networks exhibit lower privacy risks under +decentralized FL. + +
+
+
+
+
+ + ♻ ☆ FedMKT: Federated Mutual Knowledge Transfer for Large and Small Language + Models + + +
+ Recent research in federated large language models (LLMs) has primarily +focused on enabling clients to fine-tune their locally deployed homogeneous +LLMs collaboratively or on transferring knowledge from server-based LLMs to +small language models (SLMs) at downstream clients. However, a significant gap +remains in the simultaneous mutual enhancement of both the server's LLM and +clients' SLMs. To bridge this gap, we propose FedMKT, a parameter-efficient +federated mutual knowledge transfer framework for large and small language +models. This framework is designed to adaptively transfer knowledge from the +server's LLM to clients' SLMs while concurrently enriching the LLM with +clients' unique domain insights. We facilitate token alignment using minimum +edit distance (MinED) and then selective mutual knowledge transfer between +client-side SLMs and a server-side LLM, aiming to collectively enhance their +performance. Through extensive experiments across three distinct scenarios, we +evaluate the effectiveness of FedMKT using various public LLMs and SLMs on a +range of NLP text generation tasks. Empirical results demonstrate that FedMKT +simultaneously boosts the performance of both LLMs and SLMs. + +
+
+
+
+
+ + ♻ ☆ A Concept-Based Explainability Framework for Large Multimodal Models NeurIPS 2024 + + +
+ Large multimodal models (LMMs) combine unimodal encoders and large language +models (LLMs) to perform multimodal tasks. Despite recent advancements towards +the interpretability of these models, understanding internal representations of +LMMs remains largely a mystery. In this paper, we present a novel framework for +the interpretation of LMMs. We propose a dictionary learning based approach, +applied to the representation of tokens. The elements of the learned dictionary +correspond to our proposed concepts. We show that these concepts are well +semantically grounded in both vision and text. Thus we refer to these as +``multi-modal concepts''. We qualitatively and quantitatively evaluate the +results of the learnt concepts. We show that the extracted multimodal concepts +are useful to interpret representations of test samples. Finally, we evaluate +the disentanglement between different concepts and the quality of grounding +concepts visually and textually. Our code is publicly available at +https://github.com/mshukor/xl-vlms + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Kalahi: A handcrafted, grassroots cultural LLM evaluation suite for + Filipino + + +
+ Multilingual large language models (LLMs) today may not necessarily provide +culturally appropriate and relevant responses to its Filipino users. We +introduce Kalahi, a cultural LLM evaluation suite collaboratively created by +native Filipino speakers. It is composed of 150 high-quality, handcrafted and +nuanced prompts that test LLMs for generations that are relevant to shared +Filipino cultural knowledge and values. Strong LLM performance in Kalahi +indicates a model's ability to generate responses similar to what an average +Filipino would say or do in a given situation. We conducted experiments on LLMs +with multilingual and Filipino language support. Results show that Kalahi, +while trivial for Filipinos, is challenging for LLMs, with the best model +answering only 46.0% of the questions correctly compared to native Filipino +performance of 89.10%. Thus, Kalahi can be used to accurately and reliably +evaluate Filipino cultural representation in LLMs. + +
+
+ comment: Accepted for presentation at Paclic 38, 2024 +
+
+
+
+
+ + ♻ ☆ Combining Threat Intelligence with IoT Scanning to Predict Cyber Attack + + +
+ While the Web has become a worldwide platform for communication, hackers and
+hacktivists share their ideology and communicate with members on the "Dark
+Web"-the reverse of the Web. Currently, the problems of information overload
+and the difficulty of obtaining a comprehensive picture of hackers and cyber-attackers
+hinder the effective analysis of predicting their activities on the Web. Also,
+there are currently more objects connected to the internet than there are
+people in the world and this gap will continue to grow as more and more objects
+gain the ability to directly interface with the Internet. Many technical
+communities are vigorously pursuing research topics that contribute to the
+Internet of Things (IoT). In this paper I have proposed a novel methodology for
+collecting and analyzing the Dark Web information to identify websites of
+hackers from the Web sea, and how this information can help us in predicting
+IoT vulnerabilities. This methodology incorporates information collection,
+analysis, visualization techniques, and exploits some of the IoT devices.
+Through this research I want to contribute to the existing literature on
+cyber-security that could potentially guide in both policy-making and
+intelligence research.
+
+</p>
+
+ comment: 8 pages, 6 figures, 2 tables. This manuscript has been submitted to + Springer for review (Manuscript ID: PDSE-D-24-00163) and is under + consideration. It has not yet been peer-reviewed or published. Researchers + are welcome to read and build upon this work; please cite it appropriately. + For questions or clarifications, feel free to contact me +
+
+
+
+
+ + ♻ ☆ ReconX: Reconstruct Any Scene from Sparse Views with Video Diffusion + Model + + +
+ Advancements in 3D scene reconstruction have transformed 2D images from the +real world into 3D models, producing realistic 3D results from hundreds of +input photos. Despite great success in dense-view reconstruction scenarios, +rendering a detailed scene from insufficient captured views is still an +ill-posed optimization problem, often resulting in artifacts and distortions in +unseen areas. In this paper, we propose ReconX, a novel 3D scene reconstruction +paradigm that reframes the ambiguous reconstruction challenge as a temporal +generation task. The key insight is to unleash the strong generative prior of +large pre-trained video diffusion models for sparse-view reconstruction. +However, 3D view consistency struggles to be accurately preserved in directly +generated video frames from pre-trained models. To address this, given limited +input views, the proposed ReconX first constructs a global point cloud and +encodes it into a contextual space as the 3D structure condition. Guided by the +condition, the video diffusion model then synthesizes video frames that are +both detail-preserved and exhibit a high degree of 3D consistency, ensuring the +coherence of the scene from various perspectives. Finally, we recover the 3D +scene from the generated video through a confidence-aware 3D Gaussian Splatting +optimization scheme. Extensive experiments on various real-world datasets show +the superiority of our ReconX over state-of-the-art methods in terms of quality +and generalizability. + +
+
+ comment: Project page: https://liuff19.github.io/ReconX +
+
+
+
+
+ + ♻ ☆ Unraveling Movie Genres through Cross-Attention Fusion of Bi-Modal + Synergy of Poster + + +
+ Movie posters are not just decorative; they are meticulously designed to +capture the essence of a movie, such as its genre, storyline, and tone/vibe. +For decades, movie posters have graced cinema walls, billboards, and now our +digital screens as a form of digital posters. Movie genre classification plays +a pivotal role in film marketing, audience engagement, and recommendation +systems. Previous explorations into movie genre classification have been mostly +examined in plot summaries, subtitles, trailers and movie scenes. Movie posters +provide a pre-release tantalizing glimpse into a film's key aspects, which can +ignite public interest. In this paper, we presented the framework that exploits +movie posters from a visual and textual perspective to address the multilabel +movie genre classification problem. Firstly, we extracted text from movie +posters using an OCR and retrieved the relevant embedding. Next, we introduce a +cross-attention-based fusion module to allocate attention weights to visual and +textual embedding. In validating our framework, we utilized 13882 posters +sourced from the Internet Movie Database (IMDb). The outcomes of the +experiments indicate that our model exhibited promising performance and +outperformed even some prominent contemporary architectures. + +
+
+
+
+
+ + ♻ ☆ Nl2Hltl2Plan: Scaling Up Natural Language Understanding for Multi-Robots + Through Hierarchical Temporal Logic Task Representation + + +
+ To enable non-experts to specify long-horizon, multi-robot collaborative
+tasks, language models are increasingly used to translate natural language
+commands into formal specifications. However, because translation can occur in
+multiple ways, such translations may lack accuracy or lead to inefficient
+multi-robot planning. Our key insight is that concise hierarchical
+specifications can simplify planning while remaining straightforward to derive
+from human instructions. We propose Nl2Hltl2Plan, a framework that translates
+natural language commands into hierarchical Linear Temporal Logic (LTL) and
+solves the corresponding planning problem. The translation involves two steps
+leveraging Large Language Models (LLMs). First, an LLM transforms instructions
+into a Hierarchical Task Tree, capturing logical and temporal relations. Next,
+a fine-tuned LLM converts sub-tasks into flat LTL formulas, which are
+aggregated into hierarchical specifications, with the lowest level
+corresponding to ordered robot actions. These specifications are then used with
+off-the-shelf planners. Our Nl2Hltl2Plan demonstrates the potential of LLMs in
+hierarchical reasoning for multi-robot task planning. Evaluations in simulation
+and real-world experiments with human participants show that Nl2Hltl2Plan
+outperforms existing methods, handling more complex instructions while
+achieving higher success rates and lower costs in task allocation and planning.
+Additional details are available at https://nl2hltl2plan.github.io .
+
+</p>
+
+
+
+
+ + ♻ ☆ Stock Recommendations for Individual Investors: A Temporal Graph Network + Approach with Mean-Variance Efficient Sampling + + +
+ Recommender systems can be helpful for individuals to make well-informed +decisions in complex financial markets. While many studies have focused on +predicting stock prices, even advanced models fall short of accurately +forecasting them. Additionally, previous studies indicate that individual +investors often disregard established investment theories, favoring their +personal preferences instead. This presents a challenge for stock +recommendation systems, which must not only provide strong investment +performance but also respect these individual preferences. To create effective +stock recommender systems, three critical elements must be incorporated: 1) +individual preferences, 2) portfolio diversification, and 3) the temporal +dynamics of the first two. In response, we propose a new model, Portfolio +Temporal Graph Network Recommender PfoTGNRec, which can handle time-varying +collaborative signals and incorporates diversification-enhancing sampling. On +real-world individual trading data, our approach demonstrates superior +performance compared to state-of-the-art baselines, including cutting-edge +dynamic embedding models and existing stock recommendation models. Indeed, we +show that PfoTGNRec is an effective solution that can balance customer +preferences with the need to suggest portfolios with high Return-on-Investment. +The source code and data are available at +https://github.com/youngandbin/PfoTGNRec. + +
+
+ comment: ICAIF 2024 (https://dl.acm.org/doi/10.1145/3677052.3698662) +
+
+
+
+
+ + ♻ ☆ Noise-powered Multi-modal Knowledge Graph Representation Framework COLING 2025 + + +
+ The rise of Multi-modal Pre-training highlights the necessity for a unified +Multi-Modal Knowledge Graph (MMKG) representation learning framework. Such a +framework is essential for embedding structured knowledge into multi-modal +Large Language Models effectively, alleviating issues like knowledge +misconceptions and multi-modal hallucinations. In this work, we explore the +efficacy of models in accurately embedding entities within MMKGs through two +pivotal tasks: Multi-modal Knowledge Graph Completion (MKGC) and Multi-modal +Entity Alignment (MMEA). Building on this foundation, we propose a novel SNAG +method that utilizes a Transformer-based architecture equipped with +modality-level noise masking to robustly integrate multi-modal entity features +in KGs. By incorporating specific training objectives for both MKGC and MMEA, +our approach achieves SOTA performance across a total of ten datasets, +demonstrating its versatility. Moreover, SNAG can not only function as a +standalone model but also enhance other existing methods, providing stable +performance improvements. Code and data are available at +https://github.com/zjukg/SNAG. + +
+
+ comment: COLING 2025 Accepted, Repo is available at
+ https://github.com/zjukg/SNAG
+</p>
+
+
+
+
+ + ♻ ☆ Exploring Homogeneous and Heterogeneous Consistent Label Associations + for Unsupervised Visible-Infrared Person ReID + + +
+ Unsupervised visible-infrared person re-identification (USL-VI-ReID) +endeavors to retrieve pedestrian images of the same identity from different +modalities without annotations. While prior work focuses on establishing +cross-modality pseudo-label associations to bridge the modality-gap, they +ignore maintaining the instance-level homogeneous and heterogeneous consistency +between the feature space and the pseudo-label space, resulting in coarse +associations. In response, we introduce a Modality-Unified Label Transfer +(MULT) module that simultaneously accounts for both homogeneous and +heterogeneous fine-grained instance-level structures, yielding high-quality +cross-modality label associations. It models both homogeneous and heterogeneous +affinities, leveraging them to quantify the inconsistency between the +pseudo-label space and the feature space, subsequently minimizing it. The +proposed MULT ensures that the generated pseudo-labels maintain alignment +across modalities while upholding structural consistency within intra-modality. +Additionally, a straightforward plug-and-play Online Cross-memory Label +Refinement (OCLR) module is proposed to further mitigate the side effects of +noisy pseudo-labels while simultaneously aligning different modalities, coupled +with an Alternative Modality-Invariant Representation Learning (AMIRL) +framework. Experiments demonstrate that our proposed method outperforms +existing state-of-the-art USL-VI-ReID methods, highlighting the superiority of +our MULT in comparison to other cross-modality association methods. Code is +available at https://github.com/FranklinLingfeng/code_for_MULT. + +
+
+ comment: Accepted by IJCV2024 +
+
+
+
+
+ + ♻ ☆ AgentOps: Enabling Observability of LLM Agents + + +
+ Large language model (LLM) agents have demonstrated remarkable capabilities
+across various domains, gaining extensive attention from academia and industry.
+However, these agents raise significant concerns on AI safety due to their
+autonomous and non-deterministic behavior, as well as continuously evolving
+nature. From a DevOps perspective, enabling observability in agents is
+necessary to ensure AI safety, as stakeholders can gain insights into the
+agents' inner workings, allowing them to proactively understand the agents,
+detect anomalies, and prevent potential failures. Therefore, in this paper, we
+present a comprehensive taxonomy of AgentOps, identifying the artifacts and
+associated data that should be traced throughout the entire lifecycle of agents
+to achieve effective observability. The taxonomy is developed based on a
+systematic mapping study of existing AgentOps tools. Our taxonomy serves as a
+reference template for developers to design and implement AgentOps
+infrastructure that supports monitoring, logging, and analytics, thereby
+ensuring AI safety.
+
+</p>
+
+ comment: 12 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ ViBiDSampler: Enhancing Video Interpolation Using Bidirectional + Diffusion Sampler + + +
+ Recent progress in large-scale text-to-video (T2V) and image-to-video (I2V) +diffusion models has greatly enhanced video generation, especially in terms of +keyframe interpolation. However, current image-to-video diffusion models, while +powerful in generating videos from a single conditioning frame, need adaptation +for two-frame (start & end) conditioned generation, which is essential for +effective bounded interpolation. Unfortunately, existing approaches that fuse +temporally forward and backward paths in parallel often suffer from +off-manifold issues, leading to artifacts or requiring multiple iterative +re-noising steps. In this work, we introduce a novel, bidirectional sampling +strategy to address these off-manifold issues without requiring extensive +re-noising or fine-tuning. Our method employs sequential sampling along both +forward and backward paths, conditioned on the start and end frames, +respectively, ensuring more coherent and on-manifold generation of intermediate +frames. Additionally, we incorporate advanced guidance techniques, CFG++ and +DDS, to further enhance the interpolation process. By integrating these, our +method achieves state-of-the-art performance, efficiently generating +high-quality, smooth videos between keyframes. On a single 3090 GPU, our method +can interpolate 25 frames at 1024 x 576 resolution in just 195 seconds, +establishing it as a leading solution for keyframe interpolation. + +
+
+ comment: Project page: https://vibidsampler.github.io/ +
+
+
+
+
+ + ♻ ☆ Towards Neural Scaling Laws on Graphs + + +
+ Deep graph models (e.g., graph neural networks and graph transformers) have +become important techniques for leveraging knowledge across various types of +graphs. Yet, the neural scaling laws on graphs, i.e., how the performance of +deep graph models changes with model and dataset sizes, have not been +systematically investigated, casting doubts on the feasibility of achieving +large graph models. To fill this gap, we benchmark many graph datasets from +different tasks and make an attempt to establish the neural scaling laws on +graphs from both model and data perspectives. The model size we investigated is +up to 100 million parameters, and the dataset size investigated is up to 50 +million samples. We first verify the validity of such laws on graphs, +establishing proper formulations to describe the scaling behaviors. For model +scaling, we identify that despite the parameter numbers, the model depth also +plays an important role in affecting the model scaling behaviors, which differs +from observations in other domains such as CV and NLP. For data scaling, we +suggest that the number of graphs can not effectively measure the graph data +volume in scaling law since the sizes of different graphs are highly irregular. +Instead, we reform the data scaling law with the number of nodes or edges as +the metric to address the irregular graph sizes. We further demonstrate that +the reformed law offers a unified view of the data scaling behaviors for +various fundamental graph tasks including node classification, link prediction, +and graph classification. This work provides valuable insights into neural +scaling laws on graphs, which can serve as an important tool for collecting new +graph data and developing large graph models. + +
+
+
+
+
+ + ♻ ☆ Solving Video Inverse Problems Using Image Diffusion Models + + +
+ Recently, diffusion model-based inverse problem solvers (DIS) have emerged as +state-of-the-art approaches for addressing inverse problems, including image +super-resolution, deblurring, inpainting, etc. However, their application to +video inverse problems arising from spatio-temporal degradation remains largely +unexplored due to the challenges in training video diffusion models. To address +this issue, here we introduce an innovative video inverse solver that leverages +only image diffusion models. Specifically, by drawing inspiration from the +success of the recent decomposed diffusion sampler (DDS), our method treats the +time dimension of a video as the batch dimension of image diffusion models and +solves spatio-temporal optimization problems within denoised spatio-temporal +batches derived from each image diffusion model. Moreover, we introduce a +batch-consistent diffusion sampling strategy that encourages consistency across +batches by synchronizing the stochastic noise components in image diffusion +models. Our approach synergistically combines batch-consistent sampling with +simultaneous optimization of denoised spatio-temporal batches at each reverse +diffusion step, resulting in a novel and efficient diffusion sampling strategy +for video inverse problems. Experimental results demonstrate that our method +effectively addresses various spatio-temporal degradations in video inverse +problems, achieving state-of-the-art reconstructions. Project page: +https://svi-diffusion.github.io + +
+
+ comment: 22 pages, 16 figures +
+
+
+
+
+ + ♻ ☆ Construction numbers: How to build a graph? + + +
+ A construction sequence for a graph is a listing of the elements of the graph +(the set of vertices and edges) such that each edge follows both its endpoints. +The construction number of the graph is the number of such sequences. We +determine this number for various graph families. + +
+
+ comment: 11 pages +
+
+
+
+
+ + ♻ ☆ A Perspective for Adapting Generalist AI to Specialized Medical AI + Applications and Their Challenges + + +
+ The integration of Large Language Models (LLMs) into medical applications has +sparked widespread interest across the healthcare industry, from drug discovery +and development to clinical decision support, assisting telemedicine, medical +devices, and healthcare insurance applications. This perspective paper aims to +discuss the inner workings of building LLM-powered medical AI applications and +introduces a comprehensive framework for their development. We review existing +literature and outline the unique challenges of applying LLMs in specialized +medical contexts. Additionally, we introduce a three-step framework to organize +medical LLM research activities: 1) Modeling: breaking down complex medical +workflows into manageable steps for developing medical-specific models; 2) +Optimization: optimizing the model performance with crafted prompts and +integrating external knowledge and tools, and 3) System engineering: +decomposing complex tasks into subtasks and leveraging human expertise for +building medical AI applications. Furthermore, we offer a detailed use case +playbook that describes various LLM-powered medical AI applications, such as +optimizing clinical trial design, enhancing clinical decision support, and +advancing medical imaging analysis. Finally, we discuss various challenges and +considerations for building medical AI applications with LLMs, such as handling +hallucination issues, data ownership and compliance, privacy, intellectual +property considerations, compute cost, sustainability issues, and responsible +AI requirements. + +
+
+
+
+
+
+
+
+ + Machine Learning 3 + +
+
+
+ + ♻ ☆ Are Large Language Models Memorizing Bug Benchmarks? + + +
+ Large Language Models (LLMs) have become integral to various software +engineering tasks, including code generation, bug detection, and repair. To +evaluate model performance in these domains, numerous bug benchmarks containing +real-world bugs from software projects have been developed. However, a growing +concern within the software engineering community is that these benchmarks may +not reliably reflect true LLM performance due to the risk of data leakage. +Despite this concern, limited research has been conducted to quantify the +impact of potential leakage. In this paper, we systematically evaluate popular +LLMs to assess their susceptibility to data leakage from widely used bug +benchmarks. To identify potential leakage, we use multiple metrics, including a +study of benchmark membership within commonly used training datasets, as well +as analyses of negative log-likelihood and n-gram accuracy. Our findings show +that certain models, in particular codegen-multi, exhibit significant evidence +of memorization in widely used benchmarks like Defects4J, while newer models +trained on larger datasets like LLaMa 3.1 exhibit limited signs of leakage. +These results highlight the need for careful benchmark selection and the +adoption of robust metrics to adequately assess models capabilities. + +
+
+ comment: preprint +
+
+
+
+
+ + ♻ ☆ Quantum Mixed-State Self-Attention Network + + +
+ Attention mechanisms have revolutionized natural language processing. +Combining them with quantum computing aims to further advance this technology. +This paper introduces a novel Quantum Mixed-State Self-Attention Network +(QMSAN) for natural language processing tasks. Our model leverages quantum +computing principles to enhance the effectiveness of self-attention mechanisms. +QMSAN uses a quantum attention mechanism based on mixed state, allowing for +direct similarity estimation between queries and keys in the quantum domain. +This approach leads to more effective attention coefficient calculations. We +also propose an innovative quantum positional encoding scheme, implemented +through fixed quantum gates within the circuit, improving the model's ability +to capture sequence information without additional qubit resources. In +numerical experiments of text classification tasks on public datasets, QMSAN +outperforms Quantum Self-Attention Neural Network (QSANN). Furthermore, we +demonstrate QMSAN's robustness in different quantum noise environments, +highlighting its potential for near-term quantum devices. + +
+
+
+
+
+ + ♻ ☆ A Survey on Large Language Model-empowered Autonomous Driving + + +
+ Artificial intelligence (AI) plays a crucial role in autonomous driving (AD) +research, propelling its development towards intelligence and efficiency. +Currently, the development of AD technology follows two main technical paths: +modularization and end-to-end. Modularization decompose the driving task into +modules such as perception, prediction, planning, and control, and train them +separately. Due to the inconsistency of training objectives between modules, +the integrated effect suffers from bias. End-to-end attempts to address this +issue by utilizing a single model that directly maps from sensor data to +control signals. This path has limited learning capabilities in a comprehensive +set of features and struggles to handle unpredictable long-tail events and +complex urban traffic scenarios. In the face of challenges encountered in both +paths, many researchers believe that large language models (LLMs) with powerful +reasoning capabilities and extensive knowledge understanding may be the +solution, expecting LLMs to provide AD systems with deeper levels of +understanding and decision-making capabilities. In light of the challenges +faced by both paths, many researchers believe that LLMs, with their powerful +reasoning abilities and extensive knowledge, could offer a solution. To +understand if LLMs could enhance AD, this paper conducts a thorough analysis of +the potential applications of LLMs in AD systems, including exploring their +optimization strategies in both modular and end-to-end approaches, with a +particular focus on how LLMs can tackle the problems and challenges present in +current solutions. Furthermore, we discuss an important question: Can LLM-based +artificial general intelligence (AGI) be a key to achieve high-level AD? We +further analyze the potential limitations and challenges that LLMs may +encounter in promoting the development of AD technology. + +
+
+
+
+
+
+
+
+ + Multimedia 2 + +
+
+
+ + ☆ Hybrid Local-Global Context Learning for Neural Video Compression + + +
+ In neural video codecs, current state-of-the-art methods typically adopt +multi-scale motion compensation to handle diverse motions. These methods +estimate and compress either optical flow or deformable offsets to reduce +inter-frame redundancy. However, flow-based methods often suffer from +inaccurate motion estimation in complicated scenes. Deformable +convolution-based methods are more robust but have a higher bit cost for motion +coding. In this paper, we propose a hybrid context generation module, which +combines the advantages of the above methods in an optimal way and achieves +accurate compensation at a low bit cost. Specifically, considering the +characteristics of features at different scales, we adopt flow-guided +deformable compensation at largest-scale to produce accurate alignment in +detailed regions. For smaller-scale features, we perform flow-based warping to +save the bit cost for motion coding. Furthermore, we design a local-global +context enhancement module to fully explore the local-global information of +previous reconstructed signals. Experimental results demonstrate that our +proposed Hybrid Local-Global Context learning (HLGC) method can significantly +enhance the state-of-the-art methods on standard test datasets. + +
+
+ comment: Accepted to DCC 2024 +
+
+
+
+
+ + ♻ ☆ Unraveling Movie Genres through Cross-Attention Fusion of Bi-Modal + Synergy of Poster + + +
+ Movie posters are not just decorative; they are meticulously designed to +capture the essence of a movie, such as its genre, storyline, and tone/vibe. +For decades, movie posters have graced cinema walls, billboards, and now our +digital screens as a form of digital posters. Movie genre classification plays +a pivotal role in film marketing, audience engagement, and recommendation +systems. Previous explorations into movie genre classification have been mostly +examined in plot summaries, subtitles, trailers and movie scenes. Movie posters +provide a pre-release tantalizing glimpse into a film's key aspects, which can +ignite public interest. In this paper, we presented the framework that exploits +movie posters from a visual and textual perspective to address the multilabel +movie genre classification problem. Firstly, we extracted text from movie +posters using an OCR and retrieved the relevant embedding. Next, we introduce a +cross-attention-based fusion module to allocate attention weights to visual and +textual embedding. In validating our framework, we utilized 13882 posters +sourced from the Internet Movie Database (IMDb). The outcomes of the +experiments indicate that our model exhibited promising performance and +outperformed even some prominent contemporary architectures. + +
+
+
+
+
+
+
+
+ + Genomics 1 + +
+
+
+ + ☆ LLaMA-Gene: A General-purpose Gene Task Large Language Model Based on + Instruction Fine-tuning + + +
+ Building a general-purpose task model similar to ChatGPT has been an +important research direction for gene large language models. Instruction +fine-tuning is a key component in building ChatGPT, but existing instructions +are primarily based on natural language. Natural language and gene sequences +have significant differences in tokenization and encoding. Therefore, +constructing a multilingual model that can handle both natural language and +gene sequences is crucial for solving this problem.In this paper, we expand the +capabilities of the LLaMA large language model to include gene language. This +involves expanding the vocabulary using the Byte Pair Encoding (BPE) method, +specifically tailored for DNA and protein sequences, and conducting further +pre-training on these sequences. We then convert various downstream gene task +data into a unified format for instruction fine-tuning and further fine-tune +the model on this data.Our study demonstrates that a mixed model of gene and +natural language, fine-tuned with instructions, achieves results comparable to +the current state-of-the-art (SOTA) in tasks such as gene classification and +gene sequence interaction. This provides a promising direction for building a +unified large language model for gene tasks. + +
+
+ comment: 15 pages, 2 figures +
+
+
+
+
+
+
+ + + +
+
+ +
+
+ + diff --git a/index.js b/index.js new file mode 100644 index 0000000..69f5da7 --- /dev/null +++ b/index.js @@ -0,0 +1,39 @@ +/* Exapand/Collapse with TAB key */ +var expanded = false; +document.onkeydown = function (e) { + if (e.keyCode === 9) { + expanded = !expanded; + document.querySelectorAll("details").forEach(detail => detail.open = expanded); + return false; + } +}; + +/* Switch Theme */ +const toggleSwitch = document.querySelector('.theme-switch input[type="checkbox"]'); + +function switchTheme(e) { + if (e.target.checked) { + document.documentElement.setAttribute('data-theme', 'light'); + document.getElementById("theme-icon").className = "ri-sun-line"; + localStorage.setItem('theme', 'light'); //add this + } else { + document.documentElement.setAttribute('data-theme', 'dark'); + document.getElementById("theme-icon").className = "ri-moon-line"; + localStorage.setItem('theme', 'dark'); //add this + } +} + +toggleSwitch.addEventListener('change', switchTheme, false); +const currentTheme = localStorage.getItem('theme') ? localStorage.getItem('theme') : null; +if (currentTheme) { + document.documentElement.setAttribute('data-theme', currentTheme); + if (currentTheme === 'light') { + toggleSwitch.checked = true; + } +} + +const timestamp = document.getElementById("build-timestamp"); +const timestamp_local = new Date(timestamp.getAttribute("datetime")).toLocaleString(); + +const badge = document.getElementById("build-timestamp-badge"); +// badge.src = `https://img.shields.io/github/workflow/status/mlnlp-world/myarxiv/Update?=${timestamp_local}&style=for-the-badge`