[
{
"title": "AgentGen: Enhancing Planning Abilities for Large Language Model based Agent via Environment and Task Generation",
"authors": "Mengkang Hu, Pu Zhao, Can Xu, Qingfeng Sun, Jianguang Lou, Qingwei Lin, Ping Luo, Saravan Rajmohan, Dongmei Zhang",
"abstract": "Large Language Model (LLM) based agents have garnered significant attention\nand are becoming increasingly popular. Furthermore, planning ability is a\ncrucial component of an LLM-based agent, involving interaction with the\nenvironment and executing actions to complete a planning task, which generally\nentails achieving a desired goal from an initial state. This paper investigates\nenhancing the planning abilities of LLMs through instruction tuning, referred\nto as agent training. Recent studies have demonstrated that utilizing\nexpert-level trajectory for instruction-tuning LLMs effectively enhances their\nplanning capabilities. However, existing work primarily focuses on synthesizing\ntrajectories from manually designed planning tasks and environments. The\nlabor-intensive nature of creating these environments and tasks impedes the\ngeneration of sufficiently varied and extensive trajectories. To address this\nlimitation, this paper explores the automated synthesis of diverse environments\nand a gradual range of planning tasks, from easy to difficult. We introduce a\nframework, AgentGen, that leverages LLMs first to generate environments and\nsubsequently generate planning tasks conditioned on these environments.\nSpecifically, to improve environmental diversity, we propose using an\ninspiration corpus composed of various domain-specific text segments as the\ncontext for synthesizing environments. Moreover, to increase the difficulty\ndiversity of generated planning tasks, we propose a bidirectional evolution\nmethod, Bi-Evol, that evolves planning tasks from easier and harder directions\nto synthesize a task set with a smoother difficulty curve. The evaluation\nresults derived from AgentBoard show that AgentGen greatly improves LLMs'\nplanning ability, e.g., the AgentGen instruction-tuned Llama-3 8B surpasses\nGPT-3.5 in overall performance. Moreover, in certain tasks, it even outperforms\nGPT-4.",
"arxiv_id": "http://arxiv.org/abs/2408.00764v1",
"pdf_url": "http://arxiv.org/pdf/2408.00764v1",
"primary_category": "cs.CL",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Tamper-Resistant Safeguards for Open-Weight LLMs",
"authors": "Rishub Tamirisa, Bhrugu Bharathi, Long Phan, Andy Zhou, Alice Gatti, Tarun Suresh, Maxwell Lin, Justin Wang, Rowan Wang, Ron Arel, Andy Zou, Dawn Song, Bo Li, Dan Hendrycks, Mantas Mazeika",
"abstract": "Rapid advances in the capabilities of large language models (LLMs) have\nraised widespread concerns regarding their potential for malicious use.\nOpen-weight LLMs present unique challenges, as existing safeguards lack\nrobustness to tampering attacks that modify model weights. For example, recent\nworks have demonstrated that refusal and unlearning safeguards can be trivially\nremoved with a few steps of fine-tuning. These vulnerabilities necessitate new\napproaches for enabling the safe release of open-weight LLMs. We develop a\nmethod, called TAR, for building tamper-resistant safeguards into open-weight\nLLMs such that adversaries cannot remove the safeguards even after thousands of\nsteps of fine-tuning. In extensive evaluations and red teaming analyses, we\nfind that our method greatly improves tamper-resistance while preserving benign\ncapabilities. Our results demonstrate that tamper-resistance is a tractable\nproblem, opening up a promising new avenue to improve the safety and security\nof open-weight LLMs.",
"arxiv_id": "http://arxiv.org/abs/2408.00761v1",
"pdf_url": "http://arxiv.org/pdf/2408.00761v1",
"primary_category": "cs.LG",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Smoothed Energy Guidance: Guiding Diffusion Models with Reduced Energy Curvature of Attention",
"authors": "Susung Hong",
"abstract": "Conditional diffusion models have shown remarkable success in visual content\ngeneration, producing high-quality samples across various domains, largely due\nto classifier-free guidance (CFG). Recent attempts to extend guidance to\nunconditional models have relied on heuristic techniques, resulting in\nsuboptimal generation quality and unintended effects. In this work, we propose\nSmoothed Energy Guidance (SEG), a novel training- and condition-free approach\nthat leverages the energy-based perspective of the self-attention mechanism to\nenhance image generation. By defining the energy of self-attention, we\nintroduce a method to reduce the curvature of the energy landscape of attention\nand use the output as the unconditional prediction. Practically, we control the\ncurvature of the energy landscape by adjusting the Gaussian kernel parameter\nwhile keeping the guidance scale parameter fixed. Additionally, we present a\nquery blurring method that is equivalent to blurring the entire attention\nweights without incurring quadratic complexity in the number of tokens. In our\nexperiments, SEG achieves a Pareto improvement in both quality and the\nreduction of side effects. The code is available at\n\\url{https://github.com/SusungHong/SEG-SDXL}.",
"arxiv_id": "http://arxiv.org/abs/2408.00760v1",
"pdf_url": "http://arxiv.org/pdf/2408.00760v1",
"primary_category": "cs.CV",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Coarse Correspondence Elicit 3D Spacetime Understanding in Multimodal Language Model",
"authors": "Benlin Liu, Yuhao Dong, Yiqin Wang, Yongming Rao, Yansong Tang, Wei-Chiu Ma, Ranjay Krishna",
"abstract": "Multimodal language models (MLLMs) are increasingly being implemented in\nreal-world environments, necessitating their ability to interpret 3D spaces and\ncomprehend temporal dynamics. Despite their potential, current top models\nwithin our community still fall short in adequately understanding spatial and\ntemporal dimensions. We introduce Coarse Correspondence, a simple,\ntraining-free, effective, and general-purpose visual prompting method to elicit\n3D and temporal understanding in multimodal LLMs. Our method uses a lightweight\ntracking model to find object correspondences between frames in a video or\nbetween sets of image viewpoints. It selects the most frequent object instances\nand visualizes them with markers with unique IDs in the image. With this simple\napproach, we achieve state-of-the-art results on 3D understanding benchmarks\nincluding ScanQA (+20.5\\%) and a subset of OpenEQA (+9.7\\%), and on long-form\nvideo benchmarks such as EgoSchema (+6.0\\%). We also curate a small diagnostic\ndataset to evaluate whether MLLMs can reason about space from a described\nviewpoint other than the camera viewpoint. Again, Coarse Correspondence\nimproves spatial perspective-taking abilities but we highlight that MLLMs\nstruggle with this task. Together, we demonstrate that our simple prompting\nmethod can significantly aid downstream tasks that require 3D or temporal\nreasoning.",
"arxiv_id": "http://arxiv.org/abs/2408.00754v1",
"pdf_url": "http://arxiv.org/pdf/2408.00754v1",
"primary_category": "cs.CV",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "A Policy-Gradient Approach to Solving Imperfect-Information Games with Iterate Convergence",
"authors": "Mingyang Liu, Gabriele Farina, Asuman Ozdaglar",
"abstract": "Policy gradient methods have become a staple of any single-agent\nreinforcement learning toolbox, due to their combination of desirable\nproperties: iterate convergence, efficient use of stochastic trajectory\nfeedback, and theoretically-sound avoidance of importance sampling corrections.\nIn multi-agent imperfect-information settings (extensive-form games), however,\nit is still unknown whether the same desiderata can be guaranteed while\nretaining theoretical guarantees. Instead, sound methods for extensive-form\ngames rely on approximating counterfactual values (as opposed to Q values),\nwhich are incompatible with policy gradient methodologies. In this paper, we\ninvestigate whether policy gradient can be safely used in two-player zero-sum\nimperfect-information extensive-form games (EFGs). We establish positive\nresults, showing for the first time that a policy gradient method leads to\nprovable best-iterate convergence to a regularized Nash equilibrium in\nself-play.",
"arxiv_id": "http://arxiv.org/abs/2408.00751v1",
"pdf_url": "http://arxiv.org/pdf/2408.00751v1",
"primary_category": "cs.GT",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Leaf Angle Estimation using Mask R-CNN and LETR Vision Transformer",
"authors": "Venkat Margapuri, Prapti Thapaliya, Trevor Rife",
"abstract": "Modern day studies show a high degree of correlation between high yielding\ncrop varieties and plants with upright leaf angles. It is observed that plants\nwith upright leaf angles intercept more light than those without upright leaf\nangles, leading to a higher rate of photosynthesis. Plant scientists and\nbreeders benefit from tools that can directly measure plant parameters in the\nfield i.e. on-site phenotyping. The estimation of leaf angles by manual means\nin a field setting is tedious and cumbersome. We mitigate the tedium using a\ncombination of the Mask R-CNN instance segmentation neural network, and Line\nSegment Transformer (LETR), a vision transformer. The proposed Computer Vision\n(CV) pipeline is applied on two image datasets, Summer 2015-Ames ULA and Summer\n2015- Ames MLA, with a combined total of 1,827 plant images collected in the\nfield using FieldBook, an Android application aimed at on-site phenotyping. The\nleaf angles estimated by the proposed pipeline on the image datasets are\ncompared to two independent manual measurements using ImageJ, a Java-based\nimage processing program developed at the National Institutes of Health and the\nLaboratory for Optical and Computational Instrumentation. The results, when\ncompared for similarity using the Cosine Similarity measure, exhibit 0.98\nsimilarity scores on both independent measurements of Summer 2015-Ames ULA and\nSummer 2015-Ames MLA image datasets, demonstrating the feasibility of the\nproposed pipeline for on-site measurement of leaf angles.",
"arxiv_id": "http://arxiv.org/abs/2408.00749v1",
"pdf_url": "http://arxiv.org/pdf/2408.00749v1",
"primary_category": "cs.CV",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "CERT-ED: Certifiably Robust Text Classification for Edit Distance",
"authors": "Zhuoqun Huang, Neil G Marchant, Olga Ohrimenko, Benjamin I. P. Rubinstein",
"abstract": "With the growing integration of AI in daily life, ensuring the robustness of\nsystems to inference-time attacks is crucial. Among the approaches for\ncertifying robustness to such adversarial examples, randomized smoothing has\nemerged as highly promising due to its nature as a wrapper around arbitrary\nblack-box models. Previous work on randomized smoothing in natural language\nprocessing has primarily focused on specific subsets of edit distance\noperations, such as synonym substitution or word insertion, without exploring\nthe certification of all edit operations. In this paper, we adapt Randomized\nDeletion (Huang et al., 2023) and propose, CERTified Edit Distance defense\n(CERT-ED) for natural language classification. Through comprehensive\nexperiments, we demonstrate that CERT-ED outperforms the existing Hamming\ndistance method RanMASK (Zeng et al., 2023) in 4 out of 5 datasets in terms of\nboth accuracy and the cardinality of the certificate. By covering various\nthreat models, including 5 direct and 5 transfer attacks, our method improves\nempirical robustness in 38 out of 50 settings.",
"arxiv_id": "http://arxiv.org/abs/2408.00728v1",
"pdf_url": "http://arxiv.org/pdf/2408.00728v1",
"primary_category": "cs.CL",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "A Natural Language Processing Framework for Hotel Recommendation Based on Users' Text Reviews",
"authors": "Lavrentia Aravani, Emmanuel Pintelas, Christos Pierrakeas, Panagiotis Pintelas",
"abstract": "Recently, the application of Artificial Intelligence algorithms in hotel\nrecommendation systems has become an increasingly popular topic. One such\nmethod that has proven to be effective in this field is Deep Learning,\nespecially Natural Language processing models, which are able to extract\nsemantic knowledge from user's text reviews to create more efficient\nrecommendation systems. This can lead to the development of intelligent models\nthat can classify a user's preferences and emotions based on their feedback in\nthe form of text reviews about their hotel stay experience. In this study, we\npropose a Natural Language Processing framework that utilizes customer text\nreviews to provide personalized recommendations for the most appropriate hotel\nbased on their preferences. The framework is based on Bidirectional Encoder\nRepresentations from Transformers (BERT) and a fine-tuning/validation pipeline\nthat categorizes customer hotel review texts into \"Bad,\" \"Good,\" or \"Excellent\"\nrecommended hotels. Our findings indicate that the hotel recommendation system\nwe propose can significantly enhance the user experience of booking\naccommodations by providing personalized recommendations based on user\npreferences and previous booking history.",
"arxiv_id": "http://arxiv.org/abs/2408.00716v1",
"pdf_url": "http://arxiv.org/pdf/2408.00716v1",
"primary_category": "cs.LG",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "SAM 2: Segment Anything in Images and Videos",
"authors": "Nikhila Ravi, Valentin Gabeur, Yuan-Ting Hu, Ronghang Hu, Chaitanya Ryali, Tengyu Ma, Haitham Khedr, Roman R\u00e4dle, Chloe Rolland, Laura Gustafson, Eric Mintun, Junting Pan, Kalyan Vasudev Alwala, Nicolas Carion, Chao-Yuan Wu, Ross Girshick, Piotr Doll\u00e1r, Christoph Feichtenhofer",
"abstract": "We present Segment Anything Model 2 (SAM 2), a foundation model towards\nsolving promptable visual segmentation in images and videos. We build a data\nengine, which improves model and data via user interaction, to collect the\nlargest video segmentation dataset to date. Our model is a simple transformer\narchitecture with streaming memory for real-time video processing. SAM 2\ntrained on our data provides strong performance across a wide range of tasks.\nIn video segmentation, we observe better accuracy, using 3x fewer interactions\nthan prior approaches. In image segmentation, our model is more accurate and 6x\nfaster than the Segment Anything Model (SAM). We believe that our data, model,\nand insights will serve as a significant milestone for video segmentation and\nrelated perception tasks. We are releasing a version of our model, the dataset\nand an interactive demo.",
"arxiv_id": "http://arxiv.org/abs/2408.00714v1",
"pdf_url": "http://arxiv.org/pdf/2408.00714v1",
"primary_category": "cs.CV",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Insurance Portfolio Pursuit with Reinforcement Learning",
"authors": "Edward James Young, Alistair Rogers, Elliott Tong, James Jordon",
"abstract": "When faced with a new customer, many factors contribute to an insurance\nfirm's decision of what offer to make to that customer. In addition to the\nexpected cost of providing the insurance, the firm must consider the other\noffers likely to be made to the customer, and how sensitive the customer is to\ndifferences in price. Moreover, firms often target a specific portfolio of\ncustomers that could depend on, e.g., age, location, and occupation. Given such\na target portfolio, firms may choose to modulate an individual customer's offer\nbased on whether the firm desires the customer within their portfolio. Given a\ntarget portfolio, we term the problem of modulating offers to achieve this\ntarget portfolio the portfolio pursuit problem. We give a formulation of\nportfolio pursuit as a sequential decision making problem, and devise a novel\nreinforcement learning algorithm for its solution. We test our method on a\ncomplex synthetic market environment, and demonstrate that it outperforms a\nbaseline method which mimics current industry approaches to portfolio pursuit.",
"arxiv_id": "http://arxiv.org/abs/2408.00713v1",
"pdf_url": "http://arxiv.org/pdf/2408.00713v1",
"primary_category": "cs.LG",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Synthetic dual image generation for reduction of labeling efforts in semantic segmentation of micrographs with a customized metric function",
"authors": "Matias Oscar Volman Stern, Dominic Hohs, Andreas Jansche, Timo Bernthaler, Gerhard Schneider",
"abstract": "Training of semantic segmentation models for material analysis requires\nmicrographs and their corresponding masks. It is quite unlikely that perfect\nmasks will be drawn, especially at the edges of objects, and sometimes the\namount of data that can be obtained is small, since only a few samples are\navailable. These aspects make it very problematic to train a robust model. We\ndemonstrate a workflow for the improvement of semantic segmentation models of\nmicrographs through the generation of synthetic microstructural images in\nconjunction with masks. The workflow only requires joining a few micrographs\nwith their respective masks to create the input for a Vector\nQuantised-Variational AutoEncoder model that includes an embedding space, which\nis trained such that a generative model (PixelCNN) learns the distribution of\neach input, transformed into discrete codes, and can be used to sample new\ncodes. The latter will eventually be decoded by VQ-VAE to generate images\nalongside corresponding masks for semantic segmentation. To evaluate the\nsynthetic data, we have trained U-Net models with different amounts of these\nsynthetic data in conjunction with real data. These models were then evaluated\nusing non-synthetic images only. Additionally, we introduce a customized metric\nderived from the mean Intersection over Union (mIoU). The proposed metric\nprevents a few falsely predicted pixels from greatly reducing the value of the\nmIoU. We have achieved a reduction in sample preparation and acquisition times,\nas well as the efforts, needed for image processing and labeling tasks, are\nless when it comes to training semantic segmentation model. The approach could\nbe generalized to various types of image data such that it serves as a\nuser-friendly solution for training models with a small number of real images.",
"arxiv_id": "http://arxiv.org/abs/2408.00707v1",
"pdf_url": "http://arxiv.org/pdf/2408.00707v1",
"primary_category": "cs.CV",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Point-supervised Brain Tumor Segmentation with Box-prompted MedSAM",
"authors": "Xiaofeng Liu, Jonghye Woo, Chao Ma, Jinsong Ouyang, Georges El Fakhri",
"abstract": "Delineating lesions and anatomical structure is important for image-guided\ninterventions. Point-supervised medical image segmentation (PSS) has great\npotential to alleviate costly expert delineation labeling. However, due to the\nlack of precise size and boundary guidance, the effectiveness of PSS often\nfalls short of expectations. Although recent vision foundational models, such\nas the medical segment anything model (MedSAM), have made significant\nadvancements in bounding-box-prompted segmentation, it is not straightforward\nto utilize point annotation, and is prone to semantic ambiguity. In this\npreliminary study, we introduce an iterative framework to facilitate\nsemantic-aware point-supervised MedSAM. Specifically, the semantic box-prompt\ngenerator (SBPG) module has the capacity to convert the point input into\npotential pseudo bounding box suggestions, which are explicitly refined by the\nprototype-based semantic similarity. This is then succeeded by a prompt-guided\nspatial refinement (PGSR) module that harnesses the exceptional\ngeneralizability of MedSAM to infer the segmentation mask, which also updates\nthe box proposal seed in SBPG. Performance can be progressively improved with\nadequate iterations. We conducted an evaluation on BraTS2018 for the\nsegmentation of whole brain tumors and demonstrated its superior performance\ncompared to traditional PSS methods and on par with box-supervised methods.",
"arxiv_id": "http://arxiv.org/abs/2408.00706v1",
"pdf_url": "http://arxiv.org/pdf/2408.00706v1",
"primary_category": "cs.CV",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "You Can't Ignore Either: Unifying Structure and Feature Denoising for Robust Graph Learning",
"authors": "Tianmeng Yang, Jiahao Meng, Min Zhou, Yaming Yang, Yujing Wang, Xiangtai Li, Yunhai Tong",
"abstract": "Recent research on the robustness of Graph Neural Networks (GNNs) under\nnoises or attacks has attracted great attention due to its importance in\nreal-world applications. Most previous methods explore a single noise source,\nrecovering corrupt node embedding by reliable structures bias or developing\nstructure learning with reliable node features. However, the noises and attacks\nmay come from both structures and features in graphs, making the graph\ndenoising a dilemma and challenging problem. In this paper, we develop a\nunified graph denoising (UGD) framework to unravel the deadlock between\nstructure and feature denoising. Specifically, a high-order neighborhood\nproximity evaluation method is proposed to recognize noisy edges, considering\nfeatures may be perturbed simultaneously. Moreover, we propose to refine noisy\nfeatures with reconstruction based on a graph auto-encoder. An iterative\nupdating algorithm is further designed to optimize the framework and acquire a\nclean graph, thus enabling robust graph learning for downstream tasks. Our UGD\nframework is self-supervised and can be easily implemented as a plug-and-play\nmodule. We carry out extensive experiments, which proves the effectiveness and\nadvantages of our method. Code is avalaible at\nhttps://github.com/YoungTimmy/UGD.",
"arxiv_id": "http://arxiv.org/abs/2408.00700v1",
"pdf_url": "http://arxiv.org/pdf/2408.00700v1",
"primary_category": "cs.LG",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Granular-Balls based Fuzzy Twin Support Vector Machine for Classification",
"authors": "Lixi Zhao, Weiping Ding, Duoqian Miao, Guangming Lang",
"abstract": "The twin support vector machine (TWSVM) classifier has attracted increasing\nattention because of its low computational complexity. However, its performance\ntends to degrade when samples are affected by noise. The granular-ball fuzzy\nsupport vector machine (GBFSVM) classifier partly alleviates the adverse\neffects of noise, but it relies solely on the distance between the\ngranular-ball's center and the class center to design the granular-ball\nmembership function. In this paper, we first introduce the granular-ball twin\nsupport vector machine (GBTWSVM) classifier, which integrates granular-ball\ncomputing (GBC) with the twin support vector machine (TWSVM) classifier. By\nreplacing traditional point inputs with granular-balls, we demonstrate how to\nderive a pair of non-parallel hyperplanes for the GBTWSVM classifier by solving\na quadratic programming problem. Subsequently, we design the membership and\nnon-membership functions of granular-balls using Pythagorean fuzzy sets to\ndifferentiate the contributions of granular-balls in various regions.\nAdditionally, we develop the granular-ball fuzzy twin support vector machine\n(GBFTSVM) classifier by incorporating GBC with the fuzzy twin support vector\nmachine (FTSVM) classifier. We demonstrate how to derive a pair of non-parallel\nhyperplanes for the GBFTSVM classifier by solving a quadratic programming\nproblem. We also design algorithms for the GBTSVM classifier and the GBFTSVM\nclassifier. Finally, the superior classification performance of the GBTWSVM\nclassifier and the GBFTSVM classifier on 20 benchmark datasets underscores\ntheir scalability, efficiency, and robustness in tackling classification tasks.",
"arxiv_id": "http://arxiv.org/abs/2408.00699v1",
"pdf_url": "http://arxiv.org/pdf/2408.00699v1",
"primary_category": "cs.LG",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Accelerating Full Waveform Inversion By Transfer Learning",
"authors": "Divya Shyam Singh, Leon Herrmann, Qing Sun, Tim B\u00fcrchner, Felix Dietrich, Stefan Kollmannsberger",
"abstract": "Full waveform inversion (FWI) is a powerful tool for reconstructing material\nfields based on sparsely measured data obtained by wave propagation. For\nspecific problems, discretizing the material field with a neural network (NN)\nimproves the robustness and reconstruction quality of the corresponding\noptimization problem. We call this method NN-based FWI. Starting from an\ninitial guess, the weights of the NN are iteratively updated to fit the\nsimulated wave signals to the sparsely measured data set. For gradient-based\noptimization, a suitable choice of the initial guess, i.e., a suitable NN\nweight initialization, is crucial for fast and robust convergence.\n In this paper, we introduce a novel transfer learning approach to further\nimprove NN-based FWI. This approach leverages supervised pretraining to provide\na better NN weight initialization, leading to faster convergence of the\nsubsequent optimization problem. Moreover, the inversions yield physically more\nmeaningful local minima. The network is pretrained to predict the unknown\nmaterial field using the gradient information from the first iteration of\nconventional FWI. In our computational experiments on two-dimensional domains,\nthe training data set consists of reference simulations with arbitrarily\npositioned elliptical voids of different shapes and orientations. We compare\nthe performance of the proposed transfer learning NN-based FWI with three other\nmethods: conventional FWI, NN-based FWI without pretraining and conventional\nFWI with an initial guess predicted from the pretrained NN. Our results show\nthat transfer learning NN-based FWI outperforms the other methods in terms of\nconvergence speed and reconstruction quality.",
"arxiv_id": "http://arxiv.org/abs/2408.00695v1",
"pdf_url": "http://arxiv.org/pdf/2408.00695v1",
"primary_category": "cs.LG",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Alpha-VI DeepONet: A prior-robust variational Bayesian approach for enhancing DeepONets with uncertainty quantification",
"authors": "Soban Nasir Lone, Subhayan De, Rajdip Nayek",
"abstract": "We introduce a novel deep operator network (DeepONet) framework that\nincorporates generalised variational inference (GVI) using R\\'enyi's\n$\\alpha$-divergence to learn complex operators while quantifying uncertainty.\nBy incorporating Bayesian neural networks as the building blocks for the branch\nand trunk networks, our framework endows DeepONet with uncertainty\nquantification. The use of R\\'enyi's $\\alpha$-divergence, instead of the\nKullback-Leibler divergence (KLD), commonly used in standard variational\ninference, mitigates issues related to prior misspecification that are\nprevalent in Variational Bayesian DeepONets. This approach offers enhanced\nflexibility and robustness. We demonstrate that modifying the variational\nobjective function yields superior results in terms of minimising the mean\nsquared error and improving the negative log-likelihood on the test set. Our\nframework's efficacy is validated across various mechanical systems, where it\noutperforms both deterministic and standard KLD-based VI DeepONets in\npredictive accuracy and uncertainty quantification. The hyperparameter\n$\\alpha$, which controls the degree of robustness, can be tuned to optimise\nperformance for specific problems. We apply this approach to a range of\nmechanics problems, including gravity pendulum, advection-diffusion, and\ndiffusion-reaction systems. Our findings underscore the potential of\n$\\alpha$-VI DeepONet to advance the field of data-driven operator learning and\nits applications in engineering and scientific domains.",
"arxiv_id": "http://arxiv.org/abs/2408.00681v1",
"pdf_url": "http://arxiv.org/pdf/2408.00681v1",
"primary_category": "stat.ML",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "An effect analysis of the balancing techniques on the counterfactual explanations of student success prediction models",
"authors": "Mustafa Cavus, Jakub Kuzilek",
"abstract": "In the past decade, we have experienced a massive boom in the usage of\ndigital solutions in higher education. Due to this boom, large amounts of data\nhave enabled advanced data analysis methods to support learners and examine\nlearning processes. One of the dominant research directions in learning\nanalytics is predictive modeling of learners' success using various machine\nlearning methods. To build learners' and teachers' trust in such methods and\nsystems, exploring the methods and methodologies that enable relevant\nstakeholders to deeply understand the underlying machine-learning models is\nnecessary. In this context, counterfactual explanations from explainable\nmachine learning tools are promising. Several counterfactual generation methods\nhold much promise, but the features must be actionable and causal to be\neffective. Thus, obtaining which counterfactual generation method suits the\nstudent success prediction models in terms of desiderata, stability, and\nrobustness is essential. Although a few studies have been published in recent\nyears on the use of counterfactual explanations in educational sciences, they\nhave yet to discuss which counterfactual generation method is more suitable for\nthis problem. This paper analyzed the effectiveness of commonly used\ncounterfactual generation methods, such as WhatIf Counterfactual Explanations,\nMulti-Objective Counterfactual Explanations, and Nearest Instance\nCounterfactual Explanations after balancing. This contribution presents a case\nstudy using the Open University Learning Analytics dataset to demonstrate the\npractical usefulness of counterfactual explanations. The results illustrate the\nmethod's effectiveness and describe concrete steps that could be taken to alter\nthe model's prediction.",
"arxiv_id": "http://arxiv.org/abs/2408.00676v1",
"pdf_url": "http://arxiv.org/pdf/2408.00676v1",
"primary_category": "cs.LG",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "ChordSync: Conformer-Based Alignment of Chord Annotations to Music Audio",
"authors": "Andrea Poltronieri, Valentina Presutti, Mart\u00edn Rocamora",
"abstract": "In the Western music tradition, chords are the main constituent components of\nharmony, a fundamental dimension of music. Despite its relevance for several\nMusic Information Retrieval (MIR) tasks, chord-annotated audio datasets are\nlimited and need more diversity. One way to improve those resources is to\nleverage the large number of chord annotations available online, but this\nrequires aligning them with music audio. However, existing audio-to-score\nalignment techniques, which typically rely on Dynamic Time Warping (DTW), fail\nto address this challenge, as they require weakly aligned data for precise\nsynchronisation. In this paper, we introduce ChordSync, a novel conformer-based\nmodel designed to seamlessly align chord annotations with audio, eliminating\nthe need for weak alignment. We also provide a pre-trained model and a\nuser-friendly library, enabling users to synchronise chord annotations with\naudio tracks effortlessly. In this way, ChordSync creates opportunities for\nharnessing crowd-sourced chord data for MIR, especially in audio chord\nestimation, thereby facilitating the generation of novel datasets.\nAdditionally, our system extends its utility to music education, enhancing\nmusic learning experiences by providing accurately aligned annotations, thus\nenabling learners to engage in synchronised musical practices.",
"arxiv_id": "http://arxiv.org/abs/2408.00674v1",
"pdf_url": "http://arxiv.org/pdf/2408.00674v1",
"primary_category": "cs.SD",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "AutoM3L: An Automated Multimodal Machine Learning Framework with Large Language Models",
"authors": "Daqin Luo, Chengjian Feng, Yuxuan Nong, Yiqing Shen",
"abstract": "Automated Machine Learning (AutoML) offers a promising approach to streamline\nthe training of machine learning models. However, existing AutoML frameworks\nare often limited to unimodal scenarios and require extensive manual\nconfiguration. Recent advancements in Large Language Models (LLMs) have\nshowcased their exceptional abilities in reasoning, interaction, and code\ngeneration, presenting an opportunity to develop a more automated and\nuser-friendly framework. To this end, we introduce AutoM3L, an innovative\nAutomated Multimodal Machine Learning framework that leverages LLMs as\ncontrollers to automatically construct multimodal training pipelines. AutoM3L\ncomprehends data modalities and selects appropriate models based on user\nrequirements, providing automation and interactivity. By eliminating the need\nfor manual feature engineering and hyperparameter optimization, our framework\nsimplifies user engagement and enables customization through directives,\naddressing the limitations of previous rule-based AutoML approaches. We\nevaluate the performance of AutoM3L on six diverse multimodal datasets spanning\nclassification, regression, and retrieval tasks, as well as a comprehensive set\nof unimodal datasets. The results demonstrate that AutoM3L achieves competitive\nor superior performance compared to traditional rule-based AutoML methods.\nFurthermore, a user study highlights the user-friendliness and usability of our\nframework, compared to the rule-based AutoML methods.",
"arxiv_id": "http://arxiv.org/abs/2408.00665v1",
"pdf_url": "http://arxiv.org/pdf/2408.00665v1",
"primary_category": "cs.LG",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Aligning Multiple Knowledge Graphs in a Single Pass",
"authors": "Yaming Yang, Zhe Wang, Ziyu Guan, Wei Zhao, Weigang Lu, Xinyan Huang",
"abstract": "Entity alignment (EA) is to identify equivalent entities across different\nknowledge graphs (KGs), which can help fuse these KGs into a more comprehensive\none. Previous EA methods mainly focus on aligning a pair of KGs, and to the\nbest of our knowledge, no existing EA method considers aligning multiple (more\nthan two) KGs. To fill this research gap, in this work, we study a novel\nproblem of aligning multiple KGs and propose an effective framework named\nMultiEA to solve the problem. First, we embed the entities of all the candidate\nKGs into a common feature space by a shared KG encoder. Then, we explore three\nalignment strategies to minimize the distances among pre-aligned entities. In\nparticular, we propose an innovative inference enhancement technique to improve\nthe alignment performance by incorporating high-order similarities. Finally, to\nverify the effectiveness of MultiEA, we construct two new real-world benchmark\ndatasets and conduct extensive experiments on them. The results show that our\nMultiEA can effectively and efficiently align multiple KGs in a single pass.",
"arxiv_id": "http://arxiv.org/abs/2408.00662v1",
"pdf_url": "http://arxiv.org/pdf/2408.00662v1",
"primary_category": "cs.CL",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Disentangling Dense Embeddings with Sparse Autoencoders",
"authors": "Charles O'Neill, Christine Ye, Kartheik Iyer, John F. Wu",
"abstract": "Sparse autoencoders (SAEs) have shown promise in extracting interpretable\nfeatures from complex neural networks. We present one of the first applications\nof SAEs to dense text embeddings from large language models, demonstrating\ntheir effectiveness in disentangling semantic concepts. By training SAEs on\nembeddings of over 420,000 scientific paper abstracts from computer science and\nastronomy, we show that the resulting sparse representations maintain semantic\nfidelity while offering interpretability. We analyse these learned features,\nexploring their behaviour across different model capacities and introducing a\nnovel method for identifying ``feature families'' that represent related\nconcepts at varying levels of abstraction. To demonstrate the practical utility\nof our approach, we show how these interpretable features can be used to\nprecisely steer semantic search, allowing for fine-grained control over query\nsemantics. This work bridges the gap between the semantic richness of dense\nembeddings and the interpretability of sparse representations. We open source\nour embeddings, trained sparse autoencoders, and interpreted features, as well\nas a web app for exploring them.",
"arxiv_id": "http://arxiv.org/abs/2408.00657v1",
"pdf_url": "http://arxiv.org/pdf/2408.00657v1",
"primary_category": "cs.LG",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Enhancing Multistep Prediction of Multivariate Market Indices Using Weighted Optical Reservoir Computing",
"authors": "Fang Wang, Ting Bu, Yuping Huang",
"abstract": "We propose and experimentally demonstrate an innovative stock index\nprediction method using a weighted optical reservoir computing system. We\nconstruct fundamental market data combined with macroeconomic data and\ntechnical indicators to capture the broader behavior of the stock market. Our\napproach shows significant higher performance than state-of-the-art methods\nsuch as linear regression, decision trees, and neural network architectures\nincluding long short-term memory. It captures well the market's high volatility\nand nonlinear behaviors despite limited data, demonstrating great potential for\nreal-time, parallel, multi-dimensional data processing and predictions.",
"arxiv_id": "http://arxiv.org/abs/2408.00652v1",
"pdf_url": "http://arxiv.org/pdf/2408.00652v1",
"primary_category": "cs.LG",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Enhancing Ethereum Fraud Detection via Generative and Contrastive Self-supervision",
"authors": "Chenxiang Jin, Jiajun Zhou, Chenxuan Xie, Shanqing Yu, Qi Xuan, Xiaoniu Yang",
"abstract": "The rampant fraudulent activities on Ethereum hinder the healthy development\nof the blockchain ecosystem, necessitating the reinforcement of regulations.\nHowever, multiple imbalances involving account interaction frequencies and\ninteraction types in the Ethereum transaction environment pose significant\nchallenges to data mining-based fraud detection research. To address this, we\nfirst propose the concept of meta-interactions to refine interaction behaviors\nin Ethereum, and based on this, we present a dual self-supervision enhanced\nEthereum fraud detection framework, named Meta-IFD. This framework initially\nintroduces a generative self-supervision mechanism to augment the interaction\nfeatures of accounts, followed by a contrastive self-supervision mechanism to\ndifferentiate various behavior patterns, and ultimately characterizes the\nbehavioral representations of accounts and mines potential fraud risks through\nmulti-view interaction feature learning. Extensive experiments on real Ethereum\ndatasets demonstrate the effectiveness and superiority of our framework in\ndetecting common Ethereum fraud behaviors such as Ponzi schemes and phishing\nscams. Additionally, the generative module can effectively alleviate the\ninteraction distribution imbalance in Ethereum data, while the contrastive\nmodule significantly enhances the framework's ability to distinguish different\nbehavior patterns. The source code will be released on GitHub soon.",
"arxiv_id": "http://arxiv.org/abs/2408.00641v1",
"pdf_url": "http://arxiv.org/pdf/2408.00641v1",
"primary_category": "cs.LG",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Privacy-preserving datasets by capturing feature distributions with Conditional VAEs",
"authors": "Francesco Di Salvo, David Tafler, Sebastian Doerrich, Christian Ledig",
"abstract": "Large and well-annotated datasets are essential for advancing deep learning\napplications, however often costly or impossible to obtain by a single entity.\nIn many areas, including the medical domain, approaches relying on data sharing\nhave become critical to address those challenges. While effective in increasing\ndataset size and diversity, data sharing raises significant privacy concerns.\nCommonly employed anonymization methods based on the k-anonymity paradigm often\nfail to preserve data diversity, affecting model robustness. This work\nintroduces a novel approach using Conditional Variational Autoencoders (CVAEs)\ntrained on feature vectors extracted from large pre-trained vision foundation\nmodels. Foundation models effectively detect and represent complex patterns\nacross diverse domains, allowing the CVAE to faithfully capture the embedding\nspace of a given data distribution to generate (sample) a diverse,\nprivacy-respecting, and potentially unbounded set of synthetic feature vectors.\nOur method notably outperforms traditional approaches in both medical and\nnatural image domains, exhibiting greater dataset diversity and higher\nrobustness against perturbations while preserving sample privacy. These results\nunderscore the potential of generative models to significantly impact deep\nlearning applications in data-scarce and privacy-sensitive environments. The\nsource code is available at\nhttps://github.com/francescodisalvo05/cvae-anonymization .",
"arxiv_id": "http://arxiv.org/abs/2408.00639v1",
"pdf_url": "http://arxiv.org/pdf/2408.00639v1",
"primary_category": "cs.LG",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Unlocking Fair Use in the Generative AI Supply Chain: A Systematized Literature Review",
"authors": "Amruta Mahuli, Asia Biega",
"abstract": "Through a systematization of generative AI (GenAI) stakeholder goals and\nexpectations, this work seeks to uncover what value different stakeholders see\nin their contributions to the GenAI supply line. This valuation enables us to\nunderstand whether fair use advocated by GenAI companies to train model\nprogresses the copyright law objective of promoting science and arts. While\nassessing the validity and efficacy of the fair use argument, we uncover\nresearch gaps and potential avenues for future works for researchers and\npolicymakers to address.",
"arxiv_id": "http://arxiv.org/abs/2408.00613v1",
"pdf_url": "http://arxiv.org/pdf/2408.00613v1",
"primary_category": "cs.AI",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Using CSNNs to Perform Event-based Data Processing & Classification on ASL-DVS",
"authors": "Ria Patel, Sujit Tripathy, Zachary Sublett, Seoyoung An, Riya Patel",
"abstract": "Recent advancements in bio-inspired visual sensing and neuromorphic computing\nhave led to the development of various highly efficient bio-inspired solutions\nwith real-world applications. One notable application integrates event-based\ncameras with spiking neural networks (SNNs) to process event-based sequences\nthat are asynchronous and sparse, making them difficult to handle. In this\nproject, we develop a convolutional spiking neural network (CSNN) architecture\nthat leverages convolutional operations and recurrent properties of a spiking\nneuron to learn the spatial and temporal relations in the ASL-DVS gesture\ndataset. The ASL-DVS gesture dataset is a neuromorphic dataset containing hand\ngestures when displaying 24 letters (A to Y, excluding J and Z due to the\nnature of their symbols) from the American Sign Language (ASL). We performed\nclassification on a pre-processed subset of the full ASL-DVS dataset to\nidentify letter signs and achieved 100\\% training accuracy. Specifically, this\nwas achieved by training in the Google Cloud compute platform while using a\nlearning rate of 0.0005, batch size of 25 (total of 20 batches), 200\niterations, and 10 epochs.",
"arxiv_id": "http://arxiv.org/abs/2408.00611v1",
"pdf_url": "http://arxiv.org/pdf/2408.00611v1",
"primary_category": "cs.NE",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "AutoPV: Automatically Design Your Photovoltaic Power Forecasting Model",
"authors": "Dayin Chen, Xiaodan Shi, Mingkun Jiang, Haoran Zhang, Dongxiao Zhang, Yuntian Chen, Jinyue Yan",
"abstract": "Photovoltaic power forecasting (PVPF) is a critical area in time series\nforecasting (TSF), enabling the efficient utilization of solar energy. With\nadvancements in machine learning and deep learning, various models have been\napplied to PVPF tasks. However, constructing an optimal predictive architecture\nfor specific PVPF tasks remains challenging, as it requires cross-domain\nknowledge and significant labor costs. To address this challenge, we introduce\nAutoPV, a novel framework for the automated search and construction of PVPF\nmodels based on neural architecture search (NAS) technology. We develop a brand\nnew NAS search space that incorporates various data processing techniques from\nstate-of-the-art (SOTA) TSF models and typical PVPF deep learning models. The\neffectiveness of AutoPV is evaluated on diverse PVPF tasks using a dataset from\nthe Daqing Photovoltaic Station in China. Experimental results demonstrate that\nAutoPV can complete the predictive architecture construction process in a\nrelatively short time, and the newly constructed architecture is superior to\nSOTA predefined models. This work bridges the gap in applying NAS to TSF\nproblems, assisting non-experts and industries in automatically designing\neffective PVPF models.",
"arxiv_id": "http://arxiv.org/abs/2408.00601v1",
"pdf_url": "http://arxiv.org/pdf/2408.00601v1",
"primary_category": "cs.LG",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Convergence Analysis of Natural Gradient Descent for Over-parameterized Physics-Informed Neural Networks",
"authors": "Xianliang Xu, Ting Du, Wang Kong, Ye Li, Zhongyi Huang",
"abstract": "First-order methods, such as gradient descent (GD) and stochastic gradient\ndescent (SGD) have been proven effective in training neural networks. In the\nsetting of over-parameterization, there is a line of work demonstrating that\nrandomly initialized (stochastic) gradient descent converges to a globally\noptimal solution at a linear convergence rate for the quadratic loss function.\nHowever, the learning rate of GD in training two-layer neural networks has a\npoor dependence on the sample size and the Gram matrix, resulting in a slow\ntraining process. In this paper, we show that for the $L^2$ regression\nproblems, the learning rate can be improved from $\\mathcal{O}(\\lambda_0/n^2)$\nto $\\mathcal{O}(1/\\|\\bm{H}^{\\infty}\\|_2)$, which implies that GD enjoys a\nfaster convergence rate. Moreover, we further generalize the method for GD in\ntraining two-layer Physics-Informed Neural Networks (PINNs), showing a similar\nimprovement for the learning rate. Although the improved learning rate depends\nmildly on the Gram matrix, we still need to set it small enough in practice due\nto the agnostic eigenvalues of the Gram matrix. More importantly, the\nconvergence rate relies on the least eigenvalue of the Gram matrix, leading to\nslow convergence. In this work, we provide the convergence analysis of natural\ngradient descent (NGD) in training two-layer PINNs. We show that the learning\nrate can be $\\mathcal{O}(1)$ and at this time, the convergence rate is\nindependent of the Gram matrix.",
"arxiv_id": "http://arxiv.org/abs/2408.00573v1",
"pdf_url": "http://arxiv.org/pdf/2408.00573v1",
"primary_category": "cs.LG",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Analyzing the Effectiveness of Quantum Annealing with Meta-Learning",
"authors": "Riccardo Pellini, Maurizio Ferrari Dacrema",
"abstract": "The field of Quantum Computing has gathered significant popularity in recent\nyears and a large number of papers have studied its effectiveness in tackling\nmany tasks. We focus in particular on Quantum Annealing (QA), a meta-heuristic\nsolver for Quadratic Unconstrained Binary Optimization (QUBO) problems. It is\nknown that the effectiveness of QA is dependent on the task itself, as is the\ncase for classical solvers, but there is not yet a clear understanding of which\nare the characteristics of a problem that makes it difficult to solve with QA.\nIn this work, we propose a new methodology to study the effectiveness of QA\nbased on meta-learning models. To do so, we first build a dataset composed of\nmore than five thousand instances of ten different optimization problems. We\ndefine a set of more than a hundred features to describe their characteristics,\nand solve them with both QA and three classical solvers. We publish this\ndataset online for future research. Then, we train multiple meta-models to\npredict whether QA would solve that instance effectively and use them to probe\nwhich are the features with the strongest impact on the effectiveness of QA.\nOur results indicate that it is possible to accurately predict the\neffectiveness of QA, validating our methodology. Furthermore, we observe that\nthe distribution of the problem coefficients representing the bias and coupling\nterms is very informative to identify the probability of finding good\nsolutions, while the density of these coefficients alone is not enough. The\nmethodology we propose allows to open new research directions to further our\nunderstanding of the effectiveness of QA, by probing specific dimensions or by\ndeveloping new QUBO formulations that are better suited for the particular\nnature of QA. Furthermore, the proposed methodology is flexible and can be\nextended or used to study other quantum or classical solvers.",
"arxiv_id": "http://arxiv.org/abs/2408.00570v1",
"pdf_url": "http://arxiv.org/pdf/2408.00570v1",
"primary_category": "quant-ph",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Learning to Embed Distributions via Maximum Kernel Entropy",
"authors": "Oleksii Kachaiev, Stefano Recanatesi",
"abstract": "Empirical data can often be considered as samples from a set of probability\ndistributions. Kernel methods have emerged as a natural approach for learning\nto classify these distributions. Although numerous kernels between\ndistributions have been proposed, applying kernel methods to distribution\nregression tasks remains challenging, primarily because selecting a suitable\nkernel is not straightforward. Surprisingly, the question of learning a\ndata-dependent distribution kernel has received little attention. In this\npaper, we propose a novel objective for the unsupervised learning of\ndata-dependent distribution kernel, based on the principle of entropy\nmaximization in the space of probability measure embeddings. We examine the\ntheoretical properties of the latent embedding space induced by our objective,\ndemonstrating that its geometric structure is well-suited for solving\ndownstream discriminative tasks. Finally, we demonstrate the performance of the\nlearned kernel across different modalities.",
"arxiv_id": "http://arxiv.org/abs/2408.00549v1",
"pdf_url": "http://arxiv.org/pdf/2408.00549v1",
"primary_category": "cs.LG",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "The Energy Cost of Artificial Intelligence of Things Lifecycle",
"authors": "Shih-Kai Chou, Jernej Hribar, Mihael Mohor\u010di\u010d, Carolina Fortuna",
"abstract": "Artificial intelligence (AI)coupled with existing Internet of Things (IoT)\nenables more streamlined and autonomous operations across various economic\nsectors. Consequently, the paradigm of Artificial Intelligence of Things (AIoT)\nhaving AI techniques at its core implies additional energy and carbon costs\nthat may become significant with more complex neural architectures. To better\nunderstand the energy and Carbon Footprint (CF) of some AIoT components, very\nrecent studies employ conventional metrics. However, these metrics are not\ndesigned to capture energy efficiency aspects of inference. In this paper, we\npropose a new metric, the Energy Cost of AIoT Lifecycle (eCAL) to capture the\noverall energy cost of inference over the lifecycle of an AIoT system. We\ndevise a new methodology for determining eCAL of an AIoT system by analyzing\nthe complexity of data manipulation in individual components involved in the\nAIoT lifecycle and derive the overall and per bit energy consumption. With eCAL\nwe show that the better a model is and the more it is used, the more energy\nefficient an inference is. For an example AIoT configuration, eCAL for making\n$100$ inferences is $1.43$ times higher than for $1000$ inferences. We also\nevaluate the CF of the AIoT system by calculating the equivalent CO$_{2}$\nemissions based on the energy consumption and the Carbon Intensity (CI) across\ndifferent countries. Using 2023 renewable data, our analysis reveals that\ndeploying an AIoT system in Germany results in emitting $4.62$ times higher\nCO$_2$ than in Finland, due to latter using more low-CI energy sources.",
"arxiv_id": "http://arxiv.org/abs/2408.00540v1",
"pdf_url": "http://arxiv.org/pdf/2408.00540v1",
"primary_category": "cs.ET",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "ReSi: A Comprehensive Benchmark for Representational Similarity Measures",
"authors": "Max Klabunde, Tassilo Wald, Tobias Schumacher, Klaus Maier-Hein, Markus Strohmaier, Florian Lemmerich",
"abstract": "Measuring the similarity of different representations of neural architectures\nis a fundamental task and an open research challenge for the machine learning\ncommunity. This paper presents the first comprehensive benchmark for evaluating\nrepresentational similarity measures based on well-defined groundings of\nsimilarity. The representational similarity (ReSi) benchmark consists of (i)\nsix carefully designed tests for similarity measures, (ii) 23 similarity\nmeasures, (iii) eleven neural network architectures, and (iv) six datasets,\nspanning over the graph, language, and vision domains. The benchmark opens up\nseveral important avenues of research on representational similarity that\nenable novel explorations and applications of neural architectures. We\ndemonstrate the utility of the ReSi benchmark by conducting experiments on\nvarious neural network architectures, real world datasets and similarity\nmeasures. All components of the benchmark are publicly available and thereby\nfacilitate systematic reproduction and production of research results. The\nbenchmark is extensible, future research can build on and further expand it. We\nbelieve that the ReSi benchmark can serve as a sound platform catalyzing future\nresearch that aims to systematically evaluate existing and explore novel ways\nof comparing representations of neural architectures.",
"arxiv_id": "http://arxiv.org/abs/2408.00531v1",
"pdf_url": "http://arxiv.org/pdf/2408.00531v1",
"primary_category": "cs.LG",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Contrastive Learning with Dynamic Localized Repulsion for Brain Age Prediction on 3D Stiffness Maps",
"authors": "Jakob Tr\u00e4uble, Lucy Hiscox, Curtis Johnson, Carola-Bibiane Sch\u00f6nlieb, Gabriele Kaminski Schierle, Angelica Aviles-Rivero",
"abstract": "In the field of neuroimaging, accurate brain age prediction is pivotal for\nuncovering the complexities of brain aging and pinpointing early indicators of\nneurodegenerative conditions. Recent advancements in self-supervised learning,\nparticularly in contrastive learning, have demonstrated greater robustness when\ndealing with complex datasets. However, current approaches often fall short in\ngeneralizing across non-uniformly distributed data, prevalent in medical\nimaging scenarios. To bridge this gap, we introduce a novel contrastive loss\nthat adapts dynamically during the training process, focusing on the localized\nneighborhoods of samples. Moreover, we expand beyond traditional structural\nfeatures by incorporating brain stiffness, a mechanical property previously\nunderexplored yet promising due to its sensitivity to age-related changes. This\nwork presents the first application of self-supervised learning to brain\nmechanical properties, using compiled stiffness maps from various clinical\nstudies to predict brain age. Our approach, featuring dynamic localized loss,\nconsistently outperforms existing state-of-the-art methods, demonstrating\nsuperior performance and laying the way for new directions in brain aging\nresearch.",
"arxiv_id": "http://arxiv.org/abs/2408.00527v1",
"pdf_url": "http://arxiv.org/pdf/2408.00527v1",
"primary_category": "cs.LG",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Hilbert curves for efficient exploratory landscape analysis neighbourhood sampling",
"authors": "Johannes J. Pienaar, Anna S. Bosman, Katherine M. Malan",
"abstract": "Landscape analysis aims to characterise optimisation problems based on their\nobjective (or fitness) function landscape properties. The problem search space\nis typically sampled, and various landscape features are estimated based on the\nsamples. One particularly salient set of features is information content, which\nrequires the samples to be sequences of neighbouring solutions, such that the\nlocal relationships between consecutive sample points are preserved. Generating\nsuch spatially correlated samples that also provide good search space coverage\nis challenging. It is therefore common to first obtain an unordered sample with\ngood search space coverage, and then apply an ordering algorithm such as the\nnearest neighbour to minimise the distance between consecutive points in the\nsample. However, the nearest neighbour algorithm becomes computationally\nprohibitive in higher dimensions, thus there is a need for more efficient\nalternatives. In this study, Hilbert space-filling curves are proposed as a\nmethod to efficiently obtain high-quality ordered samples. Hilbert curves are a\nspecial case of fractal curves, and guarantee uniform coverage of a bounded\nsearch space while providing a spatially correlated sample. We study the\neffectiveness of Hilbert curves as samplers, and discover that they are capable\nof extracting salient features at a fraction of the computational cost compared\nto Latin hypercube sampling with post-factum ordering. Further, we investigate\nthe use of Hilbert curves as an ordering strategy, and find that they order the\nsample significantly faster than the nearest neighbour ordering, without\nsacrificing the saliency of the extracted features.",
"arxiv_id": "http://arxiv.org/abs/2408.00526v1",
"pdf_url": "http://arxiv.org/pdf/2408.00526v1",
"primary_category": "cs.LG",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Identifying the Hierarchical Emotional Areas in the Human Brain Through Information Fusion",
"authors": "Zhongyu Huang, Changde Du, Chaozhuo Li, Kaicheng Fu, Huiguang He",
"abstract": "The brain basis of emotion has consistently received widespread attention,\nattracting a large number of studies to explore this cutting-edge topic.\nHowever, the methods employed in these studies typically only model the\npairwise relationship between two brain regions, while neglecting the\ninteractions and information fusion among multiple brain\nregions$\\unicode{x2014}$one of the key ideas of the psychological\nconstructionist hypothesis. To overcome the limitations of traditional methods,\nthis study provides an in-depth theoretical analysis of how to maximize\ninteractions and information fusion among brain regions. Building on the\nresults of this analysis, we propose to identify the hierarchical emotional\nareas in the human brain through multi-source information fusion and graph\nmachine learning methods. Comprehensive experiments reveal that the identified\nhierarchical emotional areas, from lower to higher levels, primarily facilitate\nthe fundamental process of emotion perception, the construction of basic\npsychological operations, and the coordination and integration of these\noperations. Overall, our findings provide unique insights into the brain\nmechanisms underlying specific emotions based on the psychological\nconstructionist hypothesis.",
"arxiv_id": "http://arxiv.org/abs/2408.00525v1",
"pdf_url": "http://arxiv.org/pdf/2408.00525v1",
"primary_category": "cs.HC",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Jailbreaking Text-to-Image Models with LLM-Based Agents",
"authors": "Yingkai Dong, Zheng Li, Xiangtao Meng, Ning Yu, Shanqing Guo",
"abstract": "Recent advancements have significantly improved automated task-solving\ncapabilities using autonomous agents powered by large language models (LLMs).\nHowever, most LLM-based agents focus on dialogue, programming, or specialized\ndomains, leaving gaps in addressing generative AI safety tasks. These gaps are\nprimarily due to the challenges posed by LLM hallucinations and the lack of\nclear guidelines. In this paper, we propose Atlas, an advanced LLM-based\nmulti-agent framework that integrates an efficient fuzzing workflow to target\ngenerative AI models, specifically focusing on jailbreak attacks against\ntext-to-image (T2I) models with safety filters. Atlas utilizes a\nvision-language model (VLM) to assess whether a prompt triggers the T2I model's\nsafety filter. It then iteratively collaborates with both LLM and VLM to\ngenerate an alternative prompt that bypasses the filter. Atlas also enhances\nthe reasoning abilities of LLMs in attack scenarios by leveraging multi-agent\ncommunication, in-context learning (ICL) memory mechanisms, and the\nchain-of-thought (COT) approach. Our evaluation demonstrates that Atlas\nsuccessfully jailbreaks several state-of-the-art T2I models in a black-box\nsetting, which are equipped with multi-modal safety filters. In addition, Atlas\noutperforms existing methods in both query efficiency and the quality of the\ngenerated images.",
"arxiv_id": "http://arxiv.org/abs/2408.00523v1",
"pdf_url": "http://arxiv.org/pdf/2408.00523v1",
"primary_category": "cs.CR",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Low-Power Vibration-Based Predictive Maintenance for Industry 4.0 using Neural Networks: A Survey",
"authors": "Alexandru Vasilache, Sven Nitzsche, Daniel Floegel, Tobias Schuermann, Stefan von Dosky, Thomas Bierweiler, Marvin Mu\u00dfler, Florian K\u00e4lber, Soeren Hohmann, Juergen Becker",
"abstract": "The advancements in smart sensors for Industry 4.0 offer ample opportunities\nfor low-powered predictive maintenance and condition monitoring. However,\ntraditional approaches in this field rely on processing in the cloud, which\nincurs high costs in energy and storage. This paper investigates the potential\nof neural networks for low-power on-device computation of vibration sensor data\nfor predictive maintenance. We review the literature on Spiking Neural Networks\n(SNNs) and Artificial Neuronal Networks (ANNs) for vibration-based predictive\nmaintenance by analyzing datasets, data preprocessing, network architectures,\nand hardware implementations. Our findings suggest that no satisfactory\nstandard benchmark dataset exists for evaluating neural networks in predictive\nmaintenance tasks. Furthermore frequency domain transformations are commonly\nemployed for preprocessing. SNNs mainly use shallow feed forward architectures,\nwhereas ANNs explore a wider range of models and deeper networks. Finally, we\nhighlight the need for future research on hardware implementations of neural\nnetworks for low-power predictive maintenance applications and the development\nof a standardized benchmark dataset.",
"arxiv_id": "http://arxiv.org/abs/2408.00516v1",
"pdf_url": "http://arxiv.org/pdf/2408.00516v1",
"primary_category": "cs.LG",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "VecAug: Unveiling Camouflaged Frauds with Cohort Augmentation for Enhanced Detection",
"authors": "Fei Xiao, Shaofeng Cai, Gang Chen, H. V. Jagadish, Beng Chin Ooi, Meihui Zhang",
"abstract": "Fraud detection presents a challenging task characterized by ever-evolving\nfraud patterns and scarce labeled data. Existing methods predominantly rely on\ngraph-based or sequence-based approaches. While graph-based approaches connect\nusers through shared entities to capture structural information, they remain\nvulnerable to fraudsters who can disrupt or manipulate these connections. In\ncontrast, sequence-based approaches analyze users' behavioral patterns,\noffering robustness against tampering but overlooking the interactions between\nsimilar users. Inspired by cohort analysis in retention and healthcare, this\npaper introduces VecAug, a novel cohort-augmented learning framework that\naddresses these challenges by enhancing the representation learning of target\nusers with personalized cohort information. To this end, we first propose a\nvector burn-in technique for automatic cohort identification, which retrieves a\ntask-specific cohort for each target user. Then, to fully exploit the cohort\ninformation, we introduce an attentive cohort aggregation technique for\naugmenting target user representations. To improve the robustness of such\ncohort augmentation, we also propose a novel label-aware cohort neighbor\nseparation mechanism to distance negative cohort neighbors and calibrate the\naggregated cohort information. By integrating this cohort information with\ntarget user representations, VecAug enhances the modeling capacity and\ngeneralization capabilities of the model to be augmented. Our framework is\nflexible and can be seamlessly integrated with existing fraud detection models.\nWe deploy our framework on e-commerce platforms and evaluate it on three fraud\ndetection datasets, and results show that VecAug improves the detection\nperformance of base models by up to 2.48\\% in AUC and 22.5\\% in R@P$_{0.9}$,\noutperforming state-of-the-art methods significantly.",
"arxiv_id": "http://arxiv.org/abs/2408.00513v1",
"pdf_url": "http://arxiv.org/pdf/2408.00513v1",
"primary_category": "cs.LG",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Block-Operations: Using Modular Routing to Improve Compositional Generalization",
"authors": "Florian Dietz, Dietrich Klakow",
"abstract": "We explore the hypothesis that poor compositional generalization in neural\nnetworks is caused by difficulties with learning effective routing. To solve\nthis problem, we propose the concept of block-operations, which is based on\nsplitting all activation tensors in the network into uniformly sized blocks and\nusing an inductive bias to encourage modular routing and modification of these\nblocks. Based on this concept we introduce the Multiplexer, a new architectural\ncomponent that enhances the Feed Forward Neural Network (FNN). We\nexperimentally confirm that Multiplexers exhibit strong compositional\ngeneralization. On both a synthetic and a realistic task our model was able to\nlearn the underlying process behind the task, whereas both FNNs and\nTransformers were only able to learn heuristic approximations. We propose as\nfuture work to use the principles of block-operations to improve other existing\narchitectures.",
"arxiv_id": "http://arxiv.org/abs/2408.00508v1",
"pdf_url": "http://arxiv.org/pdf/2408.00508v1",
"primary_category": "cs.LG",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Graph Representation Learning via Causal Diffusion for Out-of-Distribution Recommendation",
"authors": "Chu Zhao, Enneng Yang, Yuliang Liang, Pengxiang Lan, Yuting Liu, Jianzhe Zhao, Guibing Guo, Xingwei Wang",
"abstract": "Graph Neural Networks (GNNs)-based recommendation algorithms typically assume\nthat training and testing data are drawn from independent and identically\ndistributed (IID) spaces. However, this assumption often fails in the presence\nof out-of-distribution (OOD) data, resulting in significant performance\ndegradation. In this study, we construct a Structural Causal Model (SCM) to\nanalyze interaction data, revealing that environmental confounders (e.g., the\nCOVID-19 pandemic) lead to unstable correlations in GNN-based models, thus\nimpairing their generalization to OOD data. To address this issue, we propose a\nnovel approach, graph representation learning via causal diffusion\n(CausalDiffRec) for OOD recommendation. This method enhances the model's\ngeneralization on OOD data by eliminating environmental confounding factors and\nlearning invariant graph representations. Specifically, we use backdoor\nadjustment and variational inference to infer the real environmental\ndistribution, thereby eliminating the impact of environmental confounders. This\ninferred distribution is then used as prior knowledge to guide the\nrepresentation learning in the reverse phase of the diffusion process to learn\nthe invariant representation. In addition, we provide a theoretical derivation\nthat proves optimizing the objective function of CausalDiffRec can encourage\nthe model to learn environment-invariant graph representations, thereby\nachieving excellent generalization performance in recommendations under\ndistribution shifts. Our extensive experiments validate the effectiveness of\nCausalDiffRec in improving the generalization of OOD data, and the average\nimprovement is up to 10.69% on Food, 18.83% on KuaiRec, 22.41% on Yelp2018, and\n11.65% on Douban datasets.",
"arxiv_id": "http://arxiv.org/abs/2408.00490v1",
"pdf_url": "http://arxiv.org/pdf/2408.00490v1",
"primary_category": "cs.LG",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "A Systematic Review on Long-Tailed Learning",
"authors": "Chongsheng Zhang, George Almpanidis, Gaojuan Fan, Binquan Deng, Yanbo Zhang, Ji Liu, Aouaidjia Kamel, Paolo Soda, Jo\u00e3o Gama",
"abstract": "Long-tailed data is a special type of multi-class imbalanced data with a very\nlarge amount of minority/tail classes that have a very significant combined\ninfluence. Long-tailed learning aims to build high-performance models on\ndatasets with long-tailed distributions, which can identify all the classes\nwith high accuracy, in particular the minority/tail classes. It is a\ncutting-edge research direction that has attracted a remarkable amount of\nresearch effort in the past few years. In this paper, we present a\ncomprehensive survey of latest advances in long-tailed visual learning. We\nfirst propose a new taxonomy for long-tailed learning, which consists of eight\ndifferent dimensions, including data balancing, neural architecture, feature\nenrichment, logits adjustment, loss function, bells and whistles, network\noptimization, and post hoc processing techniques. Based on our proposed\ntaxonomy, we present a systematic review of long-tailed learning methods,\ndiscussing their commonalities and alignable differences. We also analyze the\ndifferences between imbalance learning and long-tailed learning approaches.\nFinally, we discuss prospects and future directions in this field.",
"arxiv_id": "http://arxiv.org/abs/2408.00483v1",
"pdf_url": "http://arxiv.org/pdf/2408.00483v1",
"primary_category": "cs.LG",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Infrequent Resolving Algorithm for Online Linear Programming",
"authors": "Guokai Li, Zizhuo Wang, Jingwei Zhang",
"abstract": "Online linear programming (OLP) has gained significant attention from both\nresearchers and practitioners due to its extensive applications, such as online\nauction, network revenue management and advertising. Existing OLP algorithms\nfall into two categories: LP-based algorithms and LP-free algorithms. The\nformer one typically guarantees better performance, even offering a constant\nregret, but requires solving a large number of LPs, which could be\ncomputationally expensive. In contrast, LP-free algorithm only requires\nfirst-order computations but induces a worse performance, lacking a constant\nregret bound. In this work, we bridge the gap between these two extremes by\nproposing an algorithm that achieves a constant regret while solving LPs only\n$O(\\log\\log T)$ times over the time horizon $T$. Moreover, when we are allowed\nto solve LPs only $M$ times, we propose an algorithm that can guarantee an\n$O\\left(T^{(1/2+\\epsilon)^{M-1}}\\right)$ regret. Furthermore, when the arrival\nprobabilities are known at the beginning, our algorithm can guarantee a\nconstant regret by solving LPs $O(\\log\\log T)$ times, and an\n$O\\left(T^{(1/2+\\epsilon)^{M}}\\right)$ regret by solving LPs only $M$ times.\nNumerical experiments are conducted to demonstrate the efficiency of the\nproposed algorithms.",
"arxiv_id": "http://arxiv.org/abs/2408.00465v1",
"pdf_url": "http://arxiv.org/pdf/2408.00465v1",
"primary_category": "cs.DS",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Designing Efficient LLM Accelerators for Edge Devices",
"authors": "Jude Haris, Rappy Saha, Wenhao Hu, Jos\u00e9 Cano",
"abstract": "The increase in open-source availability of Large Language Models (LLMs) has\nenabled users to deploy them on more and more resource-constrained edge devices\nto reduce reliance on network connections and provide more privacy. However,\nthe high computation and memory demands of LLMs make their execution on\nresource-constrained edge devices challenging and inefficient. To address this\nissue, designing new and efficient edge accelerators for LLM inference is\ncrucial. FPGA-based accelerators are ideal for LLM acceleration due to their\nreconfigurability, as they enable model-specific optimizations and higher\nperformance per watt. However, creating and integrating FPGA-based accelerators\nfor LLMs (particularly on edge devices) has proven challenging, mainly due to\nthe limited hardware design flows for LLMs in existing FPGA platforms.\n To tackle this issue, in this paper we first propose a new design platform,\nnamed SECDA-LLM, that utilizes the SECDA methodology to streamline the process\nof designing, integrating, and deploying efficient FPGA-based LLM accelerators\nfor the llama.cpp inference framework. We then demonstrate, through a case\nstudy, the potential benefits of SECDA-LLM by creating a new MatMul accelerator\nthat supports block floating point quantized operations for LLMs. Our initial\naccelerator design, deployed on the PYNQ-Z1 board, reduces latency 1.7 seconds\nper token or ~2 seconds per word) by 11x over the dual-core Arm NEON-based CPU\nexecution for the TinyLlama model.",
"arxiv_id": "http://arxiv.org/abs/2408.00462v1",
"pdf_url": "http://arxiv.org/pdf/2408.00462v1",
"primary_category": "cs.AR",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Reenact Anything: Semantic Video Motion Transfer Using Motion-Textual Inversion",
"authors": "Manuel Kansy, Jacek Naruniec, Christopher Schroers, Markus Gross, Romann M. Weber",
"abstract": "Recent years have seen a tremendous improvement in the quality of video\ngeneration and editing approaches. While several techniques focus on editing\nappearance, few address motion. Current approaches using text, trajectories, or\nbounding boxes are limited to simple motions, so we specify motions with a\nsingle motion reference video instead. We further propose to use a pre-trained\nimage-to-video model rather than a text-to-video model. This approach allows us\nto preserve the exact appearance and position of a target object or scene and\nhelps disentangle appearance from motion. Our method, called motion-textual\ninversion, leverages our observation that image-to-video models extract\nappearance mainly from the (latent) image input, while the text/image embedding\ninjected via cross-attention predominantly controls motion. We thus represent\nmotion using text/image embedding tokens. By operating on an inflated\nmotion-text embedding containing multiple text/image embedding tokens per\nframe, we achieve a high temporal motion granularity. Once optimized on the\nmotion reference video, this embedding can be applied to various target images\nto generate videos with semantically similar motions. Our approach does not\nrequire spatial alignment between the motion reference video and target image,\ngeneralizes across various domains, and can be applied to various tasks such as\nfull-body and face reenactment, as well as controlling the motion of inanimate\nobjects and the camera. We empirically demonstrate the effectiveness of our\nmethod in the semantic video motion transfer task, significantly outperforming\nexisting methods in this context.",
"arxiv_id": "http://arxiv.org/abs/2408.00458v1",
"pdf_url": "http://arxiv.org/pdf/2408.00458v1",
"primary_category": "cs.CV",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Rapid and Power-Aware Learned Optimization for Modular Receive Beamforming",
"authors": "Ohad Levy, Nir Shlezinger",
"abstract": "Multiple-input multiple-output (MIMO) systems play a key role in wireless\ncommunication technologies. A widely considered approach to realize scalable\nMIMO systems involves architectures comprised of multiple separate modules,\neach with its own beamforming capability. Such models accommodate cell-free\nmassive MIMO and partially connected hybrid MIMO architectures. A core issue\nwith the implementation of modular MIMO arises from the need to rapidly set the\nbeampatterns of the modules, while maintaining their power efficiency. This\nleads to challenging constrained optimization that should be repeatedly solved\non each coherence duration. In this work, we propose a power-oriented\noptimization algorithm for beamforming in uplink modular hybrid MIMO systems,\nwhich learns from data to operate rapidly. We derive our learned optimizer by\ntackling the rate maximization objective using projected gradient ascent steps\nwith momentum. We then leverage data to tune the hyperparameters of the\noptimizer, allowing it to operate reliably in a fixed and small number of\niterations while completely preserving its interpretable operation. We show how\npower efficient beamforming can be encouraged by the learned optimizer, via\nboosting architectures with low-resolution phase shifts and with deactivated\nanalog components. Numerical results show that our learn-to-optimize method\nnotably reduces the number of iterations and computation latency required to\nreliably tune modular MIMO receivers, and that it allows obtaining desirable\nbalances between power efficient designs and throughput.",
"arxiv_id": "http://arxiv.org/abs/2408.00439v1",
"pdf_url": "http://arxiv.org/pdf/2408.00439v1",
"primary_category": "eess.SP",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Efficient Patient Fine-Tuned Seizure Detection with a Tensor Kernel Machine",
"authors": "Seline J. S. de Rooij, Frederiek Wesel, Borb\u00e1la Hunyadi",
"abstract": "Recent developments in wearable devices have made accurate and efficient\nseizure detection more important than ever. A challenge in seizure detection is\nthat patient-specific models typically outperform patient-independent models.\nHowever, in a wearable device one typically starts with a patient-independent\nmodel, until such patient-specific data is available. To avoid having to\nconstruct a new classifier with this data, as required in conventional kernel\nmachines, we propose a transfer learning approach with a tensor kernel machine.\nThis method learns the primal weights in a compressed form using the canonical\npolyadic decomposition, making it possible to efficiently update the weights of\nthe patient-independent model with patient-specific data. The results show that\nthis patient fine-tuned model reaches as high a performance as a\npatient-specific SVM model with a model size that is twice as small as the\npatient-specific model and ten times as small as the patient-independent model.",
"arxiv_id": "http://arxiv.org/abs/2408.00437v1",
"pdf_url": "http://arxiv.org/pdf/2408.00437v1",
"primary_category": "eess.SP",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "A Cross-Domain Benchmark for Active Learning",
"authors": "Thorben Werner, Johannes Burchert, Maximilian Stubbemann, Lars Schmidt-Thieme",
"abstract": "Active Learning (AL) deals with identifying the most informative samples for\nlabeling to reduce data annotation costs for supervised learning tasks. AL\nresearch suffers from the fact that lifts from literature generalize poorly and\nthat only a small number of repetitions of experiments are conducted. To\novercome these obstacles, we propose \\emph{CDALBench}, the first active\nlearning benchmark which includes tasks in computer vision, natural language\nprocessing and tabular learning. Furthermore, by providing an efficient, greedy\noracle, \\emph{CDALBench} can be evaluated with 50 runs for each experiment. We\nshow, that both the cross-domain character and a large amount of repetitions\nare crucial for sophisticated evaluation of AL research. Concretely, we show\nthat the superiority of specific methods varies over the different domains,\nmaking it important to evaluate Active Learning with a cross-domain benchmark.\nAdditionally, we show that having a large amount of runs is crucial. With only\nconducting three runs as often done in the literature, the superiority of\nspecific methods can strongly vary with the specific runs. This effect is so\nstrong, that, depending on the seed, even a well-established method's\nperformance can be significantly better and significantly worse than random for\nthe same dataset.",
"arxiv_id": "http://arxiv.org/abs/2408.00426v1",
"pdf_url": "http://arxiv.org/pdf/2408.00426v1",
"primary_category": "cs.LG",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Towards Evolutionary-based Automated Machine Learning for Small Molecule Pharmacokinetic Prediction",
"authors": "Alex G. C. de S\u00e1, David B. Ascher",
"abstract": "Machine learning (ML) is revolutionising drug discovery by expediting the\nprediction of small molecule properties essential for developing new drugs.\nThese properties -- including absorption, distribution, metabolism and\nexcretion (ADME)-- are crucial in the early stages of drug development since\nthey provide an understanding of the course of the drug in the organism, i.e.,\nthe drug's pharmacokinetics. However, existing methods lack personalisation and\nrely on manually crafted ML algorithms or pipelines, which can introduce\ninefficiencies and biases into the process. To address these challenges, we\npropose a novel evolutionary-based automated ML method (AutoML) specifically\ndesigned for predicting small molecule properties, with a particular focus on\npharmacokinetics. Leveraging the advantages of grammar-based genetic\nprogramming, our AutoML method streamlines the process by automatically\nselecting algorithms and designing predictive pipelines tailored to the\nparticular characteristics of input molecular data. Results demonstrate\nAutoML's effectiveness in selecting diverse ML algorithms, resulting in\ncomparable or even improved predictive performances compared to conventional\napproaches. By offering personalised ML-driven pipelines, our method promises\nto enhance small molecule research in drug discovery, providing researchers\nwith a valuable tool for accelerating the development of novel therapeutic\ndrugs.",
"arxiv_id": "http://arxiv.org/abs/2408.00421v1",
"pdf_url": "http://arxiv.org/pdf/2408.00421v1",
"primary_category": "cs.LG",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Unsupervised Pairwise Causal Discovery on Heterogeneous Data using Mutual Information Measures",
"authors": "Alexandre Trilla, Nenad Mijatovic",
"abstract": "A fundamental task in science is to determine the underlying causal relations\nbecause it is the knowledge of this functional structure what leads to the\ncorrect interpretation of an effect given the apparent associations in the\nobserved data. In this sense, Causal Discovery is a technique that tackles this\nchallenge by analyzing the statistical properties of the constituent variables.\nIn this work, we target the generalizability of the discovery method by\nfollowing a reductionist approach that only involves two variables, i.e., the\npairwise or bi-variate setting. We question the current (possibly misleading)\nbaseline results on the basis that they were obtained through supervised\nlearning, which is arguably contrary to this genuinely exploratory endeavor. In\nconsequence, we approach this problem in an unsupervised way, using robust\nMutual Information measures, and observing the impact of the different variable\ntypes, which is oftentimes ignored in the design of solutions. Thus, we provide\na novel set of standard unbiased results that can serve as a reference to guide\nfuture discovery tasks in completely unknown environments.",
"arxiv_id": "http://arxiv.org/abs/2408.00399v1",
"pdf_url": "http://arxiv.org/pdf/2408.00399v1",
"primary_category": "cs.AI",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "What comes after transformers? -- A selective survey connecting ideas in deep learning",
"authors": "Johannes Schneider",
"abstract": "Transformers have become the de-facto standard model in artificial\nintelligence since 2017 despite numerous shortcomings ranging from energy\ninefficiency to hallucinations. Research has made a lot of progress in\nimproving elements of transformers, and, more generally, deep learning\nmanifesting in many proposals for architectures, layers, optimization\nobjectives, and optimization techniques. For researchers it is difficult to\nkeep track of such developments on a broader level. We provide a comprehensive\noverview of the many important, recent works in these areas to those who\nalready have a basic understanding of deep learning. Our focus differs from\nother works, as we target specifically novel, alternative potentially\ndisruptive approaches to transformers as well as successful ideas of recent\ndeep learning. We hope that such a holistic and unified treatment of\ninfluential, recent works and novel ideas helps researchers to form new\nconnections between diverse areas of deep learning. We identify and discuss\nmultiple patterns that summarize the key strategies for successful innovations\nover the last decade as well as works that can be seen as rising stars.\nEspecially, we discuss attempts on how to improve on transformers covering\n(partially) proven methods such as state space models but also including\nfar-out ideas in deep learning that seem promising despite not achieving\nstate-of-the-art results. We also cover a discussion on recent state-of-the-art\nmodels such as OpenAI's GPT series and Meta's LLama models and, Google's Gemini\nmodel family.",
"arxiv_id": "http://arxiv.org/abs/2408.00386v1",
"pdf_url": "http://arxiv.org/pdf/2408.00386v1",
"primary_category": "cs.LG",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Enhancing Whole Slide Pathology Foundation Models through Stain Normalization",
"authors": "Juseung Yun, Yi Hu, Jinhyung Kim, Jongseong Jang, Soonyoung Lee",
"abstract": "Recent advancements in digital pathology have led to the development of\nnumerous foundational models that utilize self-supervised learning on patches\nextracted from gigapixel whole slide images (WSIs). While this approach\nleverages vast amounts of unlabeled data, we have discovered a significant\nissue: features extracted from these self-supervised models tend to cluster by\nindividual WSIs, a phenomenon we term WSI-specific feature collapse. This\nproblem can potentially limit the model's generalization ability and\nperformance on various downstream tasks. To address this issue, we introduce\nStain Normalized Pathology Foundational Model, a novel foundational model\ntrained on patches that have undergone stain normalization. Stain normalization\nhelps reduce color variability arising from different laboratories and\nscanners, enabling the model to learn more consistent features. Stain\nNormalized Pathology Foundational Model is trained using 285,153,903 patches\nextracted from a total of 34,795 WSIs, combining data from The Cancer Genome\nAtlas (TCGA) and the Genotype-Tissue Expression (GTEx) project. Our experiments\ndemonstrate that Stain Normalized Pathology Foundational Model significantly\nmitigates the feature collapse problem, indicating that the model has learned\nmore generalized features rather than overfitting to individual WSI\ncharacteristics. We compared Stain Normalized Pathology Foundational Model with\nstate-of-the-art models across six downstream task datasets, and our results\nshow that \\name{} achieves excellent performance relative to the number of WSIs\nused and the model's parameter count. This suggests that the application of\nstain normalization has substantially improved the model's efficiency and\ngeneralization capabilities.",
"arxiv_id": "http://arxiv.org/abs/2408.00380v1",
"pdf_url": "http://arxiv.org/pdf/2408.00380v1",
"primary_category": "cs.LG",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "On the Limitations and Prospects of Machine Unlearning for Generative AI",
"authors": "Shiji Zhou, Lianzhe Wang, Jiangnan Ye, Yongliang Wu, Heng Chang",
"abstract": "Generative AI (GenAI), which aims to synthesize realistic and diverse data\nsamples from latent variables or other data modalities, has achieved remarkable\nresults in various domains, such as natural language, images, audio, and\ngraphs. However, they also pose challenges and risks to data privacy, security,\nand ethics. Machine unlearning is the process of removing or weakening the\ninfluence of specific data samples or features from a trained model, without\naffecting its performance on other data or tasks. While machine unlearning has\nshown significant efficacy in traditional machine learning tasks, it is still\nunclear if it could help GenAI become safer and aligned with human desire. To\nthis end, this position paper provides an in-depth discussion of the machine\nunlearning approaches for GenAI. Firstly, we formulate the problem of machine\nunlearning tasks on GenAI and introduce the background. Subsequently, we\nsystematically examine the limitations of machine unlearning on GenAI models by\nfocusing on the two representative branches: LLMs and image generative\n(diffusion) models. Finally, we provide our prospects mainly from three\naspects: benchmark, evaluation metrics, and utility-unlearning trade-off, and\nconscientiously advocate for the future development of this field.",
"arxiv_id": "http://arxiv.org/abs/2408.00376v1",
"pdf_url": "http://arxiv.org/pdf/2408.00376v1",
"primary_category": "cs.LG",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Conformal Trajectory Prediction with Multi-View Data Integration in Cooperative Driving",
"authors": "Xi Chen, Rahul Bhadani, Larry Head",
"abstract": "Current research on trajectory prediction primarily relies on data collected\nby onboard sensors of an ego vehicle. With the rapid advancement in connected\ntechnologies, such as vehicle-to-vehicle (V2V) and vehicle-to-infrastructure\n(V2I) communication, valuable information from alternate views becomes\naccessible via wireless networks. The integration of information from\nalternative views has the potential to overcome the inherent limitations\nassociated with a single viewpoint, such as occlusions and limited field of\nview. In this work, we introduce V2INet, a novel trajectory prediction\nframework designed to model multi-view data by extending existing single-view\nmodels. Unlike previous approaches where the multi-view data is manually fused\nor formulated as a separate training stage, our model supports end-to-end\ntraining, enhancing both flexibility and performance. Moreover, the predicted\nmultimodal trajectories are calibrated by a post-hoc conformal prediction\nmodule to get valid and efficient confidence regions. We evaluated the entire\nframework using the real-world V2I dataset V2X-Seq. Our results demonstrate\nsuperior performance in terms of Final Displacement Error (FDE) and Miss Rate\n(MR) using a single GPU. The code is publicly available at:\n\\url{https://github.com/xichennn/V2I_trajectory_prediction}.",
"arxiv_id": "http://arxiv.org/abs/2408.00374v1",
"pdf_url": "http://arxiv.org/pdf/2408.00374v1",
"primary_category": "cs.AI",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Memorization Capacity for Additive Fine-Tuning with Small ReLU Networks",
"authors": "Jy-yong Sohn, Dohyun Kwon, Seoyeon An, Kangwook Lee",
"abstract": "Fine-tuning large pre-trained models is a common practice in machine learning\napplications, yet its mathematical analysis remains largely unexplored. In this\npaper, we study fine-tuning through the lens of memorization capacity. Our new\nmeasure, the Fine-Tuning Capacity (FTC), is defined as the maximum number of\nsamples a neural network can fine-tune, or equivalently, as the minimum number\nof neurons ($m$) needed to arbitrarily change $N$ labels among $K$ samples\nconsidered in the fine-tuning process. In essence, FTC extends the memorization\ncapacity concept to the fine-tuning scenario. We analyze FTC for the additive\nfine-tuning scenario where the fine-tuned network is defined as the summation\nof the frozen pre-trained network $f$ and a neural network $g$ (with $m$\nneurons) designed for fine-tuning. When $g$ is a ReLU network with either 2 or\n3 layers, we obtain tight upper and lower bounds on FTC; we show that $N$\nsamples can be fine-tuned with $m=\\Theta(N)$ neurons for 2-layer networks, and\nwith $m=\\Theta(\\sqrt{N})$ neurons for 3-layer networks, no matter how large $K$\nis. Our results recover the known memorization capacity results when $N = K$ as\na special case.",
"arxiv_id": "http://arxiv.org/abs/2408.00359v1",
"pdf_url": "http://arxiv.org/pdf/2408.00359v1",
"primary_category": "cs.LG",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Neural Graph Matching for Video Retrieval in Large-Scale Video-driven E-commerce",
"authors": "Houye Ji, Ye Tang, Zhaoxin Chen, Lixi Deng, Jun Hu, Lei Su",
"abstract": "With the rapid development of the short video industry, traditional\ne-commerce has encountered a new paradigm, video-driven e-commerce, which\nleverages attractive videos for product showcases and provides both video and\nitem services for users. Benefitting from the dynamic and visualized\nintroduction of items,video-driven e-commerce has shown huge potential in\nstimulating consumer confidence and promoting sales. In this paper, we focus on\nthe video retrieval task, facing the following challenges: (1) Howto handle the\nheterogeneities among users, items, and videos? (2)How to mine the\ncomplementarity between items and videos for better user understanding? In this\npaper, we first leverage the dual graph to model the co-existing of user-video\nand user-item interactions in video-driven e-commerce and innovatively reduce\nuser preference understanding to a graph matching problem. To solve it, we\nfurther propose a novel bi-level Graph Matching Network(GMN), which mainly\nconsists of node- and preference-level graph matching. Given a user, node-level\ngraph matching aims to match videos and items, while preference-level graph\nmatching aims to match multiple user preferences extracted from both videos and\nitems. Then the proposed GMN can generate and improve user embedding by\naggregating matched nodes or preferences from the dual graph in a bi-level\nmanner. Comprehensive experiments show the superiority of the proposed GMN with\nsignificant improvements over state-of-the-art approaches (e.g., AUC+1.9% and\nCTR+7.15%). We have developed it on a well-known video-driven e-commerce\nplatform, serving hundreds of millions of users every day",
"arxiv_id": "http://arxiv.org/abs/2408.00346v1",
"pdf_url": "http://arxiv.org/pdf/2408.00346v1",
"primary_category": "cs.LG",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "IN-Sight: Interactive Navigation through Sight",
"authors": "Philipp Schoch, Fan Yang, Yuntao Ma, Stefan Leutenegger, Marco Hutter, Quentin Leboute",
"abstract": "Current visual navigation systems often treat the environment as static,\nlacking the ability to adaptively interact with obstacles. This limitation\nleads to navigation failure when encountering unavoidable obstructions. In\nresponse, we introduce IN-Sight, a novel approach to self-supervised path\nplanning, enabling more effective navigation strategies through interaction\nwith obstacles. Utilizing RGB-D observations, IN-Sight calculates\ntraversability scores and incorporates them into a semantic map, facilitating\nlong-range path planning in complex, maze-like environments. To precisely\nnavigate around obstacles, IN-Sight employs a local planner, trained\nimperatively on a differentiable costmap using representation learning\ntechniques. The entire framework undergoes end-to-end training within the\nstate-of-the-art photorealistic Intel SPEAR Simulator. We validate the\neffectiveness of IN-Sight through extensive benchmarking in a variety of\nsimulated scenarios and ablation studies. Moreover, we demonstrate the system's\nreal-world applicability with zero-shot sim-to-real transfer, deploying our\nplanner on the legged robot platform ANYmal, showcasing its practical potential\nfor interactive navigation in real environments.",
"arxiv_id": "http://arxiv.org/abs/2408.00343v1",
"pdf_url": "http://arxiv.org/pdf/2408.00343v1",
"primary_category": "cs.RO",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "MuJoCo MPC for Humanoid Control: Evaluation on HumanoidBench",
"authors": "Moritz Meser, Aditya Bhatt, Boris Belousov, Jan Peters",
"abstract": "We tackle the recently introduced benchmark for whole-body humanoid control\nHumanoidBench using MuJoCo MPC. We find that sparse reward functions of\nHumanoidBench yield undesirable and unrealistic behaviors when optimized;\ntherefore, we propose a set of regularization terms that stabilize the robot\nbehavior across tasks. Current evaluations on a subset of tasks demonstrate\nthat our proposed reward function allows achieving the highest HumanoidBench\nscores while maintaining realistic posture and smooth control signals. Our code\nis publicly available and will become a part of MuJoCo MPC, enabling rapid\nprototyping of robot behaviors.",
"arxiv_id": "http://arxiv.org/abs/2408.00342v1",
"pdf_url": "http://arxiv.org/pdf/2408.00342v1",
"primary_category": "cs.RO",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "\"Patriarchy Hurts Men Too.\" Does Your Model Agree? A Discussion on Fairness Assumptions",
"authors": "Marco Favier, Toon Calders",
"abstract": "The pipeline of a fair ML practitioner is generally divided into three\nphases: 1) Selecting a fairness measure. 2) Choosing a model that minimizes\nthis measure. 3) Maximizing the model's performance on the data. In the context\nof group fairness, this approach often obscures implicit assumptions about how\nbias is introduced into the data. For instance, in binary classification, it is\noften assumed that the best model, with equal fairness, is the one with better\nperformance. However, this belief already imposes specific properties on the\nprocess that introduced bias. More precisely, we are already assuming that the\nbiasing process is a monotonic function of the fair scores, dependent solely on\nthe sensitive attribute. We formally prove this claim regarding several\nimplicit fairness assumptions. This leads, in our view, to two possible\nconclusions: either the behavior of the biasing process is more complex than\nmere monotonicity, which means we need to identify and reject our implicit\nassumptions in order to develop models capable of tackling more complex\nsituations; or the bias introduced in the data behaves predictably, implying\nthat many of the developed models are superfluous.",
"arxiv_id": "http://arxiv.org/abs/2408.00330v1",
"pdf_url": "http://arxiv.org/pdf/2408.00330v1",
"primary_category": "cs.LG",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "OTAD: An Optimal Transport-Induced Robust Model for Agnostic Adversarial Attack",
"authors": "Kuo Gai, Sicong Wang, Shihua Zhang",
"abstract": "Deep neural networks (DNNs) are vulnerable to small adversarial perturbations\nof the inputs, posing a significant challenge to their reliability and\nrobustness. Empirical methods such as adversarial training can defend against\nparticular attacks but remain vulnerable to more powerful attacks.\nAlternatively, Lipschitz networks provide certified robustness to unseen\nperturbations but lack sufficient expressive power. To harness the advantages\nof both approaches, we design a novel two-step Optimal Transport induced\nAdversarial Defense (OTAD) model that can fit the training data accurately\nwhile preserving the local Lipschitz continuity. First, we train a DNN with a\nregularizer derived from optimal transport theory, yielding a discrete optimal\ntransport map linking data to its features. By leveraging the map's inherent\nregularity, we interpolate the map by solving the convex integration problem\n(CIP) to guarantee the local Lipschitz property. OTAD is extensible to diverse\narchitectures of ResNet and Transformer, making it suitable for complex data.\nFor efficient computation, the CIP can be solved through training neural\nnetworks. OTAD opens a novel avenue for developing reliable and secure deep\nlearning systems through the regularity of optimal transport maps. Empirical\nresults demonstrate that OTAD can outperform other robust models on diverse\ndatasets.",
"arxiv_id": "http://arxiv.org/abs/2408.00329v1",
"pdf_url": "http://arxiv.org/pdf/2408.00329v1",
"primary_category": "cs.LG",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Exploiting Preferences in Loss Functions for Sequential Recommendation via Weak Transitivity",
"authors": "Hyunsoo Chung, Jungtaek Kim, Hyungeun Jo, Hyungwon Choi",
"abstract": "A choice of optimization objective is immensely pivotal in the design of a\nrecommender system as it affects the general modeling process of a user's\nintent from previous interactions. Existing approaches mainly adhere to three\ncategories of loss functions: pairwise, pointwise, and setwise loss functions.\nDespite their effectiveness, a critical and common drawback of such objectives\nis viewing the next observed item as a unique positive while considering all\nremaining items equally negative. Such a binary label assignment is generally\nlimited to assuring a higher recommendation score of the positive item,\nneglecting potential structures induced by varying preferences between other\nunobserved items. To alleviate this issue, we propose a novel method that\nextends original objectives to explicitly leverage the different levels of\npreferences as relative orders between their scores. Finally, we demonstrate\nthe superior performance of our method compared to baseline objectives.",
"arxiv_id": "http://arxiv.org/abs/2408.00326v1",
"pdf_url": "http://arxiv.org/pdf/2408.00326v1",
"primary_category": "cs.LG",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "ADBM: Adversarial diffusion bridge model for reliable adversarial purification",
"authors": "Xiao Li, Wenxuan Sun, Huanran Chen, Qiongxiu Li, Yining Liu, Yingzhe He, Jie Shi, Xiaolin Hu",
"abstract": "Recently Diffusion-based Purification (DiffPure) has been recognized as an\neffective defense method against adversarial examples. However, we find\nDiffPure which directly employs the original pre-trained diffusion models for\nadversarial purification, to be suboptimal. This is due to an inherent\ntrade-off between noise purification performance and data recovery quality.\nAdditionally, the reliability of existing evaluations for DiffPure is\nquestionable, as they rely on weak adaptive attacks. In this work, we propose a\nnovel Adversarial Diffusion Bridge Model, termed ADBM. ADBM directly constructs\na reverse bridge from the diffused adversarial data back to its original clean\nexamples, enhancing the purification capabilities of the original diffusion\nmodels. Through theoretical analysis and experimental validation across various\nscenarios, ADBM has proven to be a superior and robust defense mechanism,\noffering significant promise for practical applications.",
"arxiv_id": "http://arxiv.org/abs/2408.00315v1",
"pdf_url": "http://arxiv.org/pdf/2408.00315v1",
"primary_category": "cs.LG",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Adversarial Text Rewriting for Text-aware Recommender Systems",
"authors": "Sejoon Oh, Gaurav Verma, Srijan Kumar",
"abstract": "Text-aware recommender systems incorporate rich textual features, such as\ntitles and descriptions, to generate item recommendations for users. The use of\ntextual features helps mitigate cold-start problems, and thus, such recommender\nsystems have attracted increased attention. However, we argue that the\ndependency on item descriptions makes the recommender system vulnerable to\nmanipulation by adversarial sellers on e-commerce platforms. In this paper, we\nexplore the possibility of such manipulation by proposing a new text rewriting\nframework to attack text-aware recommender systems. We show that the rewriting\nattack can be exploited by sellers to unfairly uprank their products, even\nthough the adversarially rewritten descriptions are perceived as realistic by\nhuman evaluators. Methodologically, we investigate two different variations to\ncarry out text rewriting attacks: (1) two-phase fine-tuning for greater attack\nperformance, and (2) in-context learning for higher text rewriting quality.\nExperiments spanning 3 different datasets and 4 existing approaches demonstrate\nthat recommender systems exhibit vulnerability against the proposed text\nrewriting attack. Our work adds to the existing literature around the\nrobustness of recommender systems, while highlighting a new dimension of\nvulnerability in the age of large-scale automated text generation.",
"arxiv_id": "http://arxiv.org/abs/2408.00312v1",
"pdf_url": "http://arxiv.org/pdf/2408.00312v1",
"primary_category": "cs.IR",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Online Linear Programming with Batching",
"authors": "Haoran Xu, Peter W. Glynn, Yinyu Ye",
"abstract": "We study Online Linear Programming (OLP) with batching. The planning horizon\nis cut into $K$ batches, and the decisions on customers arriving within a batch\ncan be delayed to the end of their associated batch. Compared with OLP without\nbatching, the ability to delay decisions brings better operational performance,\nas measured by regret. Two research questions of interest are: (1) What is a\nlower bound of the regret as a function of $K$? (2) What algorithms can achieve\nthe regret lower bound? These questions have been analyzed in the literature\nwhen the distribution of the reward and the resource consumption of the\ncustomers have finite support. By contrast, this paper analyzes these questions\nwhen the conditional distribution of the reward given the resource consumption\nis continuous, and we show the answers are different under this setting. When\nthere is only a single type of resource and the decision maker knows the total\nnumber of customers, we propose an algorithm with a $O(\\log K)$ regret upper\nbound and provide a $\\Omega(\\log K)$ regret lower bound. We also propose\nalgorithms with $O(\\log K)$ regret upper bound for the setting in which there\nare multiple types of resource and the setting in which customers arrive\nfollowing a Poisson process. All these regret upper and lower bounds are\nindependent of the length of the planning horizon, and all the proposed\nalgorithms delay decisions on customers arriving in only the first and the last\nbatch. We also take customer impatience into consideration and establish a way\nof selecting an appropriate batch size.",
"arxiv_id": "http://arxiv.org/abs/2408.00310v1",
"pdf_url": "http://arxiv.org/pdf/2408.00310v1",
"primary_category": "cs.LG",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Discretizing Continuous Action Space with Unimodal Probability Distributions for On-Policy Reinforcement Learning",
"authors": "Yuanyang Zhu, Zhi Wang, Yuanheng Zhu, Chunlin Chen, Dongbin Zhao",
"abstract": "For on-policy reinforcement learning, discretizing action space for\ncontinuous control can easily express multiple modes and is straightforward to\noptimize. However, without considering the inherent ordering between the\ndiscrete atomic actions, the explosion in the number of discrete actions can\npossess undesired properties and induce a higher variance for the policy\ngradient estimator. In this paper, we introduce a straightforward architecture\nthat addresses this issue by constraining the discrete policy to be unimodal\nusing Poisson probability distributions. This unimodal architecture can better\nleverage the continuity in the underlying continuous action space using\nexplicit unimodal probability distributions. We conduct extensive experiments\nto show that the discrete policy with the unimodal probability distribution\nprovides significantly faster convergence and higher performance for on-policy\nreinforcement learning algorithms in challenging control tasks, especially in\nhighly complex tasks such as Humanoid. We provide theoretical analysis on the\nvariance of the policy gradient estimator, which suggests that our attentively\ndesigned unimodal discrete policy can retain a lower variance and yield a\nstable learning process.",
"arxiv_id": "http://arxiv.org/abs/2408.00309v1",
"pdf_url": "http://arxiv.org/pdf/2408.00309v1",
"primary_category": "cs.LG",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "ABC Align: Large Language Model Alignment for Safety & Accuracy",
"authors": "Gareth Seneque, Lap-Hang Ho, Ariel Kuperman, Nafise Erfanian Saeedi, Jeffrey Molendijk",
"abstract": "Alignment of Large Language Models (LLMs) remains an unsolved problem. Human\npreferences are highly distributed and can be captured at multiple levels of\nabstraction, from the individual to diverse populations. Organisational\npreferences, represented by standards and principles, are defined to mitigate\nreputational risk or meet legislative obligations. In this paper, we present\nABC Align, a novel alignment methodology for LLMs that enables integration of\nthe standards and preferences of a large media organisation into the LLM\nitself. We combine a set of data and methods that build on recent breakthroughs\nin synthetic data generation, preference optimisation, and post-training model\nquantisation. Our unified approach mitigates bias and improves accuracy, while\npreserving reasoning capability, as measured against standard benchmarks.",
"arxiv_id": "http://arxiv.org/abs/2408.00307v1",
"pdf_url": "http://arxiv.org/pdf/2408.00307v1",
"primary_category": "cs.LG",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Contrastive Graph Representation Learning with Adversarial Cross-view Reconstruction and Information Bottleneck",
"authors": "Yuntao Shou, Haozhi Lan, Xiangyong Cao",
"abstract": "Graph Neural Networks (GNNs) have received extensive research attention due\nto their powerful information aggregation capabilities. Despite the success of\nGNNs, most of them suffer from the popularity bias issue in a graph caused by a\nsmall number of popular categories. Additionally, real graph datasets always\ncontain incorrect node labels, which hinders GNNs from learning effective node\nrepresentations. Graph contrastive learning (GCL) has been shown to be\neffective in solving the above problems for node classification tasks. Most\nexisting GCL methods are implemented by randomly removing edges and nodes to\ncreate multiple contrasting views, and then maximizing the mutual information\n(MI) between these contrasting views to improve the node feature\nrepresentation. However, maximizing the mutual information between multiple\ncontrasting views may lead the model to learn some redundant information\nirrelevant to the node classification task. To tackle this issue, we propose an\neffective Contrastive Graph Representation Learning with Adversarial Cross-view\nReconstruction and Information Bottleneck (CGRL) for node classification, which\ncan adaptively learn to mask the nodes and edges in the graph to obtain the\noptimal graph structure representation. Furthermore, we innovatively introduce\nthe information bottleneck theory into GCLs to remove redundant information in\nmultiple contrasting views while retaining as much information as possible\nabout node classification. Moreover, we add noise perturbations to the original\nviews and reconstruct the augmented views by constructing adversarial views to\nimprove the robustness of node feature representation. Extensive experiments on\nreal-world public datasets demonstrate that our method significantly\noutperforms existing state-of-the-art algorithms.",
"arxiv_id": "http://arxiv.org/abs/2408.00295v1",
"pdf_url": "http://arxiv.org/pdf/2408.00295v1",
"primary_category": "cs.LG",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Gradient Harmonization in Unsupervised Domain Adaptation",
"authors": "Fuxiang Huang, Suqi Song, Lei Zhang",
"abstract": "Unsupervised domain adaptation (UDA) intends to transfer knowledge from a\nlabeled source domain to an unlabeled target domain. Many current methods focus\non learning feature representations that are both discriminative for\nclassification and invariant across domains by simultaneously optimizing domain\nalignment and classification tasks. However, these methods often overlook a\ncrucial challenge: the inherent conflict between these two tasks during\ngradient-based optimization. In this paper, we delve into this issue and\nintroduce two effective solutions known as Gradient Harmonization, including GH\nand GH++, to mitigate the conflict between domain alignment and classification\ntasks. GH operates by altering the gradient angle between different tasks from\nan obtuse angle to an acute angle, thus resolving the conflict and trade-offing\nthe two tasks in a coordinated manner. Yet, this would cause both tasks to\ndeviate from their original optimization directions. We thus further propose an\nimproved version, GH++, which adjusts the gradient angle between tasks from an\nobtuse angle to a vertical angle. This not only eliminates the conflict but\nalso minimizes deviation from the original gradient directions. Finally, for\noptimization convenience and efficiency, we evolve the gradient harmonization\nstrategies into a dynamically weighted loss function using an integral operator\non the harmonized gradient. Notably, GH/GH++ are orthogonal to UDA and can be\nseamlessly integrated into most existing UDA models. Theoretical insights and\nexperimental analyses demonstrate that the proposed approaches not only enhance\npopular UDA baselines but also improve recent state-of-the-art models.",
"arxiv_id": "http://arxiv.org/abs/2408.00288v1",
"pdf_url": "http://arxiv.org/pdf/2408.00288v1",
"primary_category": "cs.CV",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "High Performance Im2win and Direct Convolutions using Three Tensor Layouts on SIMD Architectures",
"authors": "Xiang Fu, Xinpeng Zhang, Jixiang Ma, Peng Zhao, Shuai Lu, Xu T. Liu",
"abstract": "Convolution is the core component within deep neural networks and it is\ncomputationally intensive and time consuming. Tensor data layouts significantly\nimpact convolution operations in terms of memory access and computational\nefficiency. Yet, there is still a lack of comprehensive performance\ncharacterization on data layouts on SIMD architectures concerning convolution\nmethods. This paper proposes three novel data layouts for im2win convolution:\nNHWC, CHWN, and CHWN8, and introduces a set of general optimization techniques\nfor both direct and im2win convolutions. We compare the optimized im2win\nconvolution with the direct convolution and PyTorch's im2col-based convolution\nacross the aforementioned layouts on SIMD machines. The experiments\ndemonstrated that the im2win convolution with the new NHWC layout achieved up\nto 355% performance speedup over NCHW layout. Our optimizations also\nsignificantly improve the performance of both im2win and direct convolutions.\nOur optimized im2win and direct convolutions achieved up to 95% and 94% of\nmachine's theoretical peak performance, respectively.",
"arxiv_id": "http://arxiv.org/abs/2408.00278v1",
"pdf_url": "http://arxiv.org/pdf/2408.00278v1",
"primary_category": "cs.LG",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Clover-2: Accurate Inference for Regressive Lightweight Speculative Decoding",
"authors": "Bin Xiao, Lujun Gui, Lei Su, Weipeng Chen",
"abstract": "Large Language Models (LLMs) frequently suffer from inefficiencies, largely\nattributable to the discord between the requirements of auto-regressive\ndecoding and the architecture of contemporary GPUs. Recently, regressive\nlightweight speculative decoding has garnered attention for its notable\nefficiency improvements in text generation tasks. This approach utilizes a\nlightweight regressive draft model, like a Recurrent Neural Network (RNN) or a\nsingle transformer decoder layer, leveraging sequential information to\niteratively predict potential tokens. Specifically, RNN draft models are\ncomputationally economical but tend to deliver lower accuracy, while attention\ndecoder layer models exhibit the opposite traits. This paper presents Clover-2,\nan advanced iteration of Clover, an RNN-based draft model designed to achieve\ncomparable accuracy to that of attention decoder layer models while maintaining\nminimal computational overhead. Clover-2 enhances the model architecture and\nincorporates knowledge distillation to increase Clover's accuracy and improve\noverall efficiency. We conducted experiments using the open-source Vicuna 7B\nand LLaMA3-Instruct 8B models. The results demonstrate that Clover-2 surpasses\nexisting methods across various model architectures, showcasing its efficacy\nand robustness.",
"arxiv_id": "http://arxiv.org/abs/2408.00264v1",
"pdf_url": "http://arxiv.org/pdf/2408.00264v1",
"primary_category": "cs.CL",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Mobility-Aware Federated Self-supervised Learning in Vehicular Network",
"authors": "Xueying Gu, Qiong Wu, Pingyi Fan, Qiang Fan",
"abstract": "Federated Learning (FL) is an advanced distributed machine learning approach,\nthat protects the privacy of each vehicle by allowing the model to be trained\non multiple devices simultaneously without the need to upload all data to a\nroad side unit (RSU). This enables FL to handle scenarios with sensitive or\nwidely distributed data. However, in these fields, it is well known that the\nlabeling costs can be a significant expense, and models relying on labels are\nnot suitable for these rapidly evolving fields especially in vehicular\nnetworks, or mobile internet of things (MIoT), where new data emerges\nconstantly. To handle this issue, the self-supervised learning paves the way\nfor training without labels. Additionally, for vehicles with high velocity,\nowing to blurred images, simple aggregation not only impacts the accuracy of\nthe aggregated model but also reduces the convergence speed of FL. This paper\nproposes a FL algorithm based on image blur level to aggregation, called\nFLSimCo, which does not require labels and serves as a pre-training stage for\nself-supervised learning in the vehicular environment. Simulation results\ndemonstrate that the proposed algorithm exhibits fast and stable convergence.",
"arxiv_id": "http://arxiv.org/abs/2408.00256v1",
"pdf_url": "http://arxiv.org/pdf/2408.00256v1",
"primary_category": "cs.LG",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Discovering Car-following Dynamics from Trajectory Data through Deep Learning",
"authors": "Ohay Angah, James Enouen, Xuegang, Ban, Yan Liu",
"abstract": "This study aims to discover the governing mathematical expressions of\ncar-following dynamics from trajectory data directly using deep learning\ntechniques. We propose an expression exploration framework based on deep\nsymbolic regression (DSR) integrated with a variable intersection selection\n(VIS) method to find variable combinations that encourage interpretable and\nparsimonious mathematical expressions. In the exploration learning process, two\npenalty terms are added to improve the reward function: (i) a complexity\npenalty to regulate the complexity of the explored expressions to be\nparsimonious, and (ii) a variable interaction penalty to encourage the\nexpression exploration to focus on variable combinations that can best describe\nthe data. We show the performance of the proposed method to learn several\ncar-following dynamics models and discuss its limitations and future research\ndirections.",
"arxiv_id": "http://arxiv.org/abs/2408.00251v1",
"pdf_url": "http://arxiv.org/pdf/2408.00251v1",
"primary_category": "cs.LG",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Enhanced Structured State Space Models via Grouped FIR Filtering and Attention Sink Mechanisms",
"authors": "Tian Meng, Yang Tao, Wuliang Yin",
"abstract": "Structured State Space Models (SSMs) have emerged as compelling alternatives\nto Transformer architectures, offering linear-time complexity and superior\nperformance in various sequence modeling tasks. Despite their advantages, SSMs\nlike the original Mamba-2 face training difficulties due to the sensitivities\nintroduced by the extended series of recurrent matrix multiplications. In this\npaper, we propose an advanced architecture that mitigates these challenges by\ndecomposing A-multiplications into multiple groups and optimizing positional\nencoding through Grouped Finite Impulse Response (FIR) filtering. This new\nstructure, denoted as Grouped FIR-enhanced SSM (GFSSM), employs semiseparable\nmatrices for efficient computation. Furthermore, inspired by the \"attention\nsink\" phenomenon identified in streaming language models, we incorporate a\nsimilar mechanism to enhance the stability and performance of our model over\nextended sequences. Our approach further bridges the gap between SSMs and\nTransformer architectures, offering a viable path forward for scalable and\nhigh-performing sequence modeling.",
"arxiv_id": "http://arxiv.org/abs/2408.00244v1",
"pdf_url": "http://arxiv.org/pdf/2408.00244v1",
"primary_category": "cs.CL",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Empirical Bayes Linked Matrix Decomposition",
"authors": "Eric F. Lock",
"abstract": "Data for several applications in diverse fields can be represented as\nmultiple matrices that are linked across rows or columns. This is particularly\ncommon in molecular biomedical research, in which multiple molecular \"omics\"\ntechnologies may capture different feature sets (e.g., corresponding to rows in\na matrix) and/or different sample populations (corresponding to columns). This\nhas motivated a large body of work on integrative matrix factorization\napproaches that identify and decompose low-dimensional signal that is shared\nacross multiple matrices or specific to a given matrix. We propose an empirical\nvariational Bayesian approach to this problem that has several advantages over\nexisting techniques, including the flexibility to accommodate shared signal\nover any number of row or column sets (i.e., bidimensional integration), an\nintuitive model-based objective function that yields appropriate shrinkage for\nthe inferred signals, and a relatively efficient estimation algorithm with no\ntuning parameters. A general result establishes conditions for the uniqueness\nof the underlying decomposition for a broad family of methods that includes the\nproposed approach. For scenarios with missing data, we describe an associated\niterative imputation approach that is novel for the single-matrix context and a\npowerful approach for \"blockwise\" imputation (in which an entire row or column\nis missing) in various linked matrix contexts. Extensive simulations show that\nthe method performs very well under different scenarios with respect to\nrecovering underlying low-rank signal, accurately decomposing shared and\nspecific signals, and accurately imputing missing data. The approach is applied\nto gene expression and miRNA data from breast cancer tissue and normal breast\ntissue, for which it gives an informative decomposition of variation and\noutperforms alternative strategies for missing data imputation.",
"arxiv_id": "http://arxiv.org/abs/2408.00237v1",
"pdf_url": "http://arxiv.org/pdf/2408.00237v1",
"primary_category": "stat.ML",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "CDFGNN: a Systematic Design of Cache-based Distributed Full-Batch Graph Neural Network Training with Communication Reduction",
"authors": "Shuai Zhang, Zite Jiang, Haihang You",
"abstract": "Graph neural network training is mainly categorized into mini-batch and\nfull-batch training methods. The mini-batch training method samples subgraphs\nfrom the original graph in each iteration. This sampling operation introduces\nextra computation overhead and reduces the training accuracy. Meanwhile, the\nfull-batch training method calculates the features and corresponding gradients\nof all vertices in each iteration, and therefore has higher convergence\naccuracy. However, in the distributed cluster, frequent remote accesses of\nvertex features and gradients lead to huge communication overhead, thus\nrestricting the overall training efficiency.\n In this paper, we introduce the cached-based distributed full-batch graph\nneural network training framework (CDFGNN). We propose the adaptive cache\nmechanism to reduce the remote vertex access by caching the historical features\nand gradients of neighbor vertices. Besides, we further optimize the\ncommunication overhead by quantifying the messages and designing the graph\npartition algorithm for the hierarchical communication architecture.\nExperiments show that the adaptive cache mechanism reduces remote vertex\naccesses by 63.14% on average. Combined with communication quantization and\nhierarchical GP algorithm, CDFGNN outperforms the state-of-the-art distributed\nfull-batch training frameworks by 30.39% in our experiments. Our results\nindicate that CDFGNN has great potential in accelerating distributed full-batch\nGNN training tasks.",
"arxiv_id": "http://arxiv.org/abs/2408.00232v1",
"pdf_url": "http://arxiv.org/pdf/2408.00232v1",
"primary_category": "cs.DC",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Invariant Discovery of Features Across Multiple Length Scales: Applications in Microscopy and Autonomous Materials Characterization",
"authors": "Aditya Raghavan, Utkarsh Pratiush, Mani Valleti, Richard Liu, Reece Emery, Hiroshi Funakubo, Yongtao Liu, Philip Rack, Sergei Kalinin",
"abstract": "Physical imaging is a foundational characterization method in areas from\ncondensed matter physics and chemistry to astronomy and spans length scales\nfrom atomic to universe. Images encapsulate crucial data regarding atomic\nbonding, materials microstructures, and dynamic phenomena such as\nmicrostructural evolution and turbulence, among other phenomena. The challenge\nlies in effectively extracting and interpreting this information. Variational\nAutoencoders (VAEs) have emerged as powerful tools for identifying underlying\nfactors of variation in image data, providing a systematic approach to\ndistilling meaningful patterns from complex datasets. However, a significant\nhurdle in their application is the definition and selection of appropriate\ndescriptors reflecting local structure. Here we introduce the scale-invariant\nVAE approach (SI-VAE) based on the progressive training of the VAE with the\ndescriptors sampled at different length scales. The SI-VAE allows the discovery\nof the length scale dependent factors of variation in the system. Here, we\nillustrate this approach using the ferroelectric domain images and generalize\nit to the movies of the electron-beam induced phenomena in graphene and\ntopography evolution across combinatorial libraries. This approach can further\nbe used to initialize the decision making in automated experiments including\nstructure-property discovery and can be applied across a broad range of imaging\nmethods. This approach is universal and can be applied to any spatially\nresolved data including both experimental imaging studies and simulations, and\ncan be particularly useful for exploration of phenomena such as turbulence,\nscale-invariant transformation fronts, etc.",
"arxiv_id": "http://arxiv.org/abs/2408.00229v1",
"pdf_url": "http://arxiv.org/pdf/2408.00229v1",
"primary_category": "physics.comp-ph",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Persistent de Rham-Hodge Laplacians in the Eulerian representation",
"authors": "Zhe Su, Yiying Tong, Guo-Wei Wei",
"abstract": "Recently, topological data analysis (TDA) has become a trending topic in data\nscience and engineering. However, the key technique of TDA, i.e., persistent\nhomology, is defined on point cloud data, which restricts its scope. In this\nwork, we propose persistent de Rham-Hodge Laplacian, or persistent Hodge\nLaplacian (PHL) for abbreviation, for the TDA on manifolds with boundaries, or\nvolumetric data. Specifically, we extended the evolutionary de Rham-Hodge\ntheory from the Lagrangian formulation to the Eulerian formulation via\nstructure-persevering Cartesian grids, and extended the persistent Laplacian on\npoint clouds to persistent (de Rham-)Hodge Laplacian on nested families of\nmanifolds with appropriate boundary conditions. The proposed PHL facilitates\nthe machine learning and deep learning prediction of volumetric data. For a\nproof-of-principle application of the proposed PHL, we propose a persistent\nHodge Laplacian learning (PHLL) algorithm for data on manifolds or volumetric\ndata. To this end, we showcase the PHLL prediction of protein-ligand binding\naffinities in two benchmark datasets. Our numerical experiments highlight the\npower and promise of PHLL.",
"arxiv_id": "http://arxiv.org/abs/2408.00220v1",
"pdf_url": "http://arxiv.org/pdf/2408.00220v1",
"primary_category": "math.DG",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Load Balancing in Federated Learning",
"authors": "Alireza Javani, Zhiying Wang",
"abstract": "Federated Learning (FL) is a decentralized machine learning framework that\nenables learning from data distributed across multiple remote devices,\nenhancing communication efficiency and data privacy. Due to limited\ncommunication resources, a scheduling policy is often applied to select a\nsubset of devices for participation in each FL round. The scheduling process\nconfronts significant challenges due to the need for fair workload\ndistribution, efficient resource utilization, scalability in environments with\nnumerous edge devices, and statistically heterogeneous data across devices.\nThis paper proposes a load metric for scheduling policies based on the Age of\nInformation and addresses the above challenges by minimizing the load metric\nvariance across the clients. Furthermore, a decentralized Markov scheduling\npolicy is presented, that ensures a balanced workload distribution while\neliminating the management overhead irrespective of the network size due to\nindependent client decision-making. We establish the optimal parameters of the\nMarkov chain model and validate our approach through simulations. The results\ndemonstrate that reducing the load metric variance not only promotes fairness\nand improves operational efficiency, but also enhances the convergence rate of\nthe learning models.",
"arxiv_id": "http://arxiv.org/abs/2408.00217v1",
"pdf_url": "http://arxiv.org/pdf/2408.00217v1",
"primary_category": "cs.LG",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Penzai + Treescope: A Toolkit for Interpreting, Visualizing, and Editing Models As Data",
"authors": "Daniel D. Johnson",
"abstract": "Much of today's machine learning research involves interpreting, modifying or\nvisualizing models after they are trained. I present Penzai, a neural network\nlibrary designed to simplify model manipulation by representing models as\nsimple data structures, and Treescope, an interactive pretty-printer and array\nvisualizer that can visualize both model inputs/outputs and the models\nthemselves. Penzai models are built using declarative combinators that expose\nthe model forward pass in the structure of the model object itself, and use\nnamed axes to ensure each operation is semantically meaningful. With Penzai's\ntree-editing selector system, users can both insert and replace model\ncomponents, allowing them to intervene on intermediate values or make other\nedits to the model structure. Users can then get immediate feedback by\nvisualizing the modified model with Treescope. I describe the motivation and\nmain features of Penzai and Treescope, and discuss how treating the model as\ndata enables a variety of analyses and interventions to be implemented as\ndata-structure transformations, without requiring model designers to add\nexplicit hooks.",
"arxiv_id": "http://arxiv.org/abs/2408.00211v1",
"pdf_url": "http://arxiv.org/pdf/2408.00211v1",
"primary_category": "cs.LG",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Prognosis of COVID-19 using Artificial Intelligence: A Systematic Review and Meta-analysis",
"authors": "SaeedReza Motamedian, Sadra Mohaghegh, Elham Babadi Oregani, Mahrsa Amjadi, Parnian Shobeiri, Negin Cheraghi, Niusha Solouki, Nikoo Ahmadi, Hossein Mohammad-Rahimi, Yassine Bouchareb, Arman Rahmim",
"abstract": "Purpose: Artificial intelligence (AI) techniques have been extensively\nutilized for diagnosing and prognosis of several diseases in recent years. This\nstudy identifies, appraises and synthesizes published studies on the use of AI\nfor the prognosis of COVID-19. Method: Electronic search was performed using\nMedline, Google Scholar, Scopus, Embase, Cochrane and ProQuest. Studies that\nexamined machine learning or deep learning methods to determine the prognosis\nof COVID-19 using CT or chest X-ray images were included. Polled sensitivity,\nspecificity area under the curve and diagnostic odds ratio were calculated.\nResult: A total of 36 articles were included; various prognosis-related issues,\nincluding disease severity, mechanical ventilation or admission to the\nintensive care unit and mortality, were investigated. Several AI models and\narchitectures were employed, such as the Siamense model, support vector\nmachine, Random Forest , eXtreme Gradient Boosting, and convolutional neural\nnetworks. The models achieved 71%, 88% and 67% sensitivity for mortality,\nseverity assessment and need for ventilation, respectively. The specificity of\n69%, 89% and 89% were reported for the aforementioned variables. Conclusion:\nBased on the included articles, machine learning and deep learning methods used\nfor the prognosis of COVID-19 patients using radiomic features from CT or CXR\nimages can help clinicians manage patients and allocate resources more\neffectively. These studies also demonstrate that combining patient demographic,\nclinical data, laboratory tests and radiomic features improves model\nperformances.",
"arxiv_id": "http://arxiv.org/abs/2408.00208v1",
"pdf_url": "http://arxiv.org/pdf/2408.00208v1",
"primary_category": "physics.med-ph",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "OmniParser for Pure Vision Based GUI Agent",
"authors": "Yadong Lu, Jianwei Yang, Yelong Shen, Ahmed Awadallah",
"abstract": "The recent success of large vision language models shows great potential in\ndriving the agent system operating on user interfaces. However, we argue that\nthe power multimodal models like GPT-4V as a general agent on multiple\noperating systems across different applications is largely underestimated due\nto the lack of a robust screen parsing technique capable of: 1) reliably\nidentifying interactable icons within the user interface, and 2) understanding\nthe semantics of various elements in a screenshot and accurately associate the\nintended action with the corresponding region on the screen. To fill these\ngaps, we introduce \\textsc{OmniParser}, a comprehensive method for parsing user\ninterface screenshots into structured elements, which significantly enhances\nthe ability of GPT-4V to generate actions that can be accurately grounded in\nthe corresponding regions of the interface. We first curated an interactable\nicon detection dataset using popular webpages and an icon description dataset.\nThese datasets were utilized to fine-tune specialized models: a detection model\nto parse interactable regions on the screen and a caption model to extract the\nfunctional semantics of the detected elements. \\textsc{OmniParser}\nsignificantly improves GPT-4V's performance on ScreenSpot benchmark. And on\nMind2Web and AITW benchmark, \\textsc{OmniParser} with screenshot only input\noutperforms the GPT-4V baselines requiring additional information outside of\nscreenshot.",
"arxiv_id": "http://arxiv.org/abs/2408.00203v1",
"pdf_url": "http://arxiv.org/pdf/2408.00203v1",
"primary_category": "cs.CV",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "UnPaSt: unsupervised patient stratification by differentially expressed biclusters in omics data",
"authors": "Michael Hartung, Andreas Maier, Fernando Delgado-Chaves, Yuliya Burankova, Olga I. Isaeva, F\u00e1bio Malta de S\u00e1 Patroni, Daniel He, Casey Shannon, Katharina Kaufmann, Jens Lohmann, Alexey Savchik, Anne Hartebrodt, Zoe Chervontseva, Farzaneh Firoozbakht, Niklas Probul, Evgenia Zotova, Olga Tsoy, David B. Blumenthal, Martin Ester, Tanja Laske, Jan Baumbach, Olga Zolotareva",
"abstract": "Most complex diseases, including cancer and non-malignant diseases like\nasthma, have distinct molecular subtypes that require distinct clinical\napproaches. However, existing computational patient stratification methods have\nbeen benchmarked almost exclusively on cancer omics data and only perform well\nwhen mutually exclusive subtypes can be characterized by many biomarkers. Here,\nwe contribute with a massive evaluation attempt, quantitatively exploring the\npower of 22 unsupervised patient stratification methods using both, simulated\nand real transcriptome data. From this experience, we developed UnPaSt\n(https://apps.cosy.bio/unpast/) optimizing unsupervised patient stratification,\nworking even with only a limited number of subtype-predictive biomarkers. We\nevaluated all 23 methods on real-world breast cancer and asthma transcriptomics\ndata. Although many methods reliably detected major breast cancer subtypes,\nonly few identified Th2-high asthma, and UnPaSt significantly outperformed its\nclosest competitors in both test datasets. Essentially, we showed that UnPaSt\ncan detect many biologically insightful and reproducible patterns in omic\ndatasets.",
"arxiv_id": "http://arxiv.org/abs/2408.00200v1",
"pdf_url": "http://arxiv.org/pdf/2408.00200v1",
"primary_category": "cs.LG",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Automated Software Vulnerability Static Code Analysis Using Generative Pre-Trained Transformer Models",
"authors": "Elijah Pelofske, Vincent Urias, Lorie M. Liebrock",
"abstract": "Generative Pre-Trained Transformer models have been shown to be surprisingly\neffective at a variety of natural language processing tasks -- including\ngenerating computer code. We evaluate the effectiveness of open source GPT\nmodels for the task of automatic identification of the presence of vulnerable\ncode syntax (specifically targeting C and C++ source code). This task is\nevaluated on a selection of 36 source code examples from the NIST SARD dataset,\nwhich are specifically curated to not contain natural English that indicates\nthe presence, or lack thereof, of a particular vulnerability. The NIST SARD\nsource code dataset contains identified vulnerable lines of source code that\nare examples of one out of the 839 distinct Common Weakness Enumerations (CWE),\nallowing for exact quantification of the GPT output classification error rate.\nA total of 5 GPT models are evaluated, using 10 different inference\ntemperatures and 100 repetitions at each setting, resulting in 5,000 GPT\nqueries per vulnerable source code analyzed. Ultimately, we find that the GPT\nmodels that we evaluated are not suitable for fully automated vulnerability\nscanning because the false positive and false negative rates are too high to\nlikely be useful in practice. However, we do find that the GPT models perform\nsurprisingly well at automated vulnerability detection for some of the test\ncases, in particular surpassing random sampling, and being able to identify the\nexact lines of code that are vulnerable albeit at a low success rate. The best\nperforming GPT model result found was Llama-2-70b-chat-hf with inference\ntemperature of 0.1 applied to NIST SARD test case 149165 (which is an example\nof a buffer overflow vulnerability), which had a binary classification recall\nscore of 1.0 and a precision of 1.0 for correctly and uniquely identifying the\nvulnerable line of code and the correct CWE number.",
"arxiv_id": "http://arxiv.org/abs/2408.00197v1",
"pdf_url": "http://arxiv.org/pdf/2408.00197v1",
"primary_category": "cs.CR",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Combining audio control and style transfer using latent diffusion",
"authors": "Nils Demerl\u00e9, Philippe Esling, Guillaume Doras, David Genova",
"abstract": "Deep generative models are now able to synthesize high-quality audio signals,\nshifting the critical aspect in their development from audio quality to control\ncapabilities. Although text-to-music generation is getting largely adopted by\nthe general public, explicit control and example-based style transfer are more\nadequate modalities to capture the intents of artists and musicians.\n In this paper, we aim to unify explicit control and style transfer within a\nsingle model by separating local and global information to capture musical\nstructure and timbre respectively. To do so, we leverage the capabilities of\ndiffusion autoencoders to extract semantic features, in order to build two\nrepresentation spaces. We enforce disentanglement between those spaces using an\nadversarial criterion and a two-stage training strategy. Our resulting model\ncan generate audio matching a timbre target, while specifying structure either\nwith explicit controls or through another audio example. We evaluate our model\non one-shot timbre transfer and MIDI-to-audio tasks on instrumental recordings\nand show that we outperform existing baselines in terms of audio quality and\ntarget fidelity. Furthermore, we show that our method can generate cover\nversions of complete musical pieces by transferring rhythmic and melodic\ncontent to the style of a target audio in a different genre.",
"arxiv_id": "http://arxiv.org/abs/2408.00196v1",
"pdf_url": "http://arxiv.org/pdf/2408.00196v1",
"primary_category": "cs.SD",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Adapting Skills to Novel Grasps: A Self-Supervised Approach",
"authors": "Georgios Papagiannis, Kamil Dreczkowski, Vitalis Vosylius, Edward Johns",
"abstract": "In this paper, we study the problem of adapting manipulation trajectories\ninvolving grasped objects (e.g. tools) defined for a single grasp pose to novel\ngrasp poses. A common approach to address this is to define a new trajectory\nfor each possible grasp explicitly, but this is highly inefficient. Instead, we\npropose a method to adapt such trajectories directly while only requiring a\nperiod of self-supervised data collection, during which a camera observes the\nrobot's end-effector moving with the object rigidly grasped. Importantly, our\nmethod requires no prior knowledge of the grasped object (such as a 3D CAD\nmodel), it can work with RGB images, depth images, or both, and it requires no\ncamera calibration. Through a series of real-world experiments involving 1360\nevaluations, we find that self-supervised RGB data consistently outperforms\nalternatives that rely on depth images including several state-of-the-art pose\nestimation methods. Compared to the best-performing baseline, our method\nresults in an average of 28.5% higher success rate when adapting manipulation\ntrajectories to novel grasps on several everyday tasks. Videos of the\nexperiments are available on our webpage at\nhttps://www.robot-learning.uk/adapting-skills",
"arxiv_id": "http://arxiv.org/abs/2408.00178v1",
"pdf_url": "http://arxiv.org/pdf/2408.00178v1",
"primary_category": "cs.RO",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "CREW: Facilitating Human-AI Teaming Research",
"authors": "Lingyu Zhang, Zhengran Ji, Boyuan Chen",
"abstract": "With the increasing deployment of artificial intelligence (AI) technologies,\nthe potential of humans working with AI agents has been growing at a great\nspeed. Human-AI teaming is an important paradigm for studying various aspects\nwhen humans and AI agents work together. The unique aspect of Human-AI teaming\nresearch is the need to jointly study humans and AI agents, demanding\nmultidisciplinary research efforts from machine learning to human-computer\ninteraction, robotics, cognitive science, neuroscience, psychology, social\nscience, and complex systems. However, existing platforms for Human-AI teaming\nresearch are limited, often supporting oversimplified scenarios and a single\ntask, or specifically focusing on either human-teaming research or multi-agent\nAI algorithms. We introduce CREW, a platform to facilitate Human-AI teaming\nresearch and engage collaborations from multiple scientific disciplines, with a\nstrong emphasis on human involvement. It includes pre-built tasks for cognitive\nstudies and Human-AI teaming with expandable potentials from our modular\ndesign. Following conventional cognitive neuroscience research, CREW also\nsupports multimodal human physiological signal recording for behavior analysis.\nMoreover, CREW benchmarks real-time human-guided reinforcement learning agents\nusing state-of-the-art algorithms and well-tuned baselines. With CREW, we were\nable to conduct 50 human subject studies within a week to verify the\neffectiveness of our benchmark.",
"arxiv_id": "http://arxiv.org/abs/2408.00170v1",
"pdf_url": "http://arxiv.org/pdf/2408.00170v1",
"primary_category": "cs.HC",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Strike the Balance: On-the-Fly Uncertainty based User Interactions for Long-Term Video Object Segmentation",
"authors": "St\u00e9phane Vujasinovi\u0107, Stefan Becker, Sebastian Bullinger, Norbert Scherer-Negenborn, Michael Arens",
"abstract": "In this paper, we introduce a variant of video object segmentation (VOS) that\nbridges interactive and semi-automatic approaches, termed Lazy Video Object\nSegmentation (ziVOS). In contrast, to both tasks, which handle video object\nsegmentation in an off-line manner (i.e., pre-recorded sequences), we propose\nthrough ziVOS to target online recorded sequences. Here, we strive to strike a\nbalance between performance and robustness for long-term scenarios by\nsoliciting user feedback's on-the-fly during the segmentation process. Hence,\nwe aim to maximize the tracking duration of an object of interest, while\nrequiring minimal user corrections to maintain tracking over an extended\nperiod. We propose a competitive baseline, i.e., Lazy-XMem, as a reference for\nfuture works in ziVOS. Our proposed approach uses an uncertainty estimation of\nthe tracking state to determine whether a user interaction is necessary to\nrefine the model's prediction. To quantitatively assess the performance of our\nmethod and the user's workload, we introduce complementary metrics alongside\nthose already established in the field. We evaluate our approach using the\nrecently introduced LVOS dataset, which offers numerous long-term videos. Our\ncode is publicly available at https://github.com/Vujas-Eteph/LazyXMem.",
"arxiv_id": "http://arxiv.org/abs/2408.00169v1",
"pdf_url": "http://arxiv.org/pdf/2408.00169v1",
"primary_category": "cs.CV",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Review of Explainable Graph-Based Recommender Systems",
"authors": "Thanet Markchom, Huizhi Liang, James Ferryman",
"abstract": "Explainability of recommender systems has become essential to ensure users'\ntrust and satisfaction. Various types of explainable recommender systems have\nbeen proposed including explainable graph-based recommender systems. This\nreview paper discusses state-of-the-art approaches of these systems and\ncategorizes them based on three aspects: learning methods, explaining methods,\nand explanation types. It also explores the commonly used datasets,\nexplainability evaluation methods, and future directions of this research area.\nCompared with the existing review papers, this paper focuses on explainability\nbased on graphs and covers the topics required for developing novel explainable\ngraph-based recommender systems.",
"arxiv_id": "http://arxiv.org/abs/2408.00166v1",
"pdf_url": "http://arxiv.org/pdf/2408.00166v1",
"primary_category": "cs.IR",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Non-convolutional Graph Neural Networks",
"authors": "Yuanqing Wang, Kyunghyun Cho",
"abstract": "Rethink convolution-based graph neural networks (GNN) -- they\ncharacteristically suffer from limited expressiveness, over-smoothing, and\nover-squashing, and require specialized sparse kernels for efficient\ncomputation. Here, we design a simple graph learning module entirely free of\nconvolution operators, coined \\textit{random walk with unifying memory} (RUM)\nneural network, where an RNN merges the topological and semantic graph features\nalong the random walks terminating at each node. Relating the rich literature\non RNN behavior and graph topology, we theoretically show and experimentally\nverify that RUM attenuates the aforementioned symptoms and is more expressive\nthan the Weisfeiler-Lehman (WL) isomorphism test. On a variety of node- and\ngraph-level classification and regression tasks, RUM not only achieves\ncompetitive performance, but is also robust, memory-efficient, scalable, and\nfaster than the simplest convolutional GNNs.",
"arxiv_id": "http://arxiv.org/abs/2408.00165v1",
"pdf_url": "http://arxiv.org/pdf/2408.00165v1",
"primary_category": "cs.LG",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "A Taxonomy of Stereotype Content in Large Language Models",
"authors": "Gandalf Nicolas, Aylin Caliskan",
"abstract": "This study introduces a taxonomy of stereotype content in contemporary large\nlanguage models (LLMs). We prompt ChatGPT 3.5, Llama 3, and Mixtral 8x7B, three\npowerful and widely used LLMs, for the characteristics associated with 87\nsocial categories (e.g., gender, race, occupations). We identify 14 stereotype\ndimensions (e.g., Morality, Ability, Health, Beliefs, Emotions), accounting for\n~90% of LLM stereotype associations. Warmth and Competence facets were the most\nfrequent content, but all other dimensions were significantly prevalent.\nStereotypes were more positive in LLMs (vs. humans), but there was significant\nvariability across categories and dimensions. Finally, the taxonomy predicted\nthe LLMs' internal evaluations of social categories (e.g., how\npositively/negatively the categories were represented), supporting the\nrelevance of a multidimensional taxonomy for characterizing LLM stereotypes.\nOur findings suggest that high-dimensional human stereotypes are reflected in\nLLMs and must be considered in AI auditing and debiasing to minimize\nunidentified harms from reliance in low-dimensional views of bias in LLMs.",
"arxiv_id": "http://arxiv.org/abs/2408.00162v1",
"pdf_url": "http://arxiv.org/pdf/2408.00162v1",
"primary_category": "cs.CY",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Automatic Generation of Behavioral Test Cases For Natural Language Processing Using Clustering and Prompting",
"authors": "Ying Li, Rahul Singh, Tarun Joshi, Agus Sudjianto",
"abstract": "Recent work in behavioral testing for natural language processing (NLP)\nmodels, such as Checklist, is inspired by related paradigms in software\nengineering testing. They allow evaluation of general linguistic capabilities\nand domain understanding, hence can help evaluate conceptual soundness and\nidentify model weaknesses. However, a major challenge is the creation of test\ncases. The current packages rely on semi-automated approach using manual\ndevelopment which requires domain expertise and can be time consuming. This\npaper introduces an automated approach to develop test cases by exploiting the\npower of large language models and statistical techniques. It clusters the text\nrepresentations to carefully construct meaningful groups and then apply\nprompting techniques to automatically generate Minimal Functionality Tests\n(MFT). The well-known Amazon Reviews corpus is used to demonstrate our\napproach. We analyze the behavioral test profiles across four different\nclassification algorithms and discuss the limitations and strengths of those\nmodels.",
"arxiv_id": "http://arxiv.org/abs/2408.00161v1",
"pdf_url": "http://arxiv.org/pdf/2408.00161v1",
"primary_category": "cs.CL",
"votes": 0,
"prompt": "LLM annotation ",
"model": "gpt-4-turbo"
},
{
"title": "Hierarchical Conditioning of Diffusion Models Using Tree-of-Life for Studying Species Evolution",
"authors": "Mridul Khurana, Arka Daw, M. Maruf, Josef C. Uyeda, Wasila Dahdul, Caleb Charpentier, Yasin Bak\u0131\u015f, Henry L. Bart Jr., Paula M. Mabee, Hilmar Lapp, James P. Balhoff, Wei-Lun Chao, Charles Stewart, Tanya Berger-Wolf, Anuj Karpatne",
"abstract": "A central problem in biology is to understand how organisms evolve and adapt\nto their environment by acquiring variations in the observable characteristics\nor traits of species across the tree of life. With the growing availability of\nlarge-scale image repositories in biology and recent advances in generative\nmodeling, there is an opportunity to accelerate the discovery of evolutionary\ntraits automatically from images. Toward this goal, we introduce\nPhylo-Diffusion, a novel framework for conditioning diffusion models with\nphylogenetic knowledge represented in the form of HIERarchical Embeddings\n(HIER-Embeds). We also propose two new experiments for perturbing the embedding\nspace of Phylo-Diffusion: trait masking and trait swapping, inspired by\ncounterpart experiments of gene knockout and gene editing/swapping. Our work\nrepresents a novel methodological advance in generative modeling to structure\nthe embedding space of diffusion models using tree-based knowledge. Our work\nalso opens a new chapter of research in evolutionary biology by using\ngenerative models to visualize evolutionary changes directly from images. We\nempirically demonstrate the usefulness of Phylo-Diffusion in capturing\nmeaningful trait variations for fishes and birds, revealing novel insights\nabout the biological mechanisms of their evolution.",
"arxiv_id": "http://arxiv.org/abs/2408.00160v1",
"pdf_url": "http://arxiv.org/pdf/2408.00160v1",
"primary_category": "q-bio.PE",
"votes": 0,
"prompt": "LLM annotation ",