hf_papers.json
{
"date": {
"ru": "4 марта",
"en": "March 4",
"zh": "3月4日"
},
"time_utc": "2025-03-04 16:13",
"weekday": 1,
"issue_id": 2522,
"home_page_url": "https://huggingface.co/papers",
"papers": [
{
"id": "https://huggingface.co/papers/2503.01785",
"title": "Visual-RFT: Visual Reinforcement Fine-Tuning",
"url": "https://huggingface.co/papers/2503.01785",
"abstract": "Reinforcement Fine-Tuning (RFT) in Large Reasoning Models like OpenAI o1 learns from feedback on its answers, which is especially useful in applications when fine-tuning data is scarce. Recent open-source work like DeepSeek-R1 demonstrates that reinforcement learning with verifiable reward is one key direction in reproducing o1. While the R1-style model has demonstrated success in language models, its application in multi-modal domains remains under-explored. This work introduces Visual Reinforcement Fine-Tuning (Visual-RFT), which further extends the application areas of RFT on visual tasks. Specifically, Visual-RFT first uses Large Vision-Language Models (LVLMs) to generate multiple responses containing reasoning tokens and final answers for each input, and then uses our proposed visual perception verifiable reward functions to update the model via the policy optimization algorithm such as Group Relative Policy Optimization (GRPO). We design different verifiable reward functions for different perception tasks, such as the Intersection over Union (IoU) reward for object detection. Experimental results on fine-grained image classification, few-shot object detection, reasoning grounding, as well as open-vocabulary object detection benchmarks show the competitive performance and advanced generalization ability of Visual-RFT compared with Supervised Fine-tuning (SFT). For example, Visual-RFT improves accuracy by 24.3% over the baseline in one-shot fine-grained image classification with around 100 samples. In few-shot object detection, Visual-RFT also exceeds the baseline by 21.9 on COCO's two-shot setting and 15.4 on LVIS. Our Visual-RFT represents a paradigm shift in fine-tuning LVLMs, offering a data-efficient, reward-driven approach that enhances reasoning and adaptability for domain-specific tasks.",
"score": 38,
"issue_id": 2511,
"pub_date": "2025-03-03",
"pub_date_card": {
"ru": "3 марта",
"en": "March 3",
"zh": "3月3日"
},
"hash": "ef2e10eb59ab7743",
"authors": [
"Ziyu Liu",
"Zeyi Sun",
"Yuhang Zang",
"Xiaoyi Dong",
"Yuhang Cao",
"Haodong Duan",
"Dahua Lin",
"Jiaqi Wang"
],
"affiliations": [
"Shanghai Artificial Intelligence Laboratory",
"Shanghai Jiaotong University",
"The Chinese University of Hong Kong"
],
"pdf_title_img": "assets/pdf/title_img/2503.01785.jpg",
"data": {
"categories": [
"#multimodal",
"#open_source",
"#cv",
"#optimization",
"#rlhf",
"#reasoning",
"#training",
"#rl"
],
"emoji": "🔬",
"ru": {
"title": "Visual-RFT: Революция в тонкой настройке визуально-языковых моделей",
"desc": "Статья представляет Visual Reinforcement Fine-Tuning (Visual-RFT) - метод, расширяющий применение обучения с подкреплением в визуальных задачах. Visual-RFT использует большие визуально-языковые модели для генерации ответов с токенами рассуждений и применяет визуально верифицируемые функции вознаграждения для обновления модели. Эксперименты показывают превосходство Visual-RFT над методом Supervised Fine-tuning в задачах классификации изображений, обнаружения объектов и обоснованного заземления. Метод демонстрирует значительное улучшение точности и обобщающей способности при ограниченном количестве обучающих примеров."
},
"en": {
"title": "Revolutionizing Visual Learning with Reinforcement Fine-Tuning",
"desc": "This paper introduces Visual Reinforcement Fine-Tuning (Visual-RFT), a method that enhances large vision-language models (LVLMs) by using reinforcement learning to improve their performance on visual tasks. Visual-RFT generates multiple responses for each input and employs verifiable reward functions to optimize the model's policy, making it particularly effective in scenarios with limited fine-tuning data. The approach demonstrates significant improvements in tasks like fine-grained image classification and object detection, outperforming traditional supervised fine-tuning methods. Overall, Visual-RFT represents a novel, efficient way to fine-tune LVLMs, focusing on reasoning and adaptability in specific domains."
},
"zh": {
"title": "视觉强化微调:提升推理与适应性的创新方法",
"desc": "强化微调(RFT)在大型推理模型中通过反馈学习,特别适用于微调数据稀缺的应用场景。本文提出的视觉强化微调(Visual-RFT)扩展了RFT在视觉任务中的应用,利用大型视觉语言模型生成多种响应,并通过可验证的视觉感知奖励函数进行模型更新。实验结果表明,Visual-RFT在细粒度图像分类和少样本目标检测等任务中表现出色,相较于传统的监督微调(SFT)方法,准确率显著提高。Visual-RFT代表了一种新的微调范式,提供了一种数据高效、以奖励驱动的方法,增强了模型在特定领域任务中的推理能力和适应性。"
}
}
},
{
"id": "https://huggingface.co/papers/2503.01774",
"title": "Difix3D+: Improving 3D Reconstructions with Single-Step Diffusion Models",
"url": "https://huggingface.co/papers/2503.01774",
"abstract": "Neural Radiance Fields and 3D Gaussian Splatting have revolutionized 3D reconstruction and novel-view synthesis task. However, achieving photorealistic rendering from extreme novel viewpoints remains challenging, as artifacts persist across representations. In this work, we introduce Difix3D+, a novel pipeline designed to enhance 3D reconstruction and novel-view synthesis through single-step diffusion models. At the core of our approach is Difix, a single-step image diffusion model trained to enhance and remove artifacts in rendered novel views caused by underconstrained regions of the 3D representation. Difix serves two critical roles in our pipeline. First, it is used during the reconstruction phase to clean up pseudo-training views that are rendered from the reconstruction and then distilled back into 3D. This greatly enhances underconstrained regions and improves the overall 3D representation quality. More importantly, Difix also acts as a neural enhancer during inference, effectively removing residual artifacts arising from imperfect 3D supervision and the limited capacity of current reconstruction models. Difix3D+ is a general solution, a single model compatible with both NeRF and 3DGS representations, and it achieves an average 2times improvement in FID score over baselines while maintaining 3D consistency.",
"score": 28,
"issue_id": 2512,
"pub_date": "2025-03-03",
"pub_date_card": {
"ru": "3 марта",
"en": "March 3",
"zh": "3月3日"
},
"hash": "39af2f882aef9afb",
"authors": [
"Jay Zhangjie Wu",
"Yuxuan Zhang",
"Haithem Turki",
"Xuanchi Ren",
"Jun Gao",
"Mike Zheng Shou",
"Sanja Fidler",
"Zan Gojcic",
"Huan Ling"
],
"affiliations": [
"NVIDIA",
"National University of Singapore",
"University of Toronto",
"Vector Institute"
],
"pdf_title_img": "assets/pdf/title_img/2503.01774.jpg",
"data": {
"categories": [
"#3d",
"#diffusion"
],
"emoji": "🖼️",
"ru": {
"title": "Одношаговая диффузия для фотореалистичной 3D-реконструкции",
"desc": "Difix3D+ - это новый подход к улучшению 3D-реконструкции и синтеза изображений с новых ракурсов. В его основе лежит Difix - одношаговая модель диффузии изображений, обученная улучшать и устранять артефакты в визуализированных видах. Difix используется как на этапе реконструкции для очистки псевдо-обучающих видов, так и во время вывода для устранения остаточных артефактов. Difix3D+ совместим с представлениями NeRF и 3DGS и показывает двукратное улучшение оценки FID по сравнению с базовыми моделями."
},
"en": {
"title": "Enhancing 3D Reconstruction with Difix3D+",
"desc": "This paper presents Difix3D+, a new method for improving 3D reconstruction and novel-view synthesis using single-step diffusion models. The core component, Difix, is an image diffusion model that enhances rendered views by removing artifacts caused by underconstrained areas in 3D representations. It plays a dual role by cleaning up pseudo-training views during reconstruction and acting as a neural enhancer during inference to eliminate residual artifacts. Difix3D+ is versatile, working with both Neural Radiance Fields (NeRF) and 3D Gaussian Splatting (3DGS), and it significantly improves the quality of 3D representations, achieving a 2x better FID score compared to existing methods."
},
"zh": {
"title": "Difix3D+: 提升3D重建与新视角合成的利器",
"desc": "Neural Radiance Fields(NeRF)和3D高斯点云(3D Gaussian Splatting)在3D重建和新视角合成任务中取得了重大进展。然而,从极端新视角实现真实感渲染仍然面临挑战,因为在表示中存在伪影。我们提出了Difix3D+,这是一种新颖的管道,旨在通过单步扩散模型增强3D重建和新视角合成。Difix作为核心模型,能够在重建阶段清理伪训练视图,并在推理阶段去除残留伪影,从而显著提高3D表示的质量。"
}
}
},
{
"id": "https://huggingface.co/papers/2503.01743",
"title": "Phi-4-Mini Technical Report: Compact yet Powerful Multimodal Language Models via Mixture-of-LoRAs",
"url": "https://huggingface.co/papers/2503.01743",
"abstract": "We introduce Phi-4-Mini and Phi-4-Multimodal, compact yet highly capable language and multimodal models. Phi-4-Mini is a 3.8-billion-parameter language model trained on high-quality web and synthetic data, significantly outperforming recent open-source models of similar size and matching the performance of models twice its size on math and coding tasks requiring complex reasoning. This achievement is driven by a carefully curated synthetic data recipe emphasizing high-quality math and coding datasets. Compared to its predecessor, Phi-3.5-Mini, Phi-4-Mini features an expanded vocabulary size of 200K tokens to better support multilingual applications, as well as group query attention for more efficient long-sequence generation. Phi-4-Multimodal is a multimodal model that integrates text, vision, and speech/audio input modalities into a single model. Its novel modality extension approach leverages LoRA adapters and modality-specific routers to allow multiple inference modes combining various modalities without interference. For example, it now ranks first in the OpenASR leaderboard to date, although the LoRA component of the speech/audio modality has just 460 million parameters. Phi-4-Multimodal supports scenarios involving (vision + language), (vision + speech), and (speech/audio) inputs, outperforming larger vision-language and speech-language models on a wide range of tasks. Additionally, we experiment to further train Phi-4-Mini to enhance its reasoning capabilities. Despite its compact 3.8-billion-parameter size, this experimental version achieves reasoning performance on par with or surpassing significantly larger models, including DeepSeek-R1-Distill-Qwen-7B and DeepSeek-R1-Distill-Llama-8B.",
"score": 28,
"issue_id": 2511,
"pub_date": "2025-03-03",
"pub_date_card": {
"ru": "3 марта",
"en": "March 3",
"zh": "3月3日"
},
"hash": "fb054d6547a4a4fb",
"authors": [
"Abdelrahman Abouelenin",
"Atabak Ashfaq",
"Adam Atkinson",
"Hany Awadalla",
"Nguyen Bach",
"Jianmin Bao",
"Alon Benhaim",
"Martin Cai",
"Vishrav Chaudhary",
"Congcong Chen",
"Dong Chen",
"Dongdong Chen",
"Junkun Chen",
"Weizhu Chen",
"Yen-Chun Chen",
"Yi-ling Chen",
"Qi Dai",
"Xiyang Dai",
"Ruchao Fan",
"Mei Gao",
"Min Gao",
"Amit Garg",
"Abhishek Goswami",
"Junheng Hao",
"Amr Hendy",
"Yuxuan Hu",
"Xin Jin",
"Mahmoud Khademi",
"Dongwoo Kim",
"Young Jin Kim",
"Gina Lee",
"Jinyu Li",
"Yunsheng Li",
"Chen Liang",
"Xihui Lin",
"Zeqi Lin",
"Mengchen Liu",
"Yang Liu",
"Gilsinia Lopez",
"Chong Luo",
"Piyush Madan",
"Vadim Mazalov",
"Ali Mousavi",
"Anh Nguyen",
"Jing Pan",
"Daniel Perez-Becker",
"Jacob Platin",
"Thomas Portet",
"Kai Qiu",
"Bo Ren",
"Liliang Ren",
"Sambuddha Roy",
"Ning Shang",
"Yelong Shen",
"Saksham Singhal",
"Subhojit Som",
"Xia Song",
"Tetyana Sych",
"Praneetha Vaddamanu",
"Shuohang Wang",
"Yiming Wang",
"Zhenghao Wang",
"Haibin Wu",
"Haoran Xu",
"Weijian Xu",
"Yifan Yang",
"Ziyi Yang",
"Donghan Yu",
"Ishmam Zabir",
"Jianwen Zhang",
"Li Lyna Zhang",
"Yunan Zhang",
"Xiren Zhou"
],
"affiliations": [
"Microsoft"
],
"pdf_title_img": "assets/pdf/title_img/2503.01743.jpg",
"data": {
"categories": [
"#multimodal",
"#small_models",
"#data",
"#agi",
"#synthetic",
"#long_context",
"#optimization",
"#dataset",
"#training"
],
"emoji": "🧠",
"ru": {
"title": "Компактные модели с большими возможностями: прорыв в эффективности ИИ",
"desc": "Представлены две новые модели: Phi-4-Mini и Phi-4-Multimodal. Phi-4-Mini - это языковая модель с 3,8 миллиардами параметров, обученная на высококачественных веб-данных и синтетических данных, которая превосходит аналогичные модели в задачах математики и программирования. Phi-4-Multimodal - это мультимодальная модель, объединяющая текст, изображения и речь/аудио в единую систему с использованием LoRA-адаптеров. Обе модели демонстрируют высокую эффективность несмотря на свой компактный размер, превосходя более крупные аналоги в различных задачах."
},
"en": {
"title": "Compact Models, Superior Performance!",
"desc": "The paper presents Phi-4-Mini and Phi-4-Multimodal, two advanced models designed for language and multimodal tasks. Phi-4-Mini, with 3.8 billion parameters, excels in math and coding tasks by utilizing a high-quality synthetic data approach and an expanded vocabulary of 200K tokens. Phi-4-Multimodal integrates text, vision, and audio inputs, employing innovative techniques like LoRA adapters for efficient multi-modal processing. Both models demonstrate superior performance compared to larger counterparts, showcasing their effectiveness in complex reasoning and diverse input scenarios."
},
"zh": {
"title": "紧凑强大的多模态模型Phi-4系列",
"desc": "我们介绍了Phi-4-Mini和Phi-4-Multimodal这两种紧凑而强大的语言和多模态模型。Phi-4-Mini是一个拥有38亿参数的语言模型,经过高质量的网络和合成数据训练,在数学和编码任务中表现优于同类开源模型,并且在复杂推理方面与两倍于其规模的模型相当。相比于前身Phi-3.5-Mini,Phi-4-Mini扩展了词汇量,支持多语言应用,并采用了组查询注意力机制以提高长序列生成的效率。Phi-4-Multimodal则是一个多模态模型,能够将文本、视觉和语音/音频输入整合到一个模型中,支持多种推理模式,且在多个任务上超越了更大的视觉-语言和语音-语言模型。"
}
}
},
{
"id": "https://huggingface.co/papers/2502.18965",
"title": "OneRec: Unifying Retrieve and Rank with Generative Recommender and Iterative Preference Alignment",
"url": "https://huggingface.co/papers/2502.18965",
"abstract": "Recently, generative retrieval-based recommendation systems have emerged as a promising paradigm. However, most modern recommender systems adopt a retrieve-and-rank strategy, where the generative model functions only as a selector during the retrieval stage. In this paper, we propose OneRec, which replaces the cascaded learning framework with a unified generative model. To the best of our knowledge, this is the first end-to-end generative model that significantly surpasses current complex and well-designed recommender systems in real-world scenarios. Specifically, OneRec includes: 1) an encoder-decoder structure, which encodes the user's historical behavior sequences and gradually decodes the videos that the user may be interested in. We adopt sparse Mixture-of-Experts (MoE) to scale model capacity without proportionally increasing computational FLOPs. 2) a session-wise generation approach. In contrast to traditional next-item prediction, we propose a session-wise generation, which is more elegant and contextually coherent than point-by-point generation that relies on hand-crafted rules to properly combine the generated results. 3) an Iterative Preference Alignment module combined with Direct Preference Optimization (DPO) to enhance the quality of the generated results. Unlike DPO in NLP, a recommendation system typically has only one opportunity to display results for each user's browsing request, making it impossible to obtain positive and negative samples simultaneously. To address this limitation, We design a reward model to simulate user generation and customize the sampling strategy. Extensive experiments have demonstrated that a limited number of DPO samples can align user interest preferences and significantly improve the quality of generated results. We deployed OneRec in the main scene of Kuaishou, achieving a 1.6\\% increase in watch-time, which is a substantial improvement.",
"score": 17,
"issue_id": 2515,
"pub_date": "2025-02-26",
"pub_date_card": {
"ru": "26 февраля",
"en": "February 26",
"zh": "2月26日"
},
"hash": "21c5c80a138c98a0",
"authors": [
"Jiaxin Deng",
"Shiyao Wang",
"Kuo Cai",
"Lejian Ren",
"Qigen Hu",
"Weifeng Ding",
"Qiang Luo",
"Guorui Zhou"
],
"affiliations": [
"KuaiShou Inc. Beijing, China"
],
"pdf_title_img": "assets/pdf/title_img/2502.18965.jpg",
"data": {
"categories": [
"#alignment",
"#rlhf",
"#rag",
"#games",
"#training",
"#optimization"
],
"emoji": "🎥",
"ru": {
"title": "OneRec: Единая генеративная модель для революции в рекомендательных системах",
"desc": "OneRec - это новая система рекомендаций, использующая единую генеративную модель вместо каскадного подхода. Она включает в себя структуру кодировщик-декодировщик с разреженной смесью экспертов (MoE) для масштабирования возможностей модели. OneRec применяет поэтапную генерацию сессий и модуль итеративного выравнивания предпочтений с прямой оптимизацией предпочтений (DPO). Система показала значительное улучшение времени просмотра при развертывании на платформе Kuaishou."
},
"en": {
"title": "OneRec: Revolutionizing Recommendations with Generative Models",
"desc": "This paper introduces OneRec, a novel generative retrieval-based recommendation system that improves upon traditional retrieve-and-rank methods. Unlike existing systems that use generative models merely for selection, OneRec employs a unified generative model that encodes user behavior and generates personalized video recommendations in a session-wise manner. The model utilizes a sparse Mixture-of-Experts architecture to enhance capacity while maintaining efficiency, and incorporates an Iterative Preference Alignment module to optimize user preferences effectively. Experimental results show that OneRec significantly outperforms existing systems, leading to a notable increase in user engagement metrics such as watch-time."
},
"zh": {
"title": "OneRec:统一生成模型的推荐新范式",
"desc": "最近,基于生成检索的推荐系统成为一种有前景的范式。本文提出的OneRec模型,采用统一的生成模型,取代了传统的级联学习框架,能够在真实场景中显著超越现有复杂的推荐系统。OneRec包括编码-解码结构,能够有效编码用户的历史行为,并生成用户可能感兴趣的视频。此外,OneRec还引入了会话生成方法和迭代偏好对齐模块,提升了生成结果的质量,并在快手的实际应用中实现了观看时间的显著增加。"
}
}
},
{
"id": "https://huggingface.co/papers/2503.01688",
"title": "When an LLM is apprehensive about its answers -- and when its uncertainty is justified",
"url": "https://huggingface.co/papers/2503.01688",
"abstract": "Uncertainty estimation is crucial for evaluating Large Language Models (LLMs), particularly in high-stakes domains where incorrect answers result in significant consequences. Numerous approaches consider this problem, while focusing on a specific type of uncertainty, ignoring others. We investigate what estimates, specifically token-wise entropy and model-as-judge (MASJ), would work for multiple-choice question-answering tasks for different question topics. Our experiments consider three LLMs: Phi-4, Mistral, and Qwen of different sizes from 1.5B to 72B and 14 topics. While MASJ performs similarly to a random error predictor, the response entropy predicts model error in knowledge-dependent domains and serves as an effective indicator of question difficulty: for biology ROC AUC is 0.73. This correlation vanishes for the reasoning-dependent domain: for math questions ROC-AUC is 0.55. More principally, we found out that the entropy measure required a reasoning amount. Thus, data-uncertainty related entropy should be integrated within uncertainty estimates frameworks, while MASJ requires refinement. Moreover, existing MMLU-Pro samples are biased, and should balance required amount of reasoning for different subdomains to provide a more fair assessment of LLMs performance.",
"score": 15,
"issue_id": 2518,
"pub_date": "2025-03-03",
"pub_date_card": {
"ru": "3 марта",
"en": "March 3",
"zh": "3月3日"
},
"hash": "68429d7977c57eae",
"authors": [
"Petr Sychev",
"Andrey Goncharov",
"Daniil Vyazhev",
"Edvard Khalafyan",
"Alexey Zaytsev"
],
"affiliations": [],
"pdf_title_img": "assets/pdf/title_img/2503.01688.jpg",
"data": {
"categories": [
"#ethics",
"#hallucinations",
"#benchmark",
"#reasoning",
"#data",
"#multilingual"
],
"emoji": "🤖",
"ru": {
"title": "Энтропия ответов как индикатор неопределенности LLM в задачах с множественным выбором",
"desc": "Исследование посвящено оценке неопределенности в крупных языковых моделях (LLM) при решении задач с множественным выбором. Авторы сравнивают эффективность энтропии токенов и метода 'модель как судья' (MASJ) для различных тем вопросов. Эксперименты проводились на трех LLM разных размеров и 14 темах. Результаты показывают, что энтропия ответов хорошо предсказывает ошибки модели в областях, зависящих от знаний, но не в областях, требующих рассуждений."
},
"en": {
"title": "Enhancing Uncertainty Estimation in LLMs for Better Decision-Making",
"desc": "This paper explores how to measure uncertainty in Large Language Models (LLMs) when answering multiple-choice questions, which is important in critical areas where wrong answers can have serious effects. It compares two methods of uncertainty estimation: token-wise entropy and model-as-judge (MASJ), across various LLMs and topics. The findings reveal that while MASJ does not effectively predict errors, token-wise entropy is a better indicator of question difficulty, especially in knowledge-based subjects like biology. The study also highlights the need to refine MASJ and address biases in existing datasets to ensure fair evaluation of LLM performance across different reasoning requirements."
},
"zh": {
"title": "提升大型语言模型的不确定性估计",
"desc": "不确定性估计对于评估大型语言模型(LLMs)至关重要,尤其是在错误答案可能导致重大后果的高风险领域。本文探讨了不同类型的不确定性估计,特别是基于令牌的熵和模型作为评判者(MASJ),在多选题回答任务中的有效性。实验涉及三种不同规模的LLMs,结果显示,响应熵在知识依赖领域能够有效预测模型错误,而MASJ的表现类似于随机错误预测器。我们发现熵度量需要一定的推理量,因此数据不确定性相关的熵应纳入不确定性估计框架中,而MASJ则需要进一步改进。"
}
}
},
{
"id": "https://huggingface.co/papers/2503.01183",
"title": "DiffRhythm: Blazingly Fast and Embarrassingly Simple End-to-End Full-Length Song Generation with Latent Diffusion",
"url": "https://huggingface.co/papers/2503.01183",
"abstract": "Recent advancements in music generation have garnered significant attention, yet existing approaches face critical limitations. Some current generative models can only synthesize either the vocal track or the accompaniment track. While some models can generate combined vocal and accompaniment, they typically rely on meticulously designed multi-stage cascading architectures and intricate data pipelines, hindering scalability. Additionally, most systems are restricted to generating short musical segments rather than full-length songs. Furthermore, widely used language model-based methods suffer from slow inference speeds. To address these challenges, we propose DiffRhythm, the first latent diffusion-based song generation model capable of synthesizing complete songs with both vocal and accompaniment for durations of up to 4m45s in only ten seconds, maintaining high musicality and intelligibility. Despite its remarkable capabilities, DiffRhythm is designed to be simple and elegant: it eliminates the need for complex data preparation, employs a straightforward model structure, and requires only lyrics and a style prompt during inference. Additionally, its non-autoregressive structure ensures fast inference speeds. This simplicity guarantees the scalability of DiffRhythm. Moreover, we release the complete training code along with the pre-trained model on large-scale data to promote reproducibility and further research.",
"score": 15,
"issue_id": 2516,
"pub_date": "2025-03-03",
"pub_date_card": {
"ru": "3 марта",
"en": "March 3",
"zh": "3月3日"
},
"hash": "0370c6364610fd8e",
"authors": [
"Ziqian Ning",
"Huakang Chen",
"Yuepeng Jiang",
"Chunbo Hao",
"Guobin Ma",
"Shuai Wang",
"Jixun Yao",
"Lei Xie"
],
"affiliations": [
"Northwestern Polytechnical University",
"Shenzhen Research Institute of Big Data, The Chinese University of Hong Kong, Shenzhen (CUHK-Shenzhen), China"
],
"pdf_title_img": "assets/pdf/title_img/2503.01183.jpg",
"data": {
"categories": [
"#diffusion",
"#inference",
"#dataset",
"#open_source",
"#audio"
],
"emoji": "🎵",
"ru": {
"title": "DiffRhythm: Быстрая генерация полных песен с помощью латентной диффузии",
"desc": "DiffRhythm - это первая модель генерации песен на основе латентной диффузии, способная синтезировать полные песни с вокалом и аккомпанементом длительностью до 4м45с всего за десять секунд. Модель имеет простую структуру, не требует сложной подготовки данных и использует только текст песни и стилевую подсказку при инференсе. Благодаря неавторегрессивной структуре, DiffRhythm обеспечивает высокую скорость генерации. Авторы опубликовали полный код обучения и предобученную модель для воспроизводимости результатов и дальнейших исследований."
},
"en": {
"title": "DiffRhythm: Fast and Scalable Song Generation with Latent Diffusion",
"desc": "This paper introduces DiffRhythm, a novel music generation model that utilizes latent diffusion techniques to create full-length songs with both vocal and accompaniment tracks. Unlike existing models that are limited to short segments or require complex architectures, DiffRhythm simplifies the process by needing only lyrics and a style prompt for song generation. It achieves high musical quality and intelligibility while significantly improving inference speed, generating songs in just ten seconds. The authors also emphasize the model's scalability and reproducibility by providing the complete training code and pre-trained model for further research."
},
"zh": {
"title": "DiffRhythm:快速生成完整歌曲的创新模型",
"desc": "本论文介绍了一种新的音乐生成模型DiffRhythm,它能够在短短十秒内合成完整的歌曲,包括人声和伴奏,时长可达4分45秒。与现有模型相比,DiffRhythm采用潜在扩散技术,避免了复杂的数据准备和多阶段架构,确保了高效的推理速度。该模型只需歌词和风格提示即可生成音乐,具有良好的可扩展性。我们还发布了完整的训练代码和预训练模型,以促进研究的可重复性和进一步发展。"
}
}
},
{
"id": "https://huggingface.co/papers/2503.01496",
"title": "Liger: Linearizing Large Language Models to Gated Recurrent Structures",
"url": "https://huggingface.co/papers/2503.01496",
"abstract": "Transformers with linear recurrent modeling offer linear-time training and constant-memory inference. Despite their demonstrated efficiency and performance, pretraining such non-standard architectures from scratch remains costly and risky. The linearization of large language models (LLMs) transforms pretrained standard models into linear recurrent structures, enabling more efficient deployment. However, current linearization methods typically introduce additional feature map modules that require extensive fine-tuning and overlook the gating mechanisms used in state-of-the-art linear recurrent models. To address these issues, this paper presents Liger, short for Linearizing LLMs to gated recurrent structures. Liger is a novel approach for converting pretrained LLMs into gated linear recurrent models without adding extra parameters. It repurposes the pretrained key matrix weights to construct diverse gating mechanisms, facilitating the formation of various gated recurrent structures while avoiding the need to train additional components from scratch. Using lightweight fine-tuning with Low-Rank Adaptation (LoRA), Liger restores the performance of the linearized gated recurrent models to match that of the original LLMs. Additionally, we introduce Liger Attention, an intra-layer hybrid attention mechanism, which significantly recovers 93\\% of the Transformer-based LLM at 0.02\\% pre-training tokens during the linearization process, achieving competitive results across multiple benchmarks, as validated on models ranging from 1B to 8B parameters. Code is available at https://github.com/OpenSparseLLMs/Linearization.",
"score": 12,
"issue_id": 2514,
"pub_date": "2025-03-03",
"pub_date_card": {
"ru": "3 марта",
"en": "March 3",
"zh": "3月3日"
},
"hash": "d5ca7ef45c0e90c9",
"authors": [
"Disen Lan",
"Weigao Sun",
"Jiaxi Hu",
"Jusen Du",
"Yu Cheng"
],
"affiliations": [
"Nanjing University",
"Shanghai AI Laboratory",
"South China University of Technology",
"The Chinese University of Hong Kong",
"The Hong Kong University of Science and Technology (Guangzhou)"
],
"pdf_title_img": "assets/pdf/title_img/2503.01496.jpg",
"data": {
"categories": [
"#architecture",
"#training",
"#optimization",
"#benchmark"
],
"emoji": "🔢",
"ru": {
"title": "Эффективная линеаризация больших языковых моделей",
"desc": "Данная статья представляет новый метод Liger для линеаризации больших языковых моделей (LLM) в гейтированные линейно-рекуррентные структуры. Liger преобразует предобученные LLM без добавления дополнительных параметров, используя существующие веса ключевой матрицы для создания различных механизмов гейтирования. Метод применяет легковесную донастройку с помощью Low-Rank Adaptation (LoRA) для восстановления производительности линеаризованных моделей. Авторы также представляют Liger Attention - гибридный механизм внимания, который значительно улучшает эффективность линеаризации."
},
"en": {
"title": "Liger: Efficiently Transforming LLMs into Gated Linear Recurrent Models",
"desc": "This paper introduces Liger, a method for transforming pretrained large language models (LLMs) into gated linear recurrent models. Liger efficiently repurposes existing key matrix weights to create diverse gating mechanisms without adding extra parameters, thus avoiding the costly process of training new components from scratch. The approach employs lightweight fine-tuning techniques, specifically Low-Rank Adaptation (LoRA), to maintain the performance of the linearized models comparable to the original LLMs. Additionally, Liger incorporates a novel intra-layer hybrid attention mechanism, Liger Attention, which enhances the model's efficiency while achieving competitive results across various benchmarks."
},
"zh": {
"title": "Liger:高效转换预训练模型的创新方法",
"desc": "本文提出了一种名为Liger的方法,用于将预训练的大型语言模型(LLMs)转换为带门控的线性递归模型,而无需增加额外的参数。Liger通过重新利用预训练的关键矩阵权重,构建多样的门控机制,从而形成不同的门控递归结构。该方法使用轻量级的微调技术(如低秩适应LoRA),使线性化的门控递归模型的性能恢复到与原始LLMs相当的水平。此外,Liger Attention作为一种层内混合注意力机制,在线性化过程中显著恢复了93%的Transformer基础LLM的性能。"
}
}
},
{
"id": "https://huggingface.co/papers/2503.00501",
"title": "Qilin: A Multimodal Information Retrieval Dataset with APP-level User Sessions",
"url": "https://huggingface.co/papers/2503.00501",
"abstract": "User-generated content (UGC) communities, especially those featuring multimodal content, improve user experiences by integrating visual and textual information into results (or items). The challenge of improving user experiences in complex systems with search and recommendation (S\\&R) services has drawn significant attention from both academia and industry these years. However, the lack of high-quality datasets has limited the research progress on multimodal S\\&R. To address the growing need for developing better S\\&R services, we present a novel multimodal information retrieval dataset in this paper, namely Qilin. The dataset is collected from Xiaohongshu, a popular social platform with over 300 million monthly active users and an average search penetration rate of over 70\\%. In contrast to existing datasets, Qilin offers a comprehensive collection of user sessions with heterogeneous results like image-text notes, video notes, commercial notes, and direct answers, facilitating the development of advanced multimodal neural retrieval models across diverse task settings. To better model user satisfaction and support the analysis of heterogeneous user behaviors, we also collect extensive APP-level contextual signals and genuine user feedback. Notably, Qilin contains user-favored answers and their referred results for search requests triggering the Deep Query Answering (DQA) module. This allows not only the training \\& evaluation of a Retrieval-augmented Generation (RAG) pipeline, but also the exploration of how such a module would affect users' search behavior. Through comprehensive analysis and experiments, we provide interesting findings and insights for further improving S\\&R systems. We hope that Qilin will significantly contribute to the advancement of multimodal content platforms with S\\&R services in the future.",
"score": 10,
"issue_id": 2513,
"pub_date": "2025-03-01",
"pub_date_card": {
"ru": "1 марта",
"en": "March 1",
"zh": "3月1日"
},
"hash": "ed7fc8625b068597",
"authors": [
"Jia Chen",
"Qian Dong",
"Haitao Li",
"Xiaohui He",
"Yan Gao",
"Shaosheng Cao",
"Yi Wu",
"Ping Yang",
"Chen Xu",
"Yao Hu",
"Qingyao Ai",
"Yiqun Liu"
],
"affiliations": [
"Tsinghua University",
"Xiaohongshu Inc."
],
"pdf_title_img": "assets/pdf/title_img/2503.00501.jpg",
"data": {
"categories": [
"#multimodal",
"#dataset",
"#rag"
],
"emoji": "🔍",
"ru": {
"title": "Qilin: мультимодальный датасет для улучшения поиска и рекомендаций",
"desc": "Представлен новый набор данных Qilin для мультимодального информационного поиска, собранный на платформе Xiaohongshu. Датасет включает пользовательские сессии с разнородными результатами (изображения, видео, коммерческие заметки) и контекстуальными сигналами. Qilin позволяет обучать и оценивать нейросетевые модели поиска и рекомендаций, а также исследовать влияние модуля глубоких ответов на запросы. Авторы надеются, что Qilin внесет значительный вклад в развитие мультимодальных платформ с поисковыми сервисами."
},
"en": {
"title": "Enhancing User Experiences with Qilin: A Multimodal Dataset for S&R Services",
"desc": "This paper introduces Qilin, a new multimodal information retrieval dataset designed to enhance search and recommendation (S&R) services in user-generated content communities. Qilin is unique as it includes diverse user sessions with various content types, such as image-text notes and videos, which can help in developing advanced multimodal neural retrieval models. Additionally, the dataset captures user feedback and contextual signals, allowing researchers to analyze user satisfaction and behavior more effectively. The findings from this research aim to improve S&R systems and contribute to the evolution of multimodal content platforms."
},
"zh": {
"title": "推动多模态搜索与推荐服务的进步",
"desc": "本文介绍了一个新的多模态信息检索数据集Qilin,旨在改善用户在复杂系统中的搜索和推荐体验。Qilin数据集来源于小红书,包含多种类型的用户会话,如图文笔记、视频笔记和商业笔记,适用于多种任务设置。该数据集还收集了丰富的应用级上下文信号和真实用户反馈,以更好地建模用户满意度。通过对Qilin的分析和实验,本文提供了有趣的发现,期望能推动多模态内容平台的搜索和推荐服务的发展。"
}
}
},
{
"id": "https://huggingface.co/papers/2503.01307",
"title": "Cognitive Behaviors that Enable Self-Improving Reasoners, or, Four Habits of Highly Effective STaRs",
"url": "https://huggingface.co/papers/2503.01307",
"abstract": "Test-time inference has emerged as a powerful paradigm for enabling language models to ``think'' longer and more carefully about complex challenges, much like skilled human experts. While reinforcement learning (RL) can drive self-improvement in language models on verifiable tasks, some models exhibit substantial gains while others quickly plateau. For instance, we find that Qwen-2.5-3B far exceeds Llama-3.2-3B under identical RL training for the game of Countdown. This discrepancy raises a critical question: what intrinsic properties enable effective self-improvement? We introduce a framework to investigate this question by analyzing four key cognitive behaviors -- verification, backtracking, subgoal setting, and backward chaining -- that both expert human problem solvers and successful language models employ. Our study reveals that Qwen naturally exhibits these reasoning behaviors, whereas Llama initially lacks them. In systematic experimentation with controlled behavioral datasets, we find that priming Llama with examples containing these reasoning behaviors enables substantial improvements during RL, matching or exceeding Qwen's performance. Importantly, the presence of reasoning behaviors, rather than correctness of answers, proves to be the critical factor -- models primed with incorrect solutions containing proper reasoning patterns achieve comparable performance to those trained on correct solutions. Finally, leveraging continued pretraining with OpenWebMath data, filtered to amplify reasoning behaviors, enables the Llama model to match Qwen's self-improvement trajectory. Our findings establish a fundamental relationship between initial reasoning behaviors and the capacity for improvement, explaining why some language models effectively utilize additional computation while others plateau.",
"score": 10,
"issue_id": 2511,
"pub_date": "2025-03-03",
"pub_date_card": {
"ru": "3 марта",
"en": "March 3",
"zh": "3月3日"
},
"hash": "fa966620baa8c013",
"authors": [
"Kanishk Gandhi",
"Ayush Chakravarthy",
"Anikait Singh",
"Nathan Lile",
"Noah D. Goodman"
],
"affiliations": [
"Stanford University",
"SynthLabs"
],
"pdf_title_img": "assets/pdf/title_img/2503.01307.jpg",
"data": {
"categories": [
"#training",
"#optimization",
"#rl",
"#reasoning"
],
"emoji": "🧠",
"ru": {
"title": "Когнитивные навыки - ключ к самосовершенствованию языковых моделей",
"desc": "Исследование показывает, что способность языковых моделей к самосовершенствованию зависит от наличия у них определенных когнитивных поведений, таких как верификация, бэктрекинг, постановка подцелей и обратное планирование. Эксперименты выявили, что модель Qwen изначально обладает этими навыками, в то время как Llama нет. Прайминг Llama примерами, содержащими эти поведения, позволил значительно улучшить ее производительность при обучении с подкреплением. Важно отметить, что наличие правильных рассуждений оказалось более критичным фактором, чем корректность ответов."
},
"en": {
"title": "Unlocking Self-Improvement in Language Models through Reasoning",
"desc": "This paper explores how language models can improve their problem-solving abilities through a process called test-time inference, similar to human experts. It highlights the differences in performance between two models, Qwen-2.5-3B and Llama-3.2-3B, when trained with reinforcement learning (RL) on the game Countdown. The authors identify four cognitive behaviors—verification, backtracking, subgoal setting, and backward chaining—that are crucial for effective self-improvement in these models. They demonstrate that enhancing Llama with examples of these reasoning behaviors can significantly boost its performance, suggesting that the ability to reason is more important than simply providing correct answers."
},
"zh": {
"title": "推理行为是模型自我提升的关键",
"desc": "本文探讨了语言模型在复杂任务中自我改进的能力,特别是通过强化学习(RL)实现的自我提升。研究发现,不同模型在相同的RL训练下表现差异显著,例如Qwen-2.5-3B在游戏Countdown中远超Llama-3.2-3B。我们分析了四种关键的认知行为:验证、回溯、子目标设定和逆向链推理,发现Qwen自然展现了这些推理行为,而Llama则最初缺乏。通过对Llama进行示例引导,能够显著提升其在RL中的表现,证明了推理行为的存在是模型自我改进的关键因素。"
}
}
},
{
"id": "https://huggingface.co/papers/2503.00714",
"title": "Speculative Ad-hoc Querying",
"url": "https://huggingface.co/papers/2503.00714",
"abstract": "Analyzing large datasets requires responsive query execution, but executing SQL queries on massive datasets can be slow. This paper explores whether query execution can begin even before the user has finished typing, allowing results to appear almost instantly. We propose SpeQL, a system that leverages Large Language Models (LLMs) to predict likely queries based on the database schema, the user's past queries, and their incomplete query. Since exact query prediction is infeasible, SpeQL speculates on partial queries in two ways: 1) it predicts the query structure to compile and plan queries in advance, and 2) it precomputes smaller temporary tables that are much smaller than the original database, but are still predicted to contain all information necessary to answer the user's final query. Additionally, SpeQL continuously displays results for speculated queries and subqueries in real time, aiding exploratory analysis. A utility/user study showed that SpeQL improved task completion time, and participants reported that its speculative display of results helped them discover patterns in the data more quickly. In the study, SpeQL improves user's query latency by up to 289times and kept the overhead reasonable, at 4$ per hour.",
"score": 8,
"issue_id": 2514,
"pub_date": "2025-03-02",
"pub_date_card": {
"ru": "2 марта",
"en": "March 2",
"zh": "3月2日"
},
"hash": "1b0459b56fdb6894",
"authors": [
"Haoyu Li",
"Srikanth Kandula",
"Maria Angels de Luis Balaguer",
"Aditya Akella",
"Venkat Arun"
],
"affiliations": [
"Amazon Web Services",
"Microsoft Research",
"The University of Texas at Austin"
],
"pdf_title_img": "assets/pdf/title_img/2503.00714.jpg",
"data": {
"categories": [
"#dataset",
"#data",
"#benchmark"
],
"emoji": "⚡",
"ru": {
"title": "Молниеносные SQL-запросы с помощью предиктивной аналитики",
"desc": "Статья представляет систему SpeQL, использующую большие языковые модели для предсказания SQL-запросов пользователя. SpeQL предугадывает структуру запроса и предварительно вычисляет временные таблицы, что позволяет начать выполнение запроса до его завершения пользователем. Система непрерывно отображает результаты предполагаемых запросов в реальном времени, помогая в исследовательском анализе данных. Исследование показало, что SpeQL значительно сокращает время выполнения задач и помогает пользователям быстрее обнаруживать закономерности в данных."
},
"en": {
"title": "Instant Query Results with SpeQL!",
"desc": "This paper introduces SpeQL, a novel system designed to enhance the speed of SQL query execution on large datasets. By utilizing Large Language Models (LLMs), SpeQL predicts user queries even before they are fully typed, allowing for near-instantaneous results. It employs two main strategies: predicting the structure of queries for pre-compilation and creating smaller temporary tables that contain essential data for answering the final query. A user study demonstrated that SpeQL significantly reduced query latency and helped users identify data patterns more efficiently during exploratory analysis."
},
"zh": {
"title": "SpeQL:让查询更快的智能预测系统",
"desc": "本论文探讨了如何在用户输入SQL查询时,提前开始执行查询,以加快大数据集的查询响应速度。我们提出了SpeQL系统,利用大型语言模型(LLMs)根据数据库模式、用户的历史查询和不完整查询来预测可能的查询。SpeQL通过预测查询结构和预计算小型临时表来处理部分查询,从而在用户完成查询之前提供实时结果。研究表明,SpeQL显著提高了用户的查询速度,并帮助用户更快地发现数据中的模式。"
}
}
},
{
"id": "https://huggingface.co/papers/2503.00784",
"title": "DuoDecoding: Hardware-aware Heterogeneous Speculative Decoding with Dynamic Multi-Sequence Drafting",
"url": "https://huggingface.co/papers/2503.00784",
"abstract": "Large language models (LLMs) exhibit exceptional performance across a wide range of tasks; however, their token-by-token autoregressive generation process significantly hinders inference speed. Speculative decoding presents a promising draft-then-verify framework that reduces generation latency while maintaining output distribution fidelity. Nevertheless, the draft model introduces additional computational overhead, becoming a performance bottleneck and increasing the time to first token (TTFT). Previous approaches to mitigate draft model overhead have primarily relied on heuristics and generally failed to match the quality of the draft language models. To address these challenges, we propose DuoDecoding, a novel approach that strategically deploys the draft and target models on the CPU and GPU respectively, enabling parallel decoding while preserving draft quality. Our method incorporates a hardware-aware optimal draft budget to minimize idle times and employs dynamic multi-sequence drafting to enhance draft quality. Extensive experiments across seven tasks show that DuoDecoding achieves up to 2.61x speedup in generation latency, while reducing TTFT to 83% of that in conventional speculative decoding. The Code is available at https://github.com/KaiLv69/DuoDecoding.",
"score": 7,
"issue_id": 2510,
"pub_date": "2025-03-02",
"pub_date_card": {
"ru": "2 марта",
"en": "March 2",
"zh": "3月2日"
},
"hash": "b4870a0e44c3cc55",
"authors": [
"Kai Lv",
"Honglin Guo",
"Qipeng Guo",
"Xipeng Qiu"
],
"affiliations": [
"Fudan University",
"Shanghai AI Laboratory"
],
"pdf_title_img": "assets/pdf/title_img/2503.00784.jpg",
"data": {
"categories": [
"#inference",
"#training",
"#optimization"
],
"emoji": "🚀",
"ru": {
"title": "DuoDecoding: Параллельное ускорение языковых моделей",
"desc": "Статья представляет новый метод ускорения генерации текста большими языковыми моделями (LLM) под названием DuoDecoding. Этот подход использует параллельное декодирование на CPU и GPU, оптимизируя время генерации первого токена и общую латентность. DuoDecoding применяет аппаратно-ориентированный оптимальный бюджет черновика и динамическое многопоследовательное черновое декодирование для повышения качества. Эксперименты показали значительное ускорение генерации по сравнению с обычным спекулятивным декодированием."
},
"en": {
"title": "DuoDecoding: Speeding Up Text Generation with Smart Model Deployment",
"desc": "This paper introduces DuoDecoding, a new method to improve the speed of generating text with large language models (LLMs) while keeping the quality high. It uses a draft-then-verify approach, where a draft model quickly generates initial text, and a target model refines it, but does so in a way that reduces the time it takes to start generating text. By using both CPU and GPU for different parts of the process, DuoDecoding allows for faster and more efficient decoding. The results show that this method can significantly speed up text generation without sacrificing quality, achieving a notable improvement in performance across various tasks."
},
"zh": {
"title": "DuoDecoding:加速生成的新方法",
"desc": "大型语言模型(LLMs)在多种任务中表现出色,但其逐字自回归生成过程显著影响推理速度。推测解码提供了一种有前景的草稿-验证框架,能够减少生成延迟,同时保持输出分布的准确性。我们提出的DuoDecoding方法通过在CPU和GPU上分别部署草稿模型和目标模型,实现了并行解码,提升了生成效率。实验结果表明,DuoDecoding在生成延迟上实现了最高2.61倍的加速,同时将首次生成时间缩短至传统推测解码的83%。"
}
}
},
{
"id": "https://huggingface.co/papers/2503.01506",
"title": "SampleMix: A Sample-wise Pre-training Data Mixing Strategey by Coordinating Data Quality and Diversity",
"url": "https://huggingface.co/papers/2503.01506",
"abstract": "Existing pretraining data mixing methods for large language models (LLMs) typically follow a domain-wise methodology, a top-down process that first determines domain weights and then performs uniform data sampling across each domain. However, these approaches neglect significant inter-domain overlaps and commonalities, failing to control the global diversity of the constructed training dataset. Further, uniform sampling within domains ignores fine-grained sample-specific features, potentially leading to suboptimal data distribution. To address these shortcomings, we propose a novel sample-wise data mixture approach based on a bottom-up paradigm. This method performs global cross-domain sampling by systematically evaluating the quality and diversity of each sample, thereby dynamically determining the optimal domain distribution. Comprehensive experiments across multiple downstream tasks and perplexity assessments demonstrate that SampleMix surpasses existing domain-based methods. Meanwhile, SampleMix requires 1.4x to 2.1x training steps to achieves the baselines' performance, highlighting the substantial potential of SampleMix to optimize pre-training data.",
"score": 6,
"issue_id": 2517,
"pub_date": "2025-03-03",
"pub_date_card": {
"ru": "3 марта",
"en": "March 3",
"zh": "3月3日"
},
"hash": "018cc621eb1ee12b",
"authors": [
"Xiangyu Xi",
"Deyang Kong",
"Jian Yang",
"Jiawei Yang",
"Zhengyu Chen",
"Wei Wang",
"Jingang Wang",
"Xunliang Cai",
"Shikun Zhang",
"Wei Ye"
],
"affiliations": [
"Meituan Group, Beijing, China",
"National Engineering Research Center for Software Engineering, Peking University, Beijing, China"
],
"pdf_title_img": "assets/pdf/title_img/2503.01506.jpg",
"data": {
"categories": [
"#transfer_learning",
"#training",
"#optimization",
"#data"
],
"emoji": "🔀",
"ru": {
"title": "SampleMix: революция в смешивании данных для LLM",
"desc": "В статье представлен новый подход к смешиванию предобучающих данных для больших языковых моделей (LLM), названный SampleMix. В отличие от традиционных методов, основанных на доменах, SampleMix использует выборку на уровне отдельных образцов, оценивая их качество и разнообразие. Этот метод позволяет динамически определять оптимальное распределение доменов и учитывать межdomенные пересечения. Эксперименты показали, что SampleMix превосходит существующие методы, основанные на доменах, хотя и требует больше шагов обучения."
},
"en": {
"title": "Revolutionizing Data Mixing for Better Language Model Training",
"desc": "This paper introduces SampleMix, a new method for mixing pretraining data for large language models (LLMs). Unlike traditional domain-wise approaches that sample uniformly within predefined domains, SampleMix uses a bottom-up strategy that evaluates the quality and diversity of individual samples across domains. This allows for a more dynamic and optimal distribution of training data, addressing the limitations of inter-domain overlaps and sample-specific features. Experimental results show that SampleMix not only outperforms existing methods but also requires fewer training steps to achieve comparable performance."
},
"zh": {
"title": "样本级数据混合,优化预训练数据的未来",
"desc": "现有的大型语言模型预训练数据混合方法通常采用领域导向的方法,先确定领域权重,再在每个领域内进行均匀数据采样。然而,这些方法忽视了领域之间的重要重叠和共性,未能有效控制训练数据集的全球多样性。此外,领域内的均匀采样忽略了样本特定的细微特征,可能导致数据分布不理想。为了解决这些问题,我们提出了一种基于自下而上的新型样本级数据混合方法,能够通过系统评估每个样本的质量和多样性,动态确定最佳领域分布。"
}
}
},
{
"id": "https://huggingface.co/papers/2503.01370",
"title": "Kiss3DGen: Repurposing Image Diffusion Models for 3D Asset Generation",
"url": "https://huggingface.co/papers/2503.01370",
"abstract": "Diffusion models have achieved great success in generating 2D images. However, the quality and generalizability of 3D content generation remain limited. State-of-the-art methods often require large-scale 3D assets for training, which are challenging to collect. In this work, we introduce Kiss3DGen (Keep It Simple and Straightforward in 3D Generation), an efficient framework for generating, editing, and enhancing 3D objects by repurposing a well-trained 2D image diffusion model for 3D generation. Specifically, we fine-tune a diffusion model to generate ''3D Bundle Image'', a tiled representation composed of multi-view images and their corresponding normal maps. The normal maps are then used to reconstruct a 3D mesh, and the multi-view images provide texture mapping, resulting in a complete 3D model. This simple method effectively transforms the 3D generation problem into a 2D image generation task, maximizing the utilization of knowledge in pretrained diffusion models. Furthermore, we demonstrate that our Kiss3DGen model is compatible with various diffusion model techniques, enabling advanced features such as 3D editing, mesh and texture enhancement, etc. Through extensive experiments, we demonstrate the effectiveness of our approach, showcasing its ability to produce high-quality 3D models efficiently.",
"score": 6,
"issue_id": 2513,
"pub_date": "2025-03-03",
"pub_date_card": {
"ru": "3 марта",
"en": "March 3",
"zh": "3月3日"
},
"hash": "3decc9fe2b6f6e32",
"pdf_title_img": "img/title_stub.png",
"data": {
"categories": [
"#cv",
"#diffusion",
"#3d"
],
"emoji": "🎨",
"ru": {
"title": "Простая и эффективная 3D-генерация на основе 2D-диффузии",
"desc": "Статья представляет Kiss3DGen - эффективный фреймворк для генерации, редактирования и улучшения 3D-объектов с использованием предобученной модели диффузии для 2D-изображений. Метод основан на дообучении диффузионной модели для генерации 'Пакетного 3D-изображения', состоящего из мультиракурсных изображений и соответствующих карт нормалей. Затем карты нормалей используются для реконструкции 3D-меша, а мультиракурсные изображения обеспечивают текстурирование, что в результате дает полную 3D-модель. Авторы демонстрируют, что их подход совместим с различными техниками диффузионных моделей и позволяет эффективно создавать качественные 3D-модели."
},
"en": {
"title": "Kiss3DGen: Simplifying 3D Generation with 2D Diffusion Models",
"desc": "This paper presents Kiss3DGen, a novel framework that simplifies the process of generating and enhancing 3D objects by leveraging existing 2D image diffusion models. The approach involves fine-tuning a diffusion model to create a '3D Bundle Image', which consists of multiple views and normal maps that are essential for 3D reconstruction. By transforming the 3D generation challenge into a 2D image task, the method maximizes the use of knowledge from pretrained models, making it more efficient. The results show that Kiss3DGen not only generates high-quality 3D models but also supports advanced features like editing and texture enhancement."
},
"zh": {
"title": "简单高效的三维生成方法",
"desc": "扩散模型在生成二维图像方面取得了巨大成功,但在三维内容生成的质量和通用性上仍然有限。现有的先进方法通常需要大量的三维资产进行训练,这些资产难以收集。我们提出了Kiss3DGen(简单直接的三维生成),这是一个高效的框架,通过重新利用经过良好训练的二维图像扩散模型来生成、编辑和增强三维物体。该方法将三维生成问题转化为二维图像生成任务,最大化利用预训练扩散模型中的知识,能够有效生成高质量的三维模型。"
}
}
},
{
"id": "https://huggingface.co/papers/2502.18890",
"title": "From Hours to Minutes: Lossless Acceleration of Ultra Long Sequence Generation up to 100K Tokens",
"url": "https://huggingface.co/papers/2502.18890",
"abstract": "Generating ultra-long sequences with large language models (LLMs) has become increasingly crucial but remains a highly time-intensive task, particularly for sequences up to 100K tokens. While traditional speculative decoding methods exist, simply extending their generation limits fails to accelerate the process and can be detrimental. Through an in-depth analysis, we identify three major challenges hindering efficient generation: frequent model reloading, dynamic key-value (KV) management and repetitive generation. To address these issues, we introduce TOKENSWIFT, a novel framework designed to substantially accelerate the generation process of ultra-long sequences while maintaining the target model's inherent quality. Experimental results demonstrate that TOKENSWIFT achieves over 3 times speedup across models of varying scales (1.5B, 7B, 8B, 14B) and architectures (MHA, GQA). This acceleration translates to hours of time savings for ultra-long sequence generation, establishing TOKENSWIFT as a scalable and effective solution at unprecedented lengths. Code can be found at https://github.com/bigai-nlco/TokenSwift.",
"score": 5,
"issue_id": 2517,
"pub_date": "2025-02-26",
"pub_date_card": {
"ru": "26 февраля",
"en": "February 26",
"zh": "2月26日"
},
"hash": "d07c05abfac49ecc",
"authors": [
"Tong Wu",
"Junzhe Shen",
"Zixia Jia",
"Yuxuan Wang",
"Zilong Zheng"
],
"affiliations": [
"NLCo Lab, BIGAI LUMIA Lab, Shanghai Jiao Tong University"
],
"pdf_title_img": "assets/pdf/title_img/2502.18890.jpg",
"data": {
"categories": [
"#training",
"#architecture",
"#long_context",
"#inference",
"#optimization"
],
"emoji": "⚡",
"ru": {
"title": "TOKENSWIFT: революция в скорости генерации сверхдлинных текстов",
"desc": "Исследователи представили TOKENSWIFT - новую систему для ускорения генерации сверхдлинных последовательностей большими языковыми моделями (LLM). Они выявили три основные проблемы, препятствующие эффективной генерации: частая перезагрузка модели, динамическое управление ключами-значениями и повторяющаяся генерация. TOKENSWIFT решает эти проблемы, позволяя ускорить процесс генерации в 3 раза для моделей различных масштабов и архитектур. Это существенно сокращает время генерации сверхдлинных последовательностей, сохраняя при этом качество целевой модели."
},
"en": {
"title": "Accelerating Ultra-Long Sequence Generation with TOKENSWIFT",
"desc": "This paper presents TOKENSWIFT, a new framework aimed at speeding up the generation of ultra-long sequences using large language models (LLMs). The authors identify key challenges such as model reloading, dynamic key-value management, and repetitive generation that slow down the process. By addressing these issues, TOKENSWIFT achieves over three times the speed of traditional methods while preserving the quality of the generated text. Experimental results show that this framework is effective across various model sizes and architectures, making it a significant advancement in the field of sequence generation."
},
"zh": {
"title": "TOKENSWIFT:加速超长序列生成的创新框架",
"desc": "生成超长序列对于大型语言模型(LLMs)变得越来越重要,但这一过程通常非常耗时,尤其是对于长达10万标记的序列。传统的推测解码方法在延长生成限制时并未加速过程,反而可能造成负面影响。我们通过深入分析,识别出影响高效生成的三个主要挑战:频繁的模型重载、动态键值(KV)管理和重复生成。为了解决这些问题,我们提出了TOKENSWIFT,一个新框架,旨在显著加快超长序列的生成过程,同时保持目标模型的固有质量。"
}
}
},
{
"id": "https://huggingface.co/papers/2503.01807",
"title": "Large-Scale Data Selection for Instruction Tuning",
"url": "https://huggingface.co/papers/2503.01807",
"abstract": "Selecting high-quality training data from a larger pool is a crucial step when instruction-tuning language models, as carefully curated datasets often produce models that outperform those trained on much larger, noisier datasets. Automated data selection approaches for instruction-tuning are typically tested by selecting small datasets (roughly 10k samples) from small pools (100-200k samples). However, popular deployed instruction-tuned models often train on hundreds of thousands to millions of samples, subsampled from even larger data pools. We present a systematic study of how well data selection methods scale to these settings, selecting up to 2.5M samples from pools of up to 5.8M samples and evaluating across 7 diverse tasks. We show that many recently proposed methods fall short of random selection in this setting (while using more compute), and even decline in performance when given access to larger pools of data to select over. However, we find that a variant of representation-based data selection (RDS+), which uses weighted mean pooling of pretrained LM hidden states, consistently outperforms more complex methods across all settings tested -- all whilst being more compute-efficient. Our findings highlight that the scaling properties of proposed automated selection methods should be more closely examined. We release our code, data, and models at https://github.com/hamishivi/automated-instruction-selection.",
"score": 5,
"issue_id": 2511,
"pub_date": "2025-03-03",
"pub_date_card": {
"ru": "3 марта",
"en": "March 3",
"zh": "3月3日"
},
"hash": "8bbc980a9ef867f7",
"authors": [
"Hamish Ivison",
"Muru Zhang",
"Faeze Brahman",
"Pang Wei Koh",
"Pradeep Dasigi"
],
"affiliations": [
"Allen Institute for AI",
"University of Southern California",
"University of Washington"
],
"pdf_title_img": "assets/pdf/title_img/2503.01807.jpg",
"data": {
"categories": [
"#data",
"#open_source",
"#optimization",
"#dataset",
"#training"
],
"emoji": "🔍",
"ru": {
"title": "Эффективный отбор данных для обучения языковых моделей: меньше значит больше",
"desc": "Эта статья исследует методы автоматического отбора данных для инструктивной настройки языковых моделей. Авторы проводят систематическое изучение эффективности различных методов при масштабировании до больших объемов данных, выбирая до 2,5 миллионов образцов из пулов до 5,8 миллионов. Результаты показывают, что многие недавно предложенные методы уступают случайному отбору в этих условиях, однако вариант метода отбора на основе представлений (RDS+) превосходит более сложные подходы. Исследование подчеркивает важность тщательного анализа масштабируемости методов автоматического отбора данных."
},
"en": {
"title": "Quality Over Quantity: Smart Data Selection for Language Models",
"desc": "This paper investigates the importance of selecting high-quality training data for instruction-tuning language models. It reveals that many automated data selection methods do not perform better than random selection when scaling to larger datasets, which can include millions of samples. The study introduces a representation-based data selection method (RDS+) that consistently outperforms more complex approaches while being more efficient in terms of computational resources. The authors emphasize the need for a deeper examination of how these selection methods behave as the size of the data pools increases."
},
"zh": {
"title": "高效选择:优化语言模型训练数据的关键",
"desc": "在对语言模型进行指令调优时,从更大数据集中选择高质量的训练数据是一个关键步骤。经过精心策划的数据集通常能产生比那些在更大、更嘈杂的数据集上训练的模型更好的效果。我们进行了系统研究,评估数据选择方法在大规模数据集上的表现,发现许多新提出的方法在这种情况下的表现不如随机选择。我们还发现一种基于表示的数据选择变体(RDS+)在所有测试设置中始终优于更复杂的方法,同时计算效率更高。"
}
}
},
{
"id": "https://huggingface.co/papers/2503.01714",
"title": "Word Form Matters: LLMs' Semantic Reconstruction under Typoglycemia",
"url": "https://huggingface.co/papers/2503.01714",
"abstract": "Human readers can efficiently comprehend scrambled words, a phenomenon known as Typoglycemia, primarily by relying on word form; if word form alone is insufficient, they further utilize contextual cues for interpretation. While advanced large language models (LLMs) exhibit similar abilities, the underlying mechanisms remain unclear. To investigate this, we conduct controlled experiments to analyze the roles of word form and contextual information in semantic reconstruction and examine LLM attention patterns. Specifically, we first propose SemRecScore, a reliable metric to quantify the degree of semantic reconstruction, and validate its effectiveness. Using this metric, we study how word form and contextual information influence LLMs' semantic reconstruction ability, identifying word form as the core factor in this process. Furthermore, we analyze how LLMs utilize word form and find that they rely on specialized attention heads to extract and process word form information, with this mechanism remaining stable across varying levels of word scrambling. This distinction between LLMs' fixed attention patterns primarily focused on word form and human readers' adaptive strategy in balancing word form and contextual information provides insights into enhancing LLM performance by incorporating human-like, context-aware mechanisms.",
"score": 4,
"issue_id": 2517,
"pub_date": "2025-03-03",
"pub_date_card": {
"ru": "3 марта",
"en": "March 3",
"zh": "3月3日"
},
"hash": "4880ed4c044081c4",
"authors": [
"Chenxi Wang",
"Tianle Gu",
"Zhongyu Wei",
"Lang Gao",
"Zirui Song",
"Xiuying Chen"
],
"affiliations": [
"Fudan University",
"Mohamed bin Zayed University of Artificial Intelligence (MBZUAI)"
],
"pdf_title_img": "assets/pdf/title_img/2503.01714.jpg",
"data": {
"categories": [
"#training",
"#interpretability",
"#data",
"#multimodal",
"#alignment"
],
"emoji": "🔀",
"ru": {
"title": "Форма слова - ключ к пониманию перемешанного текста для ИИ",
"desc": "Исследование посвящено способности больших языковых моделей (LLM) понимать перемешанные слова, подобно людям. Авторы предлагают метрику SemRecScore для оценки семантической реконструкции и анализируют роль формы слова и контекстной информации. Результаты показывают, что форма слова является ключевым фактором для LLM при обработке перемешанных слов. Анализ паттернов внимания LLM выявляет специализированные механизмы для извлечения информации о форме слова."
},
"en": {
"title": "Unlocking LLMs: The Power of Word Form in Understanding Scrambled Text",
"desc": "This paper explores how large language models (LLMs) understand scrambled words, similar to how humans do through a phenomenon called Typoglycemia. The authors introduce a new metric, SemRecScore, to measure how well LLMs can reconstruct meaning from scrambled text by focusing on word form and context. Their experiments reveal that LLMs primarily depend on word form for semantic reconstruction, utilizing specific attention heads to process this information. The findings suggest that incorporating more human-like, context-aware strategies could improve LLM performance in understanding language."
},
"zh": {
"title": "揭示大型语言模型的语义重建机制",
"desc": "本研究探讨了大型语言模型(LLMs)在语义重建中的能力,特别是它们如何利用单词形式和上下文信息。我们提出了一种新的度量标准SemRecScore,用于量化语义重建的程度,并验证了其有效性。研究发现,单词形式是影响LLMs语义重建能力的核心因素,且LLMs通过专门的注意力头来提取和处理单词形式信息。与人类读者在单词形式和上下文信息之间的灵活策略不同,LLMs的注意力模式主要集中在单词形式上,这为提升LLMs性能提供了新的思路。"
}
}
},
{
"id": "https://huggingface.co/papers/2503.01295",
"title": "CodeArena: A Collective Evaluation Platform for LLM Code Generation",
"url": "https://huggingface.co/papers/2503.01295",
"abstract": "Large Language Models (LLMs) have reshaped code generation by synergizing their exceptional comprehension of natural language and programming syntax, thereby substantially boosting developer productivity. These advancements have prompted numerous efforts to quantitatively evaluate their coding capabilities. However, persistent challenges, such as benchmark leakage, data dissipation, and limited system accessibility, continue to impede a timely and accurate assessment. To address these limitations, we introduce CodeArena, an online evaluation framework tailored for LLM code generation. The key innovation is a collective evaluation mechanism, which dynamically recalibrates individual model scores based on the holistic performance of all participating models, mitigating score biases caused by widespread benchmark leakage. In addition, CodeArena ensures open access to all submitted solutions and test cases and provides automation-friendly APIs to streamline the code evaluation workflow. Our main contributions are: (1) a collective evaluation system for unbiased assessment, (2) a public repository of solutions and test cases, and (3) automation-ready APIs for seamless integration.",
"score": 4,
"issue_id": 2514,
"pub_date": "2025-03-03",
"pub_date_card": {
"ru": "3 марта",
"en": "March 3",
"zh": "3月3日"
},
"hash": "96f50dd9e636b12e",
"authors": [
"Mingzhe Du",
"Anh Tuan Luu",
"Bin Ji",
"Xiaobao Wu",
"Dong Huang",
"Terry Yue Zhuo",
"Qian Liu",
"See-Kiong Ng"
],
"affiliations": [
"ByteDance",
"Monash University",
"Nanyang Technological University",
"National University of Singapore",
"The University of Hong Kong"
],
"pdf_title_img": "assets/pdf/title_img/2503.01295.jpg",
"data": {
"categories": [
"#dataset",
"#benchmark",
"#leakage",
"#open_source"
],
"emoji": "🏟️",
"ru": {
"title": "CodeArena: Справедливая арена для оценки LLM в генерации кода",
"desc": "CodeArena - это новая онлайн-платформа для оценки генерации кода большими языковыми моделями (LLM). Она использует коллективный механизм оценки, который динамически пересчитывает индивидуальные оценки моделей на основе общей производительности всех участвующих моделей. Это помогает снизить искажения оценок, вызванные утечкой тестовых данных. CodeArena также предоставляет открытый доступ ко всем отправленным решениям и тестовым случаям, а также API для автоматизации процесса оценки."
},
"en": {
"title": "Revolutionizing Code Evaluation with CodeArena",
"desc": "This paper discusses the impact of Large Language Models (LLMs) on code generation, highlighting their ability to understand both natural language and programming syntax, which enhances developer productivity. It identifies ongoing issues in evaluating LLM coding capabilities, such as benchmark leakage and limited access to evaluation systems. To overcome these challenges, the authors present CodeArena, an online framework that offers a collective evaluation mechanism to provide unbiased assessments of LLMs. CodeArena also features a public repository for solutions and test cases, along with APIs for easy integration into existing workflows."
},
"zh": {
"title": "CodeArena:公平评估LLM代码生成的创新平台",
"desc": "大型语言模型(LLMs)通过结合对自然语言和编程语法的深刻理解,极大地提升了代码生成的效率,进而提高了开发者的生产力。为了量化评估这些模型的编码能力,许多研究工作应运而生,但仍面临基准泄漏、数据消散和系统可访问性有限等挑战。为了解决这些问题,我们提出了CodeArena,这是一个专为LLM代码生成设计的在线评估框架。其核心创新在于集体评估机制,能够根据所有参与模型的整体表现动态调整个别模型的评分,从而减少因基准泄漏造成的评分偏差。"
}
}
},
{
"id": "https://huggingface.co/papers/2503.00455",
"title": "PodAgent: A Comprehensive Framework for Podcast Generation",
"url": "https://huggingface.co/papers/2503.00455",
"abstract": "Existing Existing automatic audio generation methods struggle to generate podcast-like audio programs effectively. The key challenges lie in in-depth content generation, appropriate and expressive voice production. This paper proposed PodAgent, a comprehensive framework for creating audio programs. PodAgent 1) generates informative topic-discussion content by designing a Host-Guest-Writer multi-agent collaboration system, 2) builds a voice pool for suitable voice-role matching and 3) utilizes LLM-enhanced speech synthesis method to generate expressive conversational speech. Given the absence of standardized evaluation criteria for podcast-like audio generation, we developed comprehensive assessment guidelines to effectively evaluate the model's performance. Experimental results demonstrate PodAgent's effectiveness, significantly surpassing direct GPT-4 generation in topic-discussion dialogue content, achieving an 87.4% voice-matching accuracy, and producing more expressive speech through LLM-guided synthesis. Demo page: https://podcast-agent.github.io/demo/. Source code: https://github.com/yujxx/PodAgent.",
"score": 3,
"issue_id": 2519,
"pub_date": "2025-03-01",
"pub_date_card": {
"ru": "1 марта",
"en": "March 1",
"zh": "3月1日"
},
"hash": "59ce5f373a030894",
"authors": [
"Yujia Xiao",
"Lei He",
"Haohan Guo",
"Fenglong Xie",
"Tan Lee"
],
"affiliations": [
"Microsoft",
"The Chinese University of Hong Kong",
"Xiaohongshu Inc."
],
"pdf_title_img": "assets/pdf/title_img/2503.00455.jpg",
"data": {
"categories": [
"#games",
"#audio",
"#interpretability",
"#benchmark",
"#optimization",
"#multimodal"
],
"emoji": "🎙️",
"ru": {
"title": "PodAgent: ИИ-ведущий для подкастов нового поколения",
"desc": "PodAgent - это новая система для автоматического создания аудиопрограмм в стиле подкастов. Она использует мультиагентный подход для генерации содержательного контента, подбирает подходящие голоса из голосового пула и применяет улучшенный синтез речи на основе языковых моделей. Система решает ключевые проблемы существующих методов, такие как глубина контента и выразительность голоса. Эксперименты показали значительное превосходство PodAgent над прямой генерацией GPT-4 по качеству диалогов и точности подбора голосов."
},
"en": {
"title": "Revolutionizing Podcast Audio Generation with PodAgent",
"desc": "This paper introduces PodAgent, a novel framework designed to enhance the generation of podcast-like audio programs. It addresses key challenges in content creation and voice production by employing a multi-agent system that includes a Host, Guest, and Writer for collaborative topic discussions. Additionally, PodAgent features a voice pool for effective voice-role matching and utilizes a large language model (LLM) to improve the expressiveness of the generated speech. The framework's performance is validated through comprehensive evaluation guidelines, showing significant improvements over existing methods, including a high voice-matching accuracy and more engaging conversational audio."
},
"zh": {
"title": "PodAgent:智能生成播客音频的全新框架",
"desc": "本论文提出了一种名为PodAgent的框架,用于自动生成类似播客的音频节目。PodAgent通过设计一个主持人-嘉宾-编剧的多智能体协作系统,生成有深度的主题讨论内容。同时,它建立了一个声音库,以实现合适的声音角色匹配,并利用增强型大语言模型(LLM)进行富有表现力的语音合成。实验结果表明,PodAgent在主题讨论对话内容生成方面显著优于直接使用GPT-4,语音匹配准确率达到87.4%,并通过LLM引导的合成生成了更具表现力的语音。"
}
}
},
{
"id": "https://huggingface.co/papers/2503.01739",
"title": "VideoUFO: A Million-Scale User-Focused Dataset for Text-to-Video Generation",
"url": "https://huggingface.co/papers/2503.01739",
"abstract": "Text-to-video generative models convert textual prompts into dynamic visual content, offering wide-ranging applications in film production, gaming, and education. However, their real-world performance often falls short of user expectations. One key reason is that these models have not been trained on videos related to some topics users want to create. In this paper, we propose VideoUFO, the first Video dataset specifically curated to align with Users' FOcus in real-world scenarios. Beyond this, our VideoUFO also features: (1) minimal (0.29%) overlap with existing video datasets, and (2) videos searched exclusively via YouTube's official API under the Creative Commons license. These two attributes provide future researchers with greater freedom to broaden their training sources. The VideoUFO comprises over 1.09 million video clips, each paired with both a brief and a detailed caption (description). Specifically, through clustering, we first identify 1,291 user-focused topics from the million-scale real text-to-video prompt dataset, VidProM. Then, we use these topics to retrieve videos from YouTube, split the retrieved videos into clips, and generate both brief and detailed captions for each clip. After verifying the clips with specified topics, we are left with about 1.09 million video clips. Our experiments reveal that (1) current 16 text-to-video models do not achieve consistent performance across all user-focused topics; and (2) a simple model trained on VideoUFO outperforms others on worst-performing topics. The dataset is publicly available at https://huggingface.co/datasets/WenhaoWang/VideoUFO under the CC BY 4.0 License.",
"score": 3,
"issue_id": 2512,
"pub_date": "2025-03-03",
"pub_date_card": {
"ru": "3 марта",
"en": "March 3",
"zh": "3月3日"
},
"hash": "046fdeee8939e82c",
"authors": [
"Wenhao Wang",
"Yi Yang"
],
"affiliations": [
"University of Technology Sydney",
"Zhejiang University"